├── data_mutil
├── data
│   ├── READ
│   └── labels.txt
├── README.md
├── configuration_roberta1.py
├── metrics1.py
├── utils.py
├── tokenization_roberta1.py
├── configuration_bert1.py
├── optimization1.py
├── tokenization_gpt21.py
├── configuration_utils1.py
├── file_utils1.py
├── glue1.py
├── tokenization_bert1.py
├── modeling_roberta1.py
├── run_glue.py
└── modeling_utils1.py

/data_mutil:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/data/READ:
--------------------------------------------------------------------------------
1 | This is the folder where our text data is stored.
2 | 
--------------------------------------------------------------------------------
/data/labels.txt:
--------------------------------------------------------------------------------
1 | inform_theater;
2 | inform_starttime;
3 | inform_numberofpeople
4 | greeting;
5 | thanks
6 | inform_other
7 | request_moviename;
8 | inform_genre
9 | request_ticket;
10 | inform_city;
11 | inform_state;
12 | inform_date
13 | inform_moviename
14 | confirm_answer;
15 | inform_zip
16 | inform_video_format
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BERT for multi-label text classification
2 | Fine-tunes the pretrained PyTorch bert-base-uncased model to perform multi-label text classification.
3 | 
4 | 
5 | The project folder contains several subfolders; data is the text folder and holds the training, validation and test data.
6 | Data description: each instance consists of one sentence and its corresponding labels. There are 16 labels in total; the full label set is listed in labels.txt under the data folder.
7 | First, install the transformers package, e.g. via pip install transformers or the corresponding conda command.
8 | Then open run_glue.py and edit the file paths: the text folder path (data_dir) and the model (bert-base-uncased) paths. The model path consists of three parts: the first is uncased-model.bin, i.e. the PyTorch model weights; the second is the model's JSON configuration file; the third is vocab.txt. Each of these paths must be updated in run_glue.py.
9 | 
10 | The three model files must be downloaded manually!
11 | Debug/step through run_glue.py: MODEL_ALL in that file contains the download URLs for all three files!
12 | 
--------------------------------------------------------------------------------
/configuration_roberta1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
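# A minimal sketch of the path set-up described in the README above. The file
# names below are placeholders for the three files whose download URLs are kept
# in MODEL_ALL inside run_glue.py (PyTorch weights, JSON config, vocab.txt);
# point them at your local copies. BertTokenizer from tokenization_bert1.py is
# assumed to follow the usual transformers constructor.
def _example_local_model_paths():  # illustrative only, never called by the code
    from configuration_bert1 import BertConfig
    from tokenization_bert1 import BertTokenizer

    data_dir = "./data"                                      # train/dev/test text files
    weights_path = "./bert-base-uncased/uncased-model.bin"   # PyTorch checkpoint
    config_path = "./bert-base-uncased/config.json"          # model configuration (JSON)
    vocab_path = "./bert-base-uncased/vocab.txt"             # WordPiece vocabulary

    # 16 classes, matching data/labels.txt
    config = BertConfig.from_pretrained(config_path, num_labels=16)
    tokenizer = BertTokenizer(vocab_path, do_lower_case=True)
    return data_dir, weights_path, config, tokenizer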
16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from configuration_bert1 import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /metrics1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import logging 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | try: 24 | from scipy.stats import pearsonr, spearmanr 25 | from sklearn.metrics import matthews_corrcoef, f1_score 26 | _has_sklearn = True 27 | except (AttributeError, ImportError) as e: 28 | logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") 29 | _has_sklearn = False 30 | 31 | def is_sklearn_available(): 32 | return _has_sklearn 33 | 34 | if _has_sklearn: 35 | 36 | def simple_accuracy(preds, labels): 37 | return (preds == labels).mean() 38 | 39 | 40 | def acc_and_f1(preds, labels): 41 | acc = simple_accuracy(preds, labels) 42 | f1 = f1_score(y_true=labels, y_pred=preds) 43 | return { 44 | "acc": acc, 45 | "f1": f1, 46 | "acc_and_f1": (acc + f1) / 2, 47 | } 48 | 49 | 50 | def pearson_and_spearman(preds, labels): 51 | pearson_corr = pearsonr(preds, labels)[0] 52 | spearman_corr = spearmanr(preds, labels)[0] 53 | return { 54 | "pearson": pearson_corr, 55 | "spearmanr": spearman_corr, 56 | "corr": (pearson_corr + spearman_corr) / 2, 57 | } 58 | 59 | 60 | def glue_compute_metrics(task_name, preds, labels): 61 | assert len(preds) == len(labels) 62 | if task_name == "cola": 63 | return {"mcc": matthews_corrcoef(labels, preds)} 64 | elif task_name == "sst-2": 65 | return {"acc": simple_accuracy(preds, labels)} 66 | elif task_name == "mrpc": 67 | return acc_and_f1(preds, labels) 68 | elif task_name == "sts-b": 69 | return pearson_and_spearman(preds, labels) 70 | elif task_name == "qqp": 71 | return acc_and_f1(preds, labels) 72 | elif task_name == "mnli": 73 | return {"acc": simple_accuracy(preds, labels)} 74 | elif task_name == "mnli-mm": 75 | return {"acc": simple_accuracy(preds, labels)} 76 | elif task_name == "qnli": 77 | return {"acc": simple_accuracy(preds, labels)} 78 | elif task_name == "rte": 79 | return {"acc": simple_accuracy(preds, labels)} 80 | elif task_name == "wnli": 81 | return {"acc": simple_accuracy(preds, labels)} 82 | elif task_name == "multilabel": 83 | return {"acc": simple_accuracy(preds, labels)} 84 | else: 85 | raise KeyError(task_name) 86 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import csv 18 | import sys 19 | import copy 20 | import json 21 | 22 | class InputExample(object): 23 | """ 24 | A single training/test example for simple sequence classification. 25 | 26 | Args: 27 | guid: Unique id for the example. 28 | text_a: string. The untokenized text of the first sequence. For single 29 | sequence tasks, only this sequence must be specified. 30 | text_b: (Optional) string. The untokenized text of the second sequence. 31 | Only must be specified for sequence pair tasks. 32 | label: (Optional) string. The label of the example. This should be 33 | specified for train and dev examples, but not for test examples. 
34 | """ 35 | def __init__(self, guid, text_a, text_b=None, label=None): 36 | self.guid = guid 37 | self.text_a = text_a 38 | self.text_b = text_b 39 | self.label = label 40 | 41 | 42 | 43 | 44 | 45 | def __repr__(self): 46 | return str(self.to_json_string()) 47 | 48 | def to_dict(self): 49 | """Serializes this instance to a Python dictionary.""" 50 | output = copy.deepcopy(self.__dict__) 51 | return output 52 | 53 | def to_json_string(self): 54 | """Serializes this instance to a JSON string.""" 55 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 56 | 57 | 58 | class InputFeatures(object): 59 | """ 60 | A single set of features of data. 61 | 62 | Args: 63 | input_ids: Indices of input sequence tokens in the vocabulary. 64 | attention_mask: Mask to avoid performing attention on padding token indices. 65 | Mask values selected in ``[0, 1]``: 66 | Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens. 67 | token_type_ids: Segment token indices to indicate first and second portions of the inputs. 68 | label: Label corresponding to the input 69 | """ 70 | 71 | def __init__(self, input_ids, attention_mask, token_type_ids, label): 72 | self.input_ids = input_ids 73 | self.attention_mask = attention_mask 74 | self.token_type_ids = token_type_ids 75 | self.label = label 76 | 77 | def __repr__(self): 78 | return str(self.to_json_string()) 79 | 80 | def to_dict(self): 81 | """Serializes this instance to a Python dictionary.""" 82 | output = copy.deepcopy(self.__dict__) 83 | return output 84 | 85 | def to_json_string(self): 86 | """Serializes this instance to a JSON string.""" 87 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 88 | 89 | 90 | class DataProcessor(object): 91 | """Base class for data converters for sequence classification data sets.""" 92 | 93 | def get_example_from_tensor_dict(self, tensor_dict): 94 | """Gets an example from a dict with tensorflow tensors 95 | 96 | Args: 97 | tensor_dict: Keys and values should match the corresponding Glue 98 | tensorflow_dataset examples. 99 | """ 100 | raise NotImplementedError() 101 | 102 | def get_train_examples(self, data_dir): 103 | """Gets a collection of `InputExample`s for the train set.""" 104 | raise NotImplementedError() 105 | 106 | def get_dev_examples(self, data_dir): 107 | """Gets a collection of `InputExample`s for the dev set.""" 108 | raise NotImplementedError() 109 | 110 | def get_labels(self): 111 | """Gets the list of labels for this data set.""" 112 | raise NotImplementedError() 113 | 114 | @classmethod 115 | def _read_tsv(cls, input_file, quotechar=None): 116 | """Reads a tab separated value file.""" 117 | with open(input_file, "r", encoding="utf-8-sig") as f: 118 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 119 | lines = [] 120 | for line in reader: 121 | if sys.version_info[0] == 2: 122 | line = list(unicode(cell, 'utf-8') for cell in line) 123 | lines.append(line) 124 | return lines 125 | -------------------------------------------------------------------------------- /tokenization_roberta1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
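# How the utils.py classes above are typically consumed: a task-specific
# processor (the real ones live in glue1.py) reads the files under data_dir and
# wraps each sentence together with its labels in an InputExample. A sketch,
# assuming a two-column TSV layout (sentence <TAB> semicolon-separated labels),
# which is an illustration rather than the repository's actual file format:
#
#     import os
#     from utils import DataProcessor, InputExample
#
#     class MultiLabelProcessor(DataProcessor):
#         def get_labels(self):
#             # the 16 intents listed in data/labels.txt
#             with open("data/labels.txt", encoding="utf-8") as f:
#                 return [line.strip().rstrip(";") for line in f if line.strip()]
#
#         def get_train_examples(self, data_dir):
#             examples = []
#             for i, line in enumerate(self._read_tsv(os.path.join(data_dir, "train.tsv"))):
#                 examples.append(InputExample(guid="train-%d" % i,
#                                              text_a=line[0],
#                                              label=line[1].split(";")))
#             return examples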
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from tokenization_gpt21 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => the encoding methods should be called with the 70 | ``add_prefix_space`` flag set to ``True``. 
71 |         Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
72 |         the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
73 |     """
74 |     vocab_files_names = VOCAB_FILES_NAMES
75 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
76 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
77 | 
78 |     def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
79 |                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
80 |         super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
81 |                                                bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
82 |                                                sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
83 |                                                mask_token=mask_token, **kwargs)
84 |         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
85 |         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
86 | 
87 |     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
88 |         """
89 |         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
90 |         by concatenating and adding special tokens.
91 |         A RoBERTa sequence has the following format:
92 |             single sequence: <s> X </s>
93 |             pair of sequences: <s> A </s></s> B </s>
94 |         """
95 |         if token_ids_1 is None:
96 |             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
97 |         cls = [self.cls_token_id]
98 |         sep = [self.sep_token_id]
99 |         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
100 | 
101 |     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
102 |         """
103 |         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
104 |         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
105 | 
106 |         Args:
107 |             token_ids_0: list of ids (must not contain special tokens)
108 |             token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
109 |                 for sequence pairs
110 |             already_has_special_tokens: (default False) Set to True if the token list is already formatted with
111 |                 special tokens for the model
112 | 
113 |         Returns:
114 |             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
115 |         """
116 |         if already_has_special_tokens:
117 |             if token_ids_1 is not None:
118 |                 raise ValueError("You should not supply a second sequence if the provided sequence of "
119 |                                  "ids is already formatted with special tokens for the model.")
120 |             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
121 | 
122 |         if token_ids_1 is None:
123 |             return [1] + ([0] * len(token_ids_0)) + [1]
124 |         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
125 | 
126 |     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
127 |         """
128 |         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
129 |         A RoBERTa sequence pair mask has the following format:
130 |         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
131 |         | first sequence    | second sequence
132 | 
133 |         if token_ids_1 is None, only returns the first portion of the mask (0's).
134 | """ 135 | sep = [self.sep_token_id] 136 | cls = [self.cls_token_id] 137 | 138 | if token_ids_1 is None: 139 | return len(cls + token_ids_0 + sep) * [0] 140 | return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] 141 | -------------------------------------------------------------------------------- /configuration_bert1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from configuration_utils1 import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 44 | 'bert-base-german-dbmdz-uncased': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", 45 | } 46 | 47 | 48 | class BertConfig(PretrainedConfig): 49 | r""" 50 | :class:`~transformers.BertConfig` is the configuration class to store the configuration of a 51 | `BertModel`. 52 | 53 | 54 | Arguments: 55 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 56 | hidden_size: Size of the encoder layers and the pooler layer. 57 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 58 | num_attention_heads: Number of attention heads for each attention layer in 59 | the Transformer encoder. 60 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 61 | layer in the Transformer encoder. 62 | hidden_act: The non-linear activation function (function or string) in the 63 | encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. 64 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 65 | layers in the embeddings, encoder, and pooler. 66 | attention_probs_dropout_prob: The dropout ratio for the attention 67 | probabilities. 68 | max_position_embeddings: The maximum sequence length that this model might 69 | ever be used with. Typically set this to something large just in case 70 | (e.g., 512 or 1024 or 2048). 71 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 72 | `BertModel`. 73 | initializer_range: The sttdev of the truncated_normal_initializer for 74 | initializing all weight matrices. 75 | layer_norm_eps: The epsilon used by LayerNorm. 76 | """ 77 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 78 | 79 | def __init__(self, 80 | vocab_size_or_config_json_file=30522, 81 | hidden_size=768, 82 | num_hidden_layers=12, 83 | num_attention_heads=12, 84 | intermediate_size=3072, 85 | hidden_act="gelu", 86 | hidden_dropout_prob=0.1, 87 | attention_probs_dropout_prob=0.1, 88 | max_position_embeddings=512, 89 | type_vocab_size=2, 90 | initializer_range=0.02, 91 | layer_norm_eps=1e-12, 92 | **kwargs): 93 | super(BertConfig, self).__init__(**kwargs) 94 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 95 | and isinstance(vocab_size_or_config_json_file, unicode)): 96 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 97 | json_config = json.loads(reader.read()) 98 | for key, value in json_config.items(): 99 | self.__dict__[key] = value 100 | elif isinstance(vocab_size_or_config_json_file, int): 101 | self.vocab_size = vocab_size_or_config_json_file 102 | self.hidden_size = hidden_size 103 | self.num_hidden_layers = num_hidden_layers 104 | self.num_attention_heads = num_attention_heads 105 | self.hidden_act = hidden_act 106 | self.intermediate_size = intermediate_size 107 | self.hidden_dropout_prob = hidden_dropout_prob 108 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 109 | self.max_position_embeddings = max_position_embeddings 110 | self.type_vocab_size = type_vocab_size 111 | self.initializer_range = initializer_range 112 | self.layer_norm_eps = layer_norm_eps 113 | else: 114 | raise ValueError("First argument must be either a vocabulary size (int)" 115 | " or the path to a pretrained model config file (str)") 116 | -------------------------------------------------------------------------------- /optimization1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team 
Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | class ConstantLRSchedule(LambdaLR): 27 | """ Constant learning rate schedule. 28 | """ 29 | def __init__(self, optimizer, last_epoch=-1): 30 | super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) 31 | 32 | 33 | class WarmupConstantSchedule(LambdaLR): 34 | """ Linear warmup and then constant. 35 | Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. 36 | Keeps learning rate schedule equal to 1. after warmup_steps. 37 | """ 38 | def __init__(self, optimizer, warmup_steps, last_epoch=-1): 39 | self.warmup_steps = warmup_steps 40 | super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 41 | 42 | def lr_lambda(self, step): 43 | if step < self.warmup_steps: 44 | return float(step) / float(max(1.0, self.warmup_steps)) 45 | return 1. 46 | 47 | 48 | class WarmupLinearSchedule(LambdaLR): 49 | """ Linear warmup and then linear decay. 50 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 51 | Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps. 52 | """ 53 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): 54 | self.warmup_steps = warmup_steps 55 | self.t_total = t_total 56 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 57 | 58 | def lr_lambda(self, step): 59 | if step < self.warmup_steps: 60 | return float(step) / float(max(1, self.warmup_steps)) 61 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) 62 | 63 | 64 | class WarmupCosineSchedule(LambdaLR): 65 | """ Linear warmup and then cosine decay. 66 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 67 | Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. 68 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 69 | """ 70 | def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1): 71 | self.warmup_steps = warmup_steps 72 | self.t_total = t_total 73 | self.cycles = cycles 74 | super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 75 | 76 | def lr_lambda(self, step): 77 | if step < self.warmup_steps: 78 | return float(step) / float(max(1.0, self.warmup_steps)) 79 | # progress after warmup 80 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 81 | return max(0.0, 0.5 * (1. 
+ math.cos(math.pi * float(self.cycles) * 2.0 * progress))) 82 | 83 | 84 | class WarmupCosineWithHardRestartsSchedule(LambdaLR): 85 | """ Linear warmup and then cosine cycles with hard restarts. 86 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 87 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying 88 | learning rate (with hard restarts). 89 | """ 90 | def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1): 91 | self.warmup_steps = warmup_steps 92 | self.t_total = t_total 93 | self.cycles = cycles 94 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 95 | 96 | def lr_lambda(self, step): 97 | if step < self.warmup_steps: 98 | return float(step) / float(max(1, self.warmup_steps)) 99 | # progress after warmup 100 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 101 | if progress >= 1.0: 102 | return 0.0 103 | return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) 104 | 105 | 106 | 107 | class AdamW(Optimizer): 108 | """ Implements Adam algorithm with weight decay fix. 109 | 110 | Parameters: 111 | lr (float): learning rate. Default 1e-3. 112 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 113 | eps (float): Adams epsilon. Default: 1e-6 114 | weight_decay (float): Weight decay. Default: 0.0 115 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 116 | """ 117 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 118 | if lr < 0.0: 119 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 120 | if not 0.0 <= betas[0] < 1.0: 121 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 122 | if not 0.0 <= betas[1] < 1.0: 123 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 124 | if not 0.0 <= eps: 125 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 126 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 127 | correct_bias=correct_bias) 128 | super(AdamW, self).__init__(params, defaults) 129 | 130 | def step(self, closure=None): 131 | """Performs a single optimization step. 132 | 133 | Arguments: 134 | closure (callable, optional): A closure that reevaluates the model 135 | and returns the loss. 
136 | """ 137 | loss = None 138 | if closure is not None: 139 | loss = closure() 140 | 141 | for group in self.param_groups: 142 | for p in group['params']: 143 | if p.grad is None: 144 | continue 145 | grad = p.grad.data 146 | if grad.is_sparse: 147 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 148 | 149 | state = self.state[p] 150 | 151 | # State initialization 152 | if len(state) == 0: 153 | state['step'] = 0 154 | # Exponential moving average of gradient values 155 | state['exp_avg'] = torch.zeros_like(p.data) 156 | # Exponential moving average of squared gradient values 157 | state['exp_avg_sq'] = torch.zeros_like(p.data) 158 | 159 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 160 | beta1, beta2 = group['betas'] 161 | 162 | state['step'] += 1 163 | 164 | # Decay the first and second moment running average coefficient 165 | # In-place operations to update the averages at the same time 166 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 167 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 168 | denom = exp_avg_sq.sqrt().add_(group['eps']) 169 | 170 | step_size = group['lr'] 171 | if group['correct_bias']: # No bias correction for Bert 172 | bias_correction1 = 1.0 - beta1 ** state['step'] 173 | bias_correction2 = 1.0 - beta2 ** state['step'] 174 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 175 | 176 | p.data.addcdiv_(-step_size, exp_avg, denom) 177 | 178 | # Just adding the square of the weights to the loss function is *not* 179 | # the correct way of using L2 regularization/weight decay with Adam, 180 | # since that will interact with the m and v parameters in strange ways. 181 | # 182 | # Instead we want to decay the weights in a manner that doesn't interact 183 | # with the m/v parameters. This is equivalent to adding the square 184 | # of the weights to the loss with plain (non-momentum) SGD. 185 | # Add weight decay at the end (fixed version) 186 | if group['weight_decay'] > 0.0: 187 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 188 | 189 | return loss 190 | -------------------------------------------------------------------------------- /tokenization_gpt21.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | try: 27 | from functools import lru_cache 28 | except ImportError: 29 | # Just a dummy decorator to get the checks to run on python2 30 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
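# The optimization1.py module above supplies the AdamW optimizer and the warm-up
# learning-rate schedules used during fine-tuning. A minimal, illustrative
# training-step set-up (model, loss and num_training_steps are placeholders from
# the surrounding training loop; the hyper-parameter values are not the
# repository's defaults):
#
#     from optimization1 import AdamW, WarmupLinearSchedule
#
#     optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)
#     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_training_steps)
#
#     loss.backward()
#     optimizer.step()      # parameter update with decoupled (fixed) weight decay
#     scheduler.step()      # advance the linear warm-up / linear decay schedule
#     optimizer.zero_grad()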
31 | def lru_cache(): 32 | return lambda func: func 33 | 34 | from tokenization_utils1 import PreTrainedTokenizer 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", 47 | 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", 48 | 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", 49 | 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", 50 | }, 51 | 'merges_file': 52 | { 53 | 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", 54 | 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", 55 | 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", 56 | 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", 57 | }, 58 | } 59 | 60 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 61 | 'gpt2': 1024, 62 | 'gpt2-medium': 1024, 63 | 'gpt2-large': 1024, 64 | 'distilgpt2': 1024, 65 | } 66 | 67 | 68 | @lru_cache() 69 | def bytes_to_unicode(): 70 | """ 71 | Returns list of utf-8 byte and a mapping to unicode strings. 72 | We specifically avoids mapping to whitespace/control characters the bpe code barfs on. 73 | 74 | The reversible bpe codes work on unicode strings. 75 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 76 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 77 | This is a signficant percentage of your normal, say, 32K bpe vocab. 78 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 79 | """ 80 | _chr = unichr if sys.version_info[0] == 2 else chr 81 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) 82 | cs = bs[:] 83 | n = 0 84 | for b in range(2 ** 8): 85 | if b not in bs: 86 | bs.append(b) 87 | cs.append(2 ** 8 + n) 88 | n += 1 89 | cs = [_chr(n) for n in cs] 90 | return dict(zip(bs, cs)) 91 | 92 | 93 | def get_pairs(word): 94 | """Return set of symbol pairs in a word. 95 | 96 | Word is represented as tuple of symbols (symbols being variable-length strings). 97 | """ 98 | pairs = set() 99 | prev_char = word[0] 100 | for char in word[1:]: 101 | pairs.add((prev_char, char)) 102 | prev_char = char 103 | return pairs 104 | 105 | 106 | class GPT2Tokenizer(PreTrainedTokenizer): 107 | """ 108 | GPT-2 BPE tokenizer. Peculiarities: 109 | - Byte-level Byte-Pair-Encoding 110 | - Requires a space to start the input string => the encoding methods should be called with the 111 | ``add_prefix_space`` flag set to ``True``. 
112 | Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve 113 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 114 | """ 115 | vocab_files_names = VOCAB_FILES_NAMES 116 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 117 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 118 | 119 | def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", 120 | bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): 121 | super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) 122 | self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens 123 | self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens 124 | 125 | self.encoder = json.load(open(vocab_file, encoding="utf-8")) 126 | self.decoder = {v: k for k, v in self.encoder.items()} 127 | self.errors = errors # how to handle errors in decoding 128 | self.byte_encoder = bytes_to_unicode() 129 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 130 | bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] 131 | bpe_merges = [tuple(merge.split()) for merge in bpe_data] 132 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 133 | self.cache = {} 134 | 135 | # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 136 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 137 | 138 | @property 139 | def vocab_size(self): 140 | return len(self.encoder) 141 | 142 | def bpe(self, token): 143 | if token in self.cache: 144 | return self.cache[token] 145 | word = tuple(token) 146 | pairs = get_pairs(word) 147 | 148 | if not pairs: 149 | return token 150 | 151 | while True: 152 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 153 | if bigram not in self.bpe_ranks: 154 | break 155 | first, second = bigram 156 | new_word = [] 157 | i = 0 158 | while i < len(word): 159 | try: 160 | j = word.index(first, i) 161 | new_word.extend(word[i:j]) 162 | i = j 163 | except: 164 | new_word.extend(word[i:]) 165 | break 166 | 167 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 168 | new_word.append(first + second) 169 | i += 2 170 | else: 171 | new_word.append(word[i]) 172 | i += 1 173 | new_word = tuple(new_word) 174 | word = new_word 175 | if len(word) == 1: 176 | break 177 | else: 178 | pairs = get_pairs(word) 179 | word = ' '.join(word) 180 | self.cache[token] = word 181 | return word 182 | 183 | def _tokenize(self, text, add_prefix_space=False): 184 | """ Tokenize a string. 185 | Args: 186 | - add_prefix_space (boolean, default False): 187 | Begin the sentence with at least one space toto get invariance to word order in GPT-2 (and RoBERTa) tokenizers. 
188 | """ 189 | if add_prefix_space: 190 | text = ' ' + text 191 | 192 | bpe_tokens = [] 193 | for token in re.findall(self.pat, text): 194 | if sys.version_info[0] == 2: 195 | token = ''.join(self.byte_encoder[ord(b)] for b in 196 | token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) 197 | else: 198 | token = ''.join(self.byte_encoder[b] for b in token.encode( 199 | 'utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) 200 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) 201 | return bpe_tokens 202 | 203 | def _convert_token_to_id(self, token): 204 | """ Converts a token (str/unicode) in an id using the vocab. """ 205 | return self.encoder.get(token, self.encoder.get(self.unk_token)) 206 | 207 | def _convert_id_to_token(self, index): 208 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 209 | return self.decoder.get(index) 210 | 211 | def convert_tokens_to_string(self, tokens): 212 | """ Converts a sequence of tokens (string) in a single string. """ 213 | text = ''.join(tokens) 214 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 215 | return text 216 | 217 | def save_vocabulary(self, save_directory): 218 | """Save the tokenizer vocabulary and merge files to a directory.""" 219 | if not os.path.isdir(save_directory): 220 | logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) 221 | return 222 | vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) 223 | merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) 224 | 225 | with open(vocab_file, 'w', encoding='utf-8') as f: 226 | f.write(json.dumps(self.encoder, ensure_ascii=False)) 227 | 228 | index = 0 229 | with open(merge_file, "w", encoding="utf-8") as writer: 230 | writer.write(u'#version: 0.2\n') 231 | for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): 232 | if index != token_index: 233 | logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." 234 | " Please check that the tokenizer is not corrupted!".format(merge_file)) 235 | index = token_index 236 | writer.write(' '.join(bpe_tokens) + u'\n') 237 | index += 1 238 | 239 | return vocab_file, merge_file -------------------------------------------------------------------------------- /configuration_utils1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
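# A short usage sketch for the byte-level BPE tokenizer defined in
# tokenization_gpt21.py above (RobertaTokenizer reuses the same machinery).
# The vocab.json / merges.txt paths are placeholders for files downloaded from
# the PRETRAINED_VOCAB_FILES_MAP URLs.
def _example_byte_level_bpe():  # illustrative only, never called by the code
    from tokenization_gpt21 import GPT2Tokenizer

    tokenizer = GPT2Tokenizer("vocab.json", "merges.txt")
    # add_prefix_space=True, as recommended in the class docstring, so a leading
    # word is encoded the same way as it would be in the middle of a sentence.
    tokens = tokenizer.tokenize("Hello world", add_prefix_space=True)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    text = tokenizer.convert_tokens_to_string(tokens)  # gives back " Hello world"
    return tokens, ids, text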
16 | """ Configuration base class and utilities.""" 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import copy 22 | import json 23 | import logging 24 | import os 25 | from io import open 26 | 27 | from file_utils1 import cached_path, CONFIG_NAME 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | class PretrainedConfig(object): 32 | r""" Base class for all configuration classes. 33 | Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. 34 | 35 | Note: 36 | A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. 37 | It only affects the model's configuration. 38 | 39 | Class attributes (overridden by derived classes): 40 | - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values. 41 | 42 | Parameters: 43 | ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. 44 | ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens) 45 | ``output_attentions``: boolean, default `False`. Should the model returns attentions weights. 46 | ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states. 47 | ``torchscript``: string, default `False`. Is the model used with Torchscript. 48 | """ 49 | pretrained_config_archive_map = {} 50 | 51 | def __init__(self, **kwargs): 52 | self.finetuning_task = kwargs.pop('finetuning_task', None) 53 | self.num_labels = kwargs.pop('num_labels', 2) 54 | self.output_attentions = kwargs.pop('output_attentions', False) 55 | self.output_hidden_states = kwargs.pop('output_hidden_states', False) 56 | self.output_past = kwargs.pop('output_past', True) # Not used by all models 57 | self.torchscript = kwargs.pop('torchscript', False) # Only used by PyTorch models 58 | self.use_bfloat16 = kwargs.pop('use_bfloat16', False) 59 | self.pruned_heads = kwargs.pop('pruned_heads', {}) 60 | 61 | def save_pretrained(self, save_directory): 62 | """ Save a configuration object to the directory `save_directory`, so that it 63 | can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. 64 | """ 65 | assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" 66 | 67 | # If we save using the predefined names, we can load using `from_pretrained` 68 | output_config_file = os.path.join(save_directory, CONFIG_NAME) 69 | 70 | self.to_json_file(output_config_file) 71 | logger.info("Configuration saved in {}".format(output_config_file)) 72 | 73 | @classmethod 74 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 75 | r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. 76 | 77 | Parameters: 78 | pretrained_model_name_or_path: either: 79 | 80 | - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. 81 | - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. 
82 | - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. 83 | 84 | cache_dir: (`optional`) string: 85 | Path to a directory in which a downloaded pre-trained model 86 | configuration should be cached if the standard cache should not be used. 87 | 88 | kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. 89 | 90 | - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. 91 | - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. 92 | 93 | force_download: (`optional`) boolean, default False: 94 | Force to (re-)download the model weights and configuration files and override the cached versions if they exists. 95 | 96 | proxies: (`optional`) dict, default None: 97 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 98 | The proxies are used on each request. 99 | 100 | return_unused_kwargs: (`optional`) bool: 101 | 102 | - If False, then this function returns just the final configuration object. 103 | - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. 104 | 105 | Examples:: 106 | 107 | # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a 108 | # derived class: BertConfig 109 | config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 110 | config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` 111 | config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') 112 | config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) 113 | assert config.output_attention == True 114 | config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, 115 | foo=False, return_unused_kwargs=True) 116 | assert config.output_attention == True 117 | assert unused_kwargs == {'foo': False} 118 | 119 | """ 120 | cache_dir = kwargs.pop('cache_dir', None) 121 | force_download = kwargs.pop('force_download', False) 122 | proxies = kwargs.pop('proxies', None) 123 | return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) 124 | 125 | if pretrained_model_name_or_path in cls.pretrained_config_archive_map: 126 | config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] 127 | elif os.path.isdir(pretrained_model_name_or_path): 128 | config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) 129 | else: 130 | config_file = pretrained_model_name_or_path 131 | # redirect to the cache, if necessary 132 | try: 133 | resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 134 | except EnvironmentError: 135 | if pretrained_model_name_or_path in cls.pretrained_config_archive_map: 136 | msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( 137 | config_file) 138 | else: 139 | msg = "Model name '{}' was not found in model name list ({}). 
" \ 140 | "We assumed '{}' was a path or url to a configuration file named {} or " \ 141 | "a directory containing such a file but couldn't find any such file at this path or url.".format( 142 | pretrained_model_name_or_path, 143 | ', '.join(cls.pretrained_config_archive_map.keys()), 144 | config_file, CONFIG_NAME) 145 | raise EnvironmentError(msg) 146 | 147 | if resolved_config_file == config_file: 148 | logger.info("loading configuration file {}".format(config_file)) 149 | else: 150 | logger.info("loading configuration file {} from cache at {}".format( 151 | config_file, resolved_config_file)) 152 | 153 | # Load config 154 | config = cls.from_json_file(resolved_config_file) 155 | 156 | if hasattr(config, 'pruned_heads'): 157 | config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) 158 | 159 | # Update config with kwargs if needed 160 | to_remove = [] 161 | for key, value in kwargs.items(): 162 | if hasattr(config, key): 163 | setattr(config, key, value) 164 | to_remove.append(key) 165 | for key in to_remove: 166 | kwargs.pop(key, None) 167 | 168 | logger.info("Model config %s", str(config)) 169 | if return_unused_kwargs: 170 | return config, kwargs 171 | else: 172 | return config 173 | 174 | @classmethod 175 | def from_dict(cls, json_object): 176 | """Constructs a `Config` from a Python dictionary of parameters.""" 177 | config = cls(vocab_size_or_config_json_file=-1) 178 | for key, value in json_object.items(): 179 | setattr(config, key, value) 180 | return config 181 | 182 | @classmethod 183 | def from_json_file(cls, json_file): 184 | """Constructs a `BertConfig` from a json file of parameters.""" 185 | with open(json_file, "r", encoding='utf-8') as reader: 186 | text = reader.read() 187 | return cls.from_dict(json.loads(text)) 188 | 189 | def __eq__(self, other): 190 | return self.__dict__ == other.__dict__ 191 | 192 | def __repr__(self): 193 | return str(self.to_json_string()) 194 | 195 | def to_dict(self): 196 | """Serializes this instance to a Python dictionary.""" 197 | output = copy.deepcopy(self.__dict__) 198 | return output 199 | 200 | def to_json_string(self): 201 | """Serializes this instance to a JSON string.""" 202 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 203 | 204 | def to_json_file(self, json_file_path): 205 | """ Save this instance to a json file.""" 206 | with open(json_file_path, "w", encoding='utf-8') as writer: 207 | writer.write(self.to_json_string()) 208 | -------------------------------------------------------------------------------- /file_utils1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 
5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import six 13 | import shutil 14 | import tempfile 15 | import fnmatch 16 | from functools import wraps 17 | from hashlib import sha256 18 | from io import open 19 | 20 | import boto3 21 | from botocore.config import Config 22 | from botocore.exceptions import ClientError 23 | import requests 24 | from tqdm import tqdm 25 | 26 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 27 | 28 | try: 29 | import tensorflow as tf 30 | assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 31 | _tf_available = True # pylint: disable=invalid-name 32 | logger.info("TensorFlow version {} available.".format(tf.__version__)) 33 | except (ImportError, AssertionError): 34 | _tf_available = False # pylint: disable=invalid-name 35 | 36 | try: 37 | import torch 38 | _torch_available = True # pylint: disable=invalid-name 39 | logger.info("PyTorch version {} available.".format(torch.__version__)) 40 | except ImportError: 41 | _torch_available = False # pylint: disable=invalid-name 42 | 43 | 44 | try: 45 | from torch.hub import _get_torch_home 46 | torch_cache_home = _get_torch_home() 47 | except ImportError: 48 | torch_cache_home = os.path.expanduser( 49 | os.getenv('TORCH_HOME', os.path.join( 50 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) 51 | default_cache_path = os.path.join(torch_cache_home, 'transformers') 52 | 53 | try: 54 | from urllib.parse import urlparse 55 | except ImportError: 56 | from urlparse import urlparse 57 | 58 | try: 59 | from pathlib import Path 60 | PYTORCH_PRETRAINED_BERT_CACHE = Path( 61 | os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) 62 | except (AttributeError, ImportError): 63 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', 64 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 65 | default_cache_path)) 66 | 67 | PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility 68 | TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility 69 | 70 | WEIGHTS_NAME = "pytorch_model.bin" 71 | TF2_WEIGHTS_NAME = 'tf_model.h5' 72 | TF_WEIGHTS_NAME = 'model.ckpt' 73 | CONFIG_NAME = "config.json" 74 | 75 | def is_torch_available(): 76 | return _torch_available 77 | 78 | def is_tf_available(): 79 | return _tf_available 80 | 81 | if not six.PY2: 82 | def add_start_docstrings(*docstr): 83 | def docstring_decorator(fn): 84 | fn.__doc__ = ''.join(docstr) + fn.__doc__ 85 | return fn 86 | return docstring_decorator 87 | 88 | def add_end_docstrings(*docstr): 89 | def docstring_decorator(fn): 90 | fn.__doc__ = fn.__doc__ + ''.join(docstr) 91 | return fn 92 | return docstring_decorator 93 | else: 94 | # Not possible to update class docstrings on python2 95 | def add_start_docstrings(*docstr): 96 | def docstring_decorator(fn): 97 | return fn 98 | return docstring_decorator 99 | 100 | def add_end_docstrings(*docstr): 101 | def docstring_decorator(fn): 102 | return fn 103 | return docstring_decorator 104 | 105 | def url_to_filename(url, etag=None): 106 | """ 107 | Convert `url` into a hashed filename in a repeatable way. 108 | If `etag` is specified, append its hash to the url's, delimited 109 | by a period. 
110 | If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name 111 | so that TF 2.0 can identify it as a HDF5 file 112 | (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) 113 | """ 114 | url_bytes = url.encode('utf-8') 115 | url_hash = sha256(url_bytes) 116 | filename = url_hash.hexdigest() 117 | 118 | if etag: 119 | etag_bytes = etag.encode('utf-8') 120 | etag_hash = sha256(etag_bytes) 121 | filename += '.' + etag_hash.hexdigest() 122 | 123 | if url.endswith('.h5'): 124 | filename += '.h5' 125 | 126 | return filename 127 | 128 | 129 | def filename_to_url(filename, cache_dir=None): 130 | """ 131 | Return the url and etag (which may be ``None``) stored for `filename`. 132 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 133 | """ 134 | if cache_dir is None: 135 | cache_dir = TRANSFORMERS_CACHE 136 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 137 | cache_dir = str(cache_dir) 138 | 139 | cache_path = os.path.join(cache_dir, filename) 140 | if not os.path.exists(cache_path): 141 | raise EnvironmentError("file {} not found".format(cache_path)) 142 | 143 | meta_path = cache_path + '.json' 144 | if not os.path.exists(meta_path): 145 | raise EnvironmentError("file {} not found".format(meta_path)) 146 | 147 | with open(meta_path, encoding="utf-8") as meta_file: 148 | metadata = json.load(meta_file) 149 | url = metadata['url'] 150 | etag = metadata['etag'] 151 | 152 | return url, etag 153 | 154 | 155 | def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): 156 | """ 157 | Given something that might be a URL (or might be a local path), 158 | determine which. If it's a URL, download the file and cache it, and 159 | return the path to the cached file. If it's already a local path, 160 | make sure the file exists and then return the path. 161 | Args: 162 | cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). 163 | force_download: if True, re-dowload the file even if it's already cached in the cache dir. 164 | """ 165 | if cache_dir is None: 166 | cache_dir = TRANSFORMERS_CACHE 167 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 168 | url_or_filename = str(url_or_filename) 169 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 170 | cache_dir = str(cache_dir) 171 | 172 | parsed = urlparse(url_or_filename) 173 | 174 | if parsed.scheme in ('http', 'https', 's3'): 175 | # URL, so get it from the cache (downloading if necessary) 176 | return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 177 | elif os.path.exists(url_or_filename): 178 | # File, and it exists. 179 | return url_or_filename 180 | elif parsed.scheme == '': 181 | # File, but it doesn't exist. 182 | raise EnvironmentError("file {} not found".format(url_or_filename)) 183 | else: 184 | # Something unknown 185 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 186 | 187 | 188 | def split_s3_path(url): 189 | """Split a full s3 path into the bucket name and path.""" 190 | parsed = urlparse(url) 191 | if not parsed.netloc or not parsed.path: 192 | raise ValueError("bad s3 path {}".format(url)) 193 | bucket_name = parsed.netloc 194 | s3_path = parsed.path 195 | # Remove '/' at beginning of path. 
196 | if s3_path.startswith("/"): 197 | s3_path = s3_path[1:] 198 | return bucket_name, s3_path 199 | 200 | 201 | def s3_request(func): 202 | """ 203 | Wrapper function for s3 requests in order to create more helpful error 204 | messages. 205 | """ 206 | 207 | @wraps(func) 208 | def wrapper(url, *args, **kwargs): 209 | try: 210 | return func(url, *args, **kwargs) 211 | except ClientError as exc: 212 | if int(exc.response["Error"]["Code"]) == 404: 213 | raise EnvironmentError("file {} not found".format(url)) 214 | else: 215 | raise 216 | 217 | return wrapper 218 | 219 | 220 | @s3_request 221 | def s3_etag(url, proxies=None): 222 | """Check ETag on S3 object.""" 223 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 224 | bucket_name, s3_path = split_s3_path(url) 225 | s3_object = s3_resource.Object(bucket_name, s3_path) 226 | return s3_object.e_tag 227 | 228 | 229 | @s3_request 230 | def s3_get(url, temp_file, proxies=None): 231 | """Pull a file directly from S3.""" 232 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 233 | bucket_name, s3_path = split_s3_path(url) 234 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 235 | 236 | 237 | def http_get(url, temp_file, proxies=None): 238 | req = requests.get(url, stream=True, proxies=proxies) 239 | content_length = req.headers.get('Content-Length') 240 | total = int(content_length) if content_length is not None else None 241 | progress = tqdm(unit="B", total=total) 242 | for chunk in req.iter_content(chunk_size=1024): 243 | if chunk: # filter out keep-alive new chunks 244 | progress.update(len(chunk)) 245 | temp_file.write(chunk) 246 | progress.close() 247 | 248 | 249 | def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): 250 | """ 251 | Given a URL, look for the corresponding dataset in the local cache. 252 | If it's not there, download it. Then return the path to the cached file. 253 | """ 254 | if cache_dir is None: 255 | cache_dir = TRANSFORMERS_CACHE 256 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 257 | cache_dir = str(cache_dir) 258 | if sys.version_info[0] == 2 and not isinstance(cache_dir, str): 259 | cache_dir = str(cache_dir) 260 | 261 | if not os.path.exists(cache_dir): 262 | os.makedirs(cache_dir) 263 | 264 | # Get eTag to add to filename, if it exists. 265 | if url.startswith("s3://"): 266 | etag = s3_etag(url, proxies=proxies) 267 | else: 268 | try: 269 | response = requests.head(url, allow_redirects=True, proxies=proxies) 270 | if response.status_code != 200: 271 | etag = None 272 | else: 273 | etag = response.headers.get("ETag") 274 | except EnvironmentError: 275 | etag = None 276 | 277 | if sys.version_info[0] == 2 and etag is not None: 278 | etag = etag.decode('utf-8') 279 | filename = url_to_filename(url, etag) 280 | 281 | # get cache path to put the file 282 | cache_path = os.path.join(cache_dir, filename) 283 | 284 | # If we don't have a connection (etag is None) and can't identify the file 285 | # try to get the last downloaded one 286 | if not os.path.exists(cache_path) and etag is None: 287 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 288 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 289 | if matching_files: 290 | cache_path = os.path.join(cache_dir, matching_files[-1]) 291 | 292 | if not os.path.exists(cache_path) or force_download: 293 | # Download to temporary file, then copy to cache dir once finished. 
294 | # Otherwise you get corrupt cache entries if the download gets interrupted. 295 | with tempfile.NamedTemporaryFile() as temp_file: 296 | logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) 297 | 298 | # GET file object 299 | if url.startswith("s3://"): 300 | s3_get(url, temp_file, proxies=proxies) 301 | else: 302 | http_get(url, temp_file, proxies=proxies) 303 | 304 | # we are copying the file before closing it, so flush to avoid truncation 305 | temp_file.flush() 306 | # shutil.copyfileobj() starts at the current position, so go to the start 307 | temp_file.seek(0) 308 | 309 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 310 | with open(cache_path, 'wb') as cache_file: 311 | shutil.copyfileobj(temp_file, cache_file) 312 | 313 | logger.info("creating metadata file for %s", cache_path) 314 | meta = {'url': url, 'etag': etag} 315 | meta_path = cache_path + '.json' 316 | with open(meta_path, 'w') as meta_file: 317 | output_string = json.dumps(meta) 318 | if sys.version_info[0] == 2 and isinstance(output_string, str): 319 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 320 | meta_file.write(output_string) 321 | 322 | logger.info("removing temp file %s", temp_file.name) 323 | 324 | return cache_path 325 | -------------------------------------------------------------------------------- /glue1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ GLUE processors and helpers """ 17 | 18 | import logging 19 | import os 20 | import numpy as np 21 | from utils import DataProcessor, InputExample, InputFeatures 22 | from file_utils1 import is_tf_available 23 | 24 | if is_tf_available(): 25 | import tensorflow as tf 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def glue_convert_examples_to_features(examples, tokenizer, 31 | max_length=512, 32 | task=None, 33 | label_list=None, 34 | output_mode=None, 35 | pad_on_left=False, 36 | pad_token=0, 37 | pad_token_segment_id=0, 38 | mask_padding_with_zero=True): 39 | """ 40 | Loads a data file into a list of ``InputFeatures`` 41 | 42 | Args: 43 | examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. 44 | tokenizer: Instance of a tokenizer that will tokenize the examples 45 | max_length: Maximum example length 46 | task: GLUE task 47 | label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method 48 | output_mode: String indicating the output mode. 
Either ``regression`` or ``classification`` 49 | pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) 50 | pad_token: Padding token 51 | pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) 52 | mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values 53 | and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for 54 | actual values) 55 | 56 | Returns: 57 | If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` 58 | containing the task-specific features. If the input is a list of ``InputExamples``, will return 59 | a list of task-specific ``InputFeatures`` which can be fed to the model. 60 | 61 | """ 62 | is_tf_dataset = False 63 | if is_tf_available() and isinstance(examples, tf.data.Dataset): 64 | is_tf_dataset = True 65 | 66 | if task is not None: 67 | processor = glue_processors[task]() 68 | if label_list is None: 69 | label_list = processor.get_labels() 70 | logger.info("Using label list %s for task %s" % (label_list, task)) 71 | if output_mode is None: 72 | output_mode = glue_output_modes[task] 73 | logger.info("Using output mode %s for task %s" % (output_mode, task)) 74 | 75 | label_map = {label: i for i, label in enumerate(label_list)} 76 | 77 | features = [] 78 | for (ex_index, example) in enumerate(examples): 79 | if ex_index % 10000 == 0: 80 | logger.info("Writing example %d" % (ex_index)) 81 | if is_tf_dataset: 82 | example = processor.get_example_from_tensor_dict(example) 83 | 84 | inputs = tokenizer.encode_plus( 85 | example.text_a, 86 | example.text_b, 87 | add_special_tokens=True, 88 | max_length=max_length, 89 | ) 90 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 91 | 92 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 93 | # tokens are attended to. 94 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 95 | 96 | # Zero-pad up to the sequence length. 
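        # Illustrative numbers: with max_length=8 and 5 encoded ids, padding_length is 3, so three
        # pad_token ids are appended on the right (the default) and, with mask_padding_with_zero=True,
        # the attention mask becomes [1, 1, 1, 1, 1, 0, 0, 0].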
97 | padding_length = max_length - len(input_ids) 98 | if pad_on_left: 99 | input_ids = ([pad_token] * padding_length) + input_ids 100 | attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask 101 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 102 | else: 103 | input_ids = input_ids + ([pad_token] * padding_length) 104 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 105 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 106 | 107 | assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) 108 | assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) 109 | assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) 110 | 111 | if output_mode == "classification": 112 | label = label_map[example.label] 113 | elif output_mode == "regression": 114 | label = example.label 115 | elif output_mode == "MultiLabelclassification": 116 | label = example.label 117 | else: 118 | raise KeyError(output_mode) 119 | 120 | if ex_index < 5: 121 | logger.info("*** Example ***") 122 | logger.info("guid: %s" % (example.guid)) 123 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 124 | logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) 125 | logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) 126 | # logger.info("label: %s (id = %d)" % (example.label, label)) 127 | 128 | features.append( 129 | InputFeatures(input_ids=input_ids, 130 | attention_mask=attention_mask, 131 | token_type_ids=token_type_ids, 132 | label=label)) 133 | 134 | if is_tf_available() and is_tf_dataset: 135 | def gen(): 136 | for ex in features: 137 | yield ({'input_ids': ex.input_ids, 138 | 'attention_mask': ex.attention_mask, 139 | 'token_type_ids': ex.token_type_ids}, 140 | ex.label) 141 | 142 | return tf.data.Dataset.from_generator(gen, 143 | ({'input_ids': tf.int32, 144 | 'attention_mask': tf.int32, 145 | 'token_type_ids': tf.int32}, 146 | tf.int64), 147 | ({'input_ids': tf.TensorShape([None]), 148 | 'attention_mask': tf.TensorShape([None]), 149 | 'token_type_ids': tf.TensorShape([None])}, 150 | tf.TensorShape([]))) 151 | 152 | return features 153 | 154 | 155 | class MrpcProcessor(DataProcessor): 156 | """Processor for the MRPC data set (GLUE version).""" 157 | 158 | def get_example_from_tensor_dict(self, tensor_dict): 159 | """See base class.""" 160 | return InputExample(tensor_dict['idx'].numpy(), 161 | tensor_dict['sentence1'].numpy().decode('utf-8'), 162 | tensor_dict['sentence2'].numpy().decode('utf-8'), 163 | str(tensor_dict['label'].numpy())) 164 | 165 | def get_train_examples(self, data_dir): 166 | """See base class.""" 167 | logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) 168 | return self._create_examples( 169 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 170 | 171 | def get_dev_examples(self, data_dir): 172 | """See base class.""" 173 | return self._create_examples( 174 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 175 | 176 | def get_labels(self): 177 | """See base class.""" 178 | return ["0", "1"] 179 | 180 | def _create_examples(self, lines, set_type): 181 | """Creates examples for the training and dev sets.""" 182 | examples = [] 183 | for (i, line) in enumerate(lines): 184 | if i == 0: 185 | 
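                # The first row of the tsv file is a column header rather than an example, so skip it.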
continue 186 | guid = "%s-%s" % (set_type, i) 187 | text_a = line[3] 188 | text_b = line[4] 189 | label = line[0] 190 | examples.append( 191 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 192 | return examples 193 | 194 | class StsbProcessor(DataProcessor): 195 | """Processor for the STS-B data set (GLUE version).""" 196 | 197 | def get_example_from_tensor_dict(self, tensor_dict): 198 | """See base class.""" 199 | return InputExample(tensor_dict['idx'].numpy(), 200 | tensor_dict['sentence1'].numpy().decode('utf-8'), 201 | tensor_dict['sentence2'].numpy().decode('utf-8'), 202 | str(tensor_dict['label'].numpy())) 203 | 204 | def get_train_examples(self, data_dir): 205 | """See base class.""" 206 | return self._create_examples( 207 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 208 | 209 | def get_dev_examples(self, data_dir): 210 | """See base class.""" 211 | return self._create_examples( 212 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 213 | 214 | def get_labels(self): 215 | """See base class.""" 216 | return [None] 217 | 218 | def _create_examples(self, lines, set_type): 219 | """Creates examples for the training and dev sets.""" 220 | examples = [] 221 | for (i, line) in enumerate(lines): 222 | if i == 0: 223 | continue 224 | guid = "%s-%s" % (set_type, line[0]) 225 | text_a = line[7] 226 | text_b = line[8] 227 | label = line[-1] 228 | examples.append( 229 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 230 | return examples 231 | 232 | class MultiLabelProcessor(DataProcessor): 233 | """Processor for the MultiNLI data set (GLUE version).""" 234 | 235 | def get_example_from_tensor_dict(self, tensor_dict): 236 | """See base class.""" 237 | return InputExample(tensor_dict['idx'].numpy(), 238 | tensor_dict['premise'].numpy().decode('utf-8'), 239 | tensor_dict['hypothesis'].numpy().decode('utf-8'), 240 | str(tensor_dict['label'].numpy())) 241 | 242 | def get_train_examples(self, data_dir): 243 | """See base class.""" 244 | return self._create_examples( 245 | self._read_tsv(os.path.join(data_dir, "train_data.tsv")), "train") 246 | 247 | def get_dev_examples(self, data_dir): 248 | """See base class.""" 249 | return self._create_examples( 250 | self._read_tsv(os.path.join(data_dir, "eval_data.tsv")), 251 | "dev_matched") 252 | 253 | def get_labels(self): 254 | """See base class.""" 255 | return ["0", "1", "2","3","4","5","6", "7", "8","9","10","11","12", "13", "14","15"] 256 | 257 | def _create_examples(self, lines, set_type): 258 | """Creates examples for the training and dev sets.""" 259 | examples = [] 260 | for (i, line) in enumerate(lines): 261 | guid = "%s" % (set_type) 262 | text_a = line[0] 263 | label=np.zeros((16,), dtype=int) 264 | label_sum=["inform_theater","inform_starttime","inform_numberofpeople","greeting","thanks","inform_other","request_moviename","inform_genre","request_ticket", 265 | "inform_city","inform_state","inform_date","inform_moviename","confirm_answer","inform_zip","inform_video_format"] 266 | for i in range(16): 267 | if label_sum[i] in line: 268 | label[i]=1 269 | examples.append( 270 | InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 271 | 272 | return examples 273 | 274 | glue_tasks_num_labels = { 275 | "mrpc": 2, 276 | "sts-b": 1, 277 | "multilabel":16, 278 | } 279 | 280 | glue_processors = { 281 | "mrpc": MrpcProcessor, 282 | "sts-b": StsbProcessor, 283 | "multilabel": MultiLabelProcessor, 284 | } 285 | 286 | glue_output_modes = { 287 | "mrpc": "classification", 288 | 
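    # The output mode controls how glue_convert_examples_to_features handles labels:
    # "classification" maps a label string to its index, "regression" keeps a float value, and the
    # custom "MultiLabelclassification" mode passes the label through unchanged. For the
    # "multilabel" task that label is the 16-dim 0/1 vector built by
    # MultiLabelProcessor._create_examples above; e.g. (illustrative) an utterance tagged
    # greeting and thanks becomes [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].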
"sts-b": "regression", 289 | "multilabel": "MultiLabelclassification", 290 | } 291 | -------------------------------------------------------------------------------- /tokenization_bert1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from tokenization_utils1 import PreTrainedTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 37 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 38 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 39 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 40 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 41 | 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", 42 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", 43 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", 44 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", 45 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", 46 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 47 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", 48 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", 49 | } 50 | } 51 | 52 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 53 | 'bert-base-uncased': 512, 54 | 
'bert-large-uncased': 512, 55 | 'bert-base-cased': 512, 56 | 'bert-large-cased': 512, 57 | 'bert-base-multilingual-uncased': 512, 58 | 'bert-base-multilingual-cased': 512, 59 | 'bert-base-chinese': 512, 60 | 'bert-base-german-cased': 512, 61 | 'bert-large-uncased-whole-word-masking': 512, 62 | 'bert-large-cased-whole-word-masking': 512, 63 | 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, 64 | 'bert-large-cased-whole-word-masking-finetuned-squad': 512, 65 | 'bert-base-cased-finetuned-mrpc': 512, 66 | 'bert-base-german-dbmdz-cased': 512, 67 | 'bert-base-german-dbmdz-uncased': 512, 68 | } 69 | 70 | PRETRAINED_INIT_CONFIGURATION = { 71 | 'bert-base-uncased': {'do_lower_case': True}, 72 | 'bert-large-uncased': {'do_lower_case': True}, 73 | 'bert-base-cased': {'do_lower_case': False}, 74 | 'bert-large-cased': {'do_lower_case': False}, 75 | 'bert-base-multilingual-uncased': {'do_lower_case': True}, 76 | 'bert-base-multilingual-cased': {'do_lower_case': False}, 77 | 'bert-base-chinese': {'do_lower_case': False}, 78 | 'bert-base-german-cased': {'do_lower_case': False}, 79 | 'bert-large-uncased-whole-word-masking': {'do_lower_case': True}, 80 | 'bert-large-cased-whole-word-masking': {'do_lower_case': False}, 81 | 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, 82 | 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, 83 | 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, 84 | 'bert-base-german-dbmdz-cased': {'do_lower_case': False}, 85 | 'bert-base-german-dbmdz-uncased': {'do_lower_case': True}, 86 | } 87 | 88 | 89 | def load_vocab(vocab_file): 90 | """Loads a vocabulary file into a dictionary.""" 91 | vocab = collections.OrderedDict() 92 | with open(vocab_file, "r", encoding="utf-8") as reader: 93 | tokens = reader.readlines() 94 | for index, token in enumerate(tokens): 95 | token = token.rstrip('\n') 96 | vocab[token] = index 97 | return vocab 98 | 99 | 100 | def whitespace_tokenize(text): 101 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 102 | text = text.strip() 103 | if not text: 104 | return [] 105 | tokens = text.split() 106 | return tokens 107 | 108 | 109 | class BertTokenizer(PreTrainedTokenizer): 110 | r""" 111 | Constructs a BertTokenizer. 112 | :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece 113 | 114 | Args: 115 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 116 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 117 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 118 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 119 | minimum of this value (if specified) and the underlying BERT model's sequence length. 120 | never_split: List of tokens which will never be split during tokenization. 
Only has an effect when 121 | do_wordpiece_only=False 122 | """ 123 | 124 | vocab_files_names = VOCAB_FILES_NAMES 125 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 126 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 127 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 128 | 129 | def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, 130 | unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", 131 | mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): 132 | """Constructs a BertTokenizer. 133 | 134 | Args: 135 | **vocab_file**: Path to a one-wordpiece-per-line vocabulary file 136 | **do_lower_case**: (`optional`) boolean (default True) 137 | Whether to lower case the input 138 | Only has an effect when do_basic_tokenize=True 139 | **do_basic_tokenize**: (`optional`) boolean (default True) 140 | Whether to do basic tokenization before wordpiece. 141 | **never_split**: (`optional`) list of string 142 | List of tokens which will never be split during tokenization. 143 | Only has an effect when do_basic_tokenize=True 144 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 145 | Whether to tokenize Chinese characters. 146 | This should likely be deactivated for Japanese: 147 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 148 | """ 149 | super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, 150 | pad_token=pad_token, cls_token=cls_token, 151 | mask_token=mask_token, **kwargs) 152 | self.max_len_single_sentence = self.max_len - 2 # take into account special tokens 153 | self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens 154 | 155 | if not os.path.isfile(vocab_file): 156 | raise ValueError( 157 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 158 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 159 | self.vocab = load_vocab(vocab_file) 160 | self.ids_to_tokens = collections.OrderedDict( 161 | [(ids, tok) for tok, ids in self.vocab.items()]) 162 | self.do_basic_tokenize = do_basic_tokenize 163 | if do_basic_tokenize: 164 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 165 | never_split=never_split, 166 | tokenize_chinese_chars=tokenize_chinese_chars) 167 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) 168 | 169 | @property 170 | def vocab_size(self): 171 | return len(self.vocab) 172 | 173 | def _tokenize(self, text): 174 | split_tokens = [] 175 | if self.do_basic_tokenize: 176 | for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): 177 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 178 | split_tokens.append(sub_token) 179 | else: 180 | split_tokens = self.wordpiece_tokenizer.tokenize(text) 181 | return split_tokens 182 | 183 | def _convert_token_to_id(self, token): 184 | """ Converts a token (str/unicode) in an id using the vocab. """ 185 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 186 | 187 | def _convert_id_to_token(self, index): 188 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 189 | return self.ids_to_tokens.get(index, self.unk_token) 190 | 191 | def convert_tokens_to_string(self, tokens): 192 | """ Converts a sequence of tokens (string) in a single string. 
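            For example, ["un", "##aff", "##able"] is joined back into "unaffable".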
""" 193 | out_string = ' '.join(tokens).replace(' ##', '').strip() 194 | return out_string 195 | 196 | def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): 197 | """ 198 | Build model inputs from a sequence or a pair of sequence for sequence classification tasks 199 | by concatenating and adding special tokens. 200 | A BERT sequence has the following format: 201 | single sequence: [CLS] X [SEP] 202 | pair of sequences: [CLS] A [SEP] B [SEP] 203 | """ 204 | if token_ids_1 is None: 205 | return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] 206 | cls = [self.cls_token_id] 207 | sep = [self.sep_token_id] 208 | return cls + token_ids_0 + sep + token_ids_1 + sep 209 | 210 | def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): 211 | """ 212 | Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding 213 | special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 214 | 215 | Args: 216 | token_ids_0: list of ids (must not contain special tokens) 217 | token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids 218 | for sequence pairs 219 | already_has_special_tokens: (default False) Set to True if the token list is already formated with 220 | special tokens for the model 221 | 222 | Returns: 223 | A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. 224 | """ 225 | 226 | if already_has_special_tokens: 227 | if token_ids_1 is not None: 228 | raise ValueError("You should not supply a second sequence if the provided sequence of " 229 | "ids is already formated with special tokens for the model.") 230 | return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) 231 | 232 | if token_ids_1 is not None: 233 | return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] 234 | return [1] + ([0] * len(token_ids_0)) + [1] 235 | 236 | def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): 237 | """ 238 | Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 239 | A BERT sequence pair mask has the following format: 240 | 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 241 | | first sequence | second sequence 242 | 243 | if token_ids_1 is None, only returns the first portion of the mask (0's). 244 | """ 245 | sep = [self.sep_token_id] 246 | cls = [self.cls_token_id] 247 | if token_ids_1 is None: 248 | return len(cls + token_ids_0 + sep) * [0] 249 | return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] 250 | 251 | def save_vocabulary(self, vocab_path): 252 | """Save the tokenizer vocabulary to a directory or file.""" 253 | index = 0 254 | if os.path.isdir(vocab_path): 255 | vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) 256 | else: 257 | vocab_file = vocab_path 258 | with open(vocab_file, "w", encoding="utf-8") as writer: 259 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 260 | if index != token_index: 261 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 
262 | " Please check that the vocabulary is not corrupted!".format(vocab_file)) 263 | index = token_index 264 | writer.write(token + u'\n') 265 | index += 1 266 | return (vocab_file,) 267 | 268 | 269 | class BasicTokenizer(object): 270 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 271 | 272 | def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): 273 | """ Constructs a BasicTokenizer. 274 | 275 | Args: 276 | **do_lower_case**: Whether to lower case the input. 277 | **never_split**: (`optional`) list of str 278 | Kept for backward compatibility purposes. 279 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 280 | List of token not to split. 281 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 282 | Whether to tokenize Chinese characters. 283 | This should likely be deactivated for Japanese: 284 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 285 | """ 286 | if never_split is None: 287 | never_split = [] 288 | self.do_lower_case = do_lower_case 289 | self.never_split = never_split 290 | self.tokenize_chinese_chars = tokenize_chinese_chars 291 | 292 | def tokenize(self, text, never_split=None): 293 | """ Basic Tokenization of a piece of text. 294 | Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. 295 | 296 | Args: 297 | **never_split**: (`optional`) list of str 298 | Kept for backward compatibility purposes. 299 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 300 | List of token not to split. 301 | """ 302 | never_split = self.never_split + (never_split if never_split is not None else []) 303 | text = self._clean_text(text) 304 | # This was added on November 1st, 2018 for the multilingual and Chinese 305 | # models. This is also applied to the English models now, but it doesn't 306 | # matter since the English models were not trained on any Chinese data 307 | # and generally don't have any Chinese data in them (there are Chinese 308 | # characters in the vocabulary because Wikipedia does have some Chinese 309 | # words in the English Wikipedia.). 
310 | if self.tokenize_chinese_chars: 311 | text = self._tokenize_chinese_chars(text) 312 | orig_tokens = whitespace_tokenize(text) 313 | split_tokens = [] 314 | for token in orig_tokens: 315 | if self.do_lower_case and token not in never_split: 316 | token = token.lower() 317 | token = self._run_strip_accents(token) 318 | split_tokens.extend(self._run_split_on_punc(token)) 319 | 320 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 321 | return output_tokens 322 | 323 | def _run_strip_accents(self, text): 324 | """Strips accents from a piece of text.""" 325 | text = unicodedata.normalize("NFD", text) 326 | output = [] 327 | for char in text: 328 | cat = unicodedata.category(char) 329 | if cat == "Mn": 330 | continue 331 | output.append(char) 332 | return "".join(output) 333 | 334 | def _run_split_on_punc(self, text, never_split=None): 335 | """Splits punctuation on a piece of text.""" 336 | if never_split is not None and text in never_split: 337 | return [text] 338 | chars = list(text) 339 | i = 0 340 | start_new_word = True 341 | output = [] 342 | while i < len(chars): 343 | char = chars[i] 344 | if _is_punctuation(char): 345 | output.append([char]) 346 | start_new_word = True 347 | else: 348 | if start_new_word: 349 | output.append([]) 350 | start_new_word = False 351 | output[-1].append(char) 352 | i += 1 353 | 354 | return ["".join(x) for x in output] 355 | 356 | def _tokenize_chinese_chars(self, text): 357 | """Adds whitespace around any CJK character.""" 358 | output = [] 359 | for char in text: 360 | cp = ord(char) 361 | if self._is_chinese_char(cp): 362 | output.append(" ") 363 | output.append(char) 364 | output.append(" ") 365 | else: 366 | output.append(char) 367 | return "".join(output) 368 | 369 | def _is_chinese_char(self, cp): 370 | """Checks whether CP is the codepoint of a CJK character.""" 371 | # This defines a "chinese character" as anything in the CJK Unicode block: 372 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 373 | # 374 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 375 | # despite its name. The modern Korean Hangul alphabet is a different block, 376 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 377 | # space-separated words, so they are not treated specially and handled 378 | # like the all of the other languages. 379 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 380 | (cp >= 0x3400 and cp <= 0x4DBF) or # 381 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 382 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 383 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 384 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 385 | (cp >= 0xF900 and cp <= 0xFAFF) or # 386 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 387 | return True 388 | 389 | return False 390 | 391 | def _clean_text(self, text): 392 | """Performs invalid character removal and whitespace cleanup on text.""" 393 | output = [] 394 | for char in text: 395 | cp = ord(char) 396 | if cp == 0 or cp == 0xfffd or _is_control(char): 397 | continue 398 | if _is_whitespace(char): 399 | output.append(" ") 400 | else: 401 | output.append(char) 402 | return "".join(output) 403 | 404 | 405 | class WordpieceTokenizer(object): 406 | """Runs WordPiece tokenization.""" 407 | 408 | def __init__(self, vocab, unk_token, max_input_chars_per_word=100): 409 | self.vocab = vocab 410 | self.unk_token = unk_token 411 | self.max_input_chars_per_word = max_input_chars_per_word 412 | 413 | def tokenize(self, text): 414 | """Tokenizes a piece of text into its word pieces. 
415 | 416 | This uses a greedy longest-match-first algorithm to perform tokenization 417 | using the given vocabulary. 418 | 419 | For example: 420 | input = "unaffable" 421 | output = ["un", "##aff", "##able"] 422 | 423 | Args: 424 | text: A single token or whitespace separated tokens. This should have 425 | already been passed through `BasicTokenizer`. 426 | 427 | Returns: 428 | A list of wordpiece tokens. 429 | """ 430 | 431 | output_tokens = [] 432 | for token in whitespace_tokenize(text): 433 | chars = list(token) 434 | if len(chars) > self.max_input_chars_per_word: 435 | output_tokens.append(self.unk_token) 436 | continue 437 | 438 | is_bad = False 439 | start = 0 440 | sub_tokens = [] 441 | while start < len(chars): 442 | end = len(chars) 443 | cur_substr = None 444 | while start < end: 445 | substr = "".join(chars[start:end]) 446 | if start > 0: 447 | substr = "##" + substr 448 | if substr in self.vocab: 449 | cur_substr = substr 450 | break 451 | end -= 1 452 | if cur_substr is None: 453 | is_bad = True 454 | break 455 | sub_tokens.append(cur_substr) 456 | start = end 457 | 458 | if is_bad: 459 | output_tokens.append(self.unk_token) 460 | else: 461 | output_tokens.extend(sub_tokens) 462 | return output_tokens 463 | 464 | 465 | def _is_whitespace(char): 466 | """Checks whether `chars` is a whitespace character.""" 467 | # \t, \n, and \r are technically contorl characters but we treat them 468 | # as whitespace since they are generally considered as such. 469 | if char == " " or char == "\t" or char == "\n" or char == "\r": 470 | return True 471 | cat = unicodedata.category(char) 472 | if cat == "Zs": 473 | return True 474 | return False 475 | 476 | 477 | def _is_control(char): 478 | """Checks whether `chars` is a control character.""" 479 | # These are technically control characters but we count them as whitespace 480 | # characters. 481 | if char == "\t" or char == "\n" or char == "\r": 482 | return False 483 | cat = unicodedata.category(char) 484 | if cat.startswith("C"): 485 | return True 486 | return False 487 | 488 | 489 | def _is_punctuation(char): 490 | """Checks whether `chars` is a punctuation character.""" 491 | cp = ord(char) 492 | # We treat all non-letter/number ASCII as punctuation. 493 | # Characters such as "^", "$", and "`" are not in the Unicode 494 | # Punctuation class but we treat them as punctuation anyways, for 495 | # consistency. 496 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 497 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 498 | return True 499 | cat = unicodedata.category(char) 500 | if cat.startswith("P"): 501 | return True 502 | return False 503 | -------------------------------------------------------------------------------- /modeling_roberta1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch RoBERTa model. """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | import torch 24 | import torch.nn as nn 25 | from torch.nn import CrossEntropyLoss, MSELoss 26 | 27 | from modeling_bert1 import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu 28 | from configuration_roberta1 import RobertaConfig 29 | from file_utils1 import add_start_docstrings 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { 34 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", 35 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", 36 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", 37 | } 38 | 39 | 40 | class RobertaEmbeddings(BertEmbeddings): 41 | """ 42 | Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 43 | """ 44 | 45 | def __init__(self, config): 46 | super(RobertaEmbeddings, self).__init__(config) 47 | self.padding_idx = 1 48 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) 49 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, 50 | padding_idx=self.padding_idx) 51 | 52 | def forward(self, input_ids, token_type_ids=None, position_ids=None): 53 | seq_length = input_ids.size(1) 54 | if position_ids is None: 55 | # Position numbers begin at padding_idx+1. Padding symbols are ignored. 56 | # cf. fairseq's `utils.make_positions` 57 | position_ids = torch.arange(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=torch.long, 58 | device=input_ids.device) 59 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids) 60 | return super(RobertaEmbeddings, self).forward(input_ids, 61 | token_type_ids=token_type_ids, 62 | position_ids=position_ids) 63 | 64 | 65 | ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in 66 | `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ 67 | by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, 68 | Veselin Stoyanov. It is based on Google's BERT model released in 2018. 69 | 70 | It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining 71 | objective and training with much larger mini-batches and learning rates. 72 | 73 | This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained 74 | models. 75 | 76 | This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and 77 | refer to the PyTorch documentation for all matter related to general usage and behavior. 78 | 79 | .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`: 80 | https://arxiv.org/abs/1907.11692 81 | 82 | .. _`torch.nn.Module`: 83 | https://pytorch.org/docs/stable/nn.html#module 84 | 85 | Parameters: 86 | config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 87 | model. Initializing with a config file does not load the weights associated with the model, only the configuration. 88 | Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
89 | """ 90 | 91 | ROBERTA_INPUTS_DOCSTRING = r""" 92 | Inputs: 93 | **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 94 | Indices of input sequence tokens in the vocabulary. 95 | To match pre-training, RoBERTa input sequence should be formatted with and tokens as follows: 96 | 97 | (a) For sequence pairs: 98 | 99 | ``tokens: Is this Jacksonville ? No it is not . `` 100 | 101 | (b) For single sequences: 102 | 103 | ``tokens: the dog is hairy . `` 104 | 105 | Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with 106 | the ``add_special_tokens`` parameter set to ``True``. 107 | 108 | RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on 109 | the right rather than the left. 110 | 111 | See :func:`transformers.PreTrainedTokenizer.encode` and 112 | :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 113 | **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: 114 | Mask to avoid performing attention on padding token indices. 115 | Mask values selected in ``[0, 1]``: 116 | ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 117 | **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 118 | Optional segment token indices to indicate first and second portions of the inputs. 119 | This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it 120 | during finetuning. 121 | Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` 122 | corresponds to a `sentence B` token 123 | (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). 124 | **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 125 | Indices of positions of each input sequence tokens in the position embeddings. 126 | Selected in the range ``[0, config.max_position_embeddings - 1[``. 127 | **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: 128 | Mask to nullify selected heads of the self-attention modules. 129 | Mask values selected in ``[0, 1]``: 130 | ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 131 | """ 132 | 133 | 134 | @add_start_docstrings( 135 | "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", 136 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 137 | class RobertaModel(BertModel): 138 | r""" 139 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 140 | **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` 141 | Sequence of hidden-states at the output of the last layer of the model. 142 | **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` 143 | Last layer hidden-state of the first token of the sequence (classification token) 144 | further processed by a Linear layer and a Tanh activation function. The Linear 145 | layer weights are trained from the next sentence prediction (classification) 146 | objective during Bert pretraining. This output is usually *not* a good summary 147 | of the semantic content of the input, you're often better with averaging or pooling 148 | the sequence of hidden-states for the whole input sequence. 
149 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 150 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 151 | of shape ``(batch_size, sequence_length, hidden_size)``: 152 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 153 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 154 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 155 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 156 | 157 | Examples:: 158 | 159 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 160 | model = RobertaModel.from_pretrained('roberta-base') 161 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 162 | outputs = model(input_ids) 163 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 164 | 165 | """ 166 | config_class = RobertaConfig 167 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 168 | base_model_prefix = "roberta" 169 | 170 | def __init__(self, config): 171 | super(RobertaModel, self).__init__(config) 172 | 173 | self.embeddings = RobertaEmbeddings(config) 174 | self.init_weights() 175 | 176 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): 177 | if input_ids[:, 0].sum().item() != 0: 178 | logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. " 179 | "This model requires special tokens in order to work. " 180 | "Please specify add_special_tokens=True in your tokenize.encode()" 181 | "or tokenizer.convert_tokens_to_ids().") 182 | return super(RobertaModel, self).forward(input_ids, 183 | attention_mask=attention_mask, 184 | token_type_ids=token_type_ids, 185 | position_ids=position_ids, 186 | head_mask=head_mask) 187 | 188 | 189 | @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, 190 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 191 | class RobertaForMaskedLM(BertPreTrainedModel): 192 | r""" 193 | **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: 194 | Labels for computing the masked language modeling loss. 195 | Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) 196 | Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels 197 | in ``[0, ..., config.vocab_size]`` 198 | 199 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 200 | **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 201 | Masked language modeling loss. 202 | **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` 203 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 204 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 205 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 206 | of shape ``(batch_size, sequence_length, hidden_size)``: 207 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
208 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 209 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 210 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 211 | 212 | Examples:: 213 | 214 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 215 | model = RobertaForMaskedLM.from_pretrained('roberta-base') 216 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 217 | outputs = model(input_ids, masked_lm_labels=input_ids) 218 | loss, prediction_scores = outputs[:2] 219 | 220 | """ 221 | config_class = RobertaConfig 222 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 223 | base_model_prefix = "roberta" 224 | 225 | def __init__(self, config): 226 | super(RobertaForMaskedLM, self).__init__(config) 227 | 228 | self.roberta = RobertaModel(config) 229 | self.lm_head = RobertaLMHead(config) 230 | 231 | self.init_weights() 232 | self.tie_weights() 233 | 234 | def tie_weights(self): 235 | """ Make sure we are sharing the input and output embeddings. 236 | Export to TorchScript can't handle parameter sharing so we are cloning them instead. 237 | """ 238 | self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings) 239 | 240 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, 241 | masked_lm_labels=None): 242 | outputs = self.roberta(input_ids, 243 | attention_mask=attention_mask, 244 | token_type_ids=token_type_ids, 245 | position_ids=position_ids, 246 | head_mask=head_mask) 247 | sequence_output = outputs[0] 248 | prediction_scores = self.lm_head(sequence_output) 249 | 250 | outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here 251 | 252 | if masked_lm_labels is not None: 253 | loss_fct = CrossEntropyLoss(ignore_index=-1) 254 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) 255 | outputs = (masked_lm_loss,) + outputs 256 | 257 | return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) 258 | 259 | 260 | class RobertaLMHead(nn.Module): 261 | """Roberta Head for masked language modeling.""" 262 | 263 | def __init__(self, config): 264 | super(RobertaLMHead, self).__init__() 265 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 266 | self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) 267 | 268 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 269 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 270 | 271 | def forward(self, features, **kwargs): 272 | x = self.dense(features) 273 | x = gelu(x) 274 | x = self.layer_norm(x) 275 | 276 | # project back to size of vocabulary with bias 277 | x = self.decoder(x) + self.bias 278 | 279 | return x 280 | 281 | 282 | @add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 283 | on top of the pooled output) e.g. for GLUE tasks. """, 284 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 285 | class RobertaForSequenceClassification(BertPreTrainedModel): 286 | r""" 287 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 288 | Labels for computing the sequence classification/regression loss. 
289 | Indices should be in ``[0, ..., config.num_labels]``. 290 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), 291 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 292 | 293 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 294 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 295 | Classification (or regression if config.num_labels==1) loss. 296 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` 297 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 298 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 299 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 300 | of shape ``(batch_size, sequence_length, hidden_size)``: 301 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 302 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 303 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 304 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 305 | 306 | Examples:: 307 | 308 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 309 | model = RobertaForSequenceClassification.from_pretrained('roberta-base') 310 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 311 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 312 | outputs = model(input_ids, labels=labels) 313 | loss, logits = outputs[:2] 314 | 315 | """ 316 | config_class = RobertaConfig 317 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 318 | base_model_prefix = "roberta" 319 | 320 | def __init__(self, config): 321 | super(RobertaForSequenceClassification, self).__init__(config) 322 | self.num_labels = config.num_labels 323 | 324 | self.roberta = RobertaModel(config) 325 | self.classifier = RobertaClassificationHead(config) 326 | 327 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, 328 | labels=None): 329 | outputs = self.roberta(input_ids, 330 | attention_mask=attention_mask, 331 | token_type_ids=token_type_ids, 332 | position_ids=position_ids, 333 | head_mask=head_mask) 334 | sequence_output = outputs[0] 335 | logits = self.classifier(sequence_output) 336 | 337 | outputs = (logits,) + outputs[2:] 338 | if labels is not None: 339 | if self.num_labels == 1: 340 | # We are doing regression 341 | loss_fct = MSELoss() 342 | loss = loss_fct(logits.view(-1), labels.view(-1)) 343 | else: 344 | loss_fct = CrossEntropyLoss() 345 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 346 | outputs = (loss,) + outputs 347 | 348 | return outputs # (loss), logits, (hidden_states), (attentions) 349 | 350 | 351 | @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of 352 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", 353 | ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) 354 | class RobertaForMultipleChoice(BertPreTrainedModel): 355 | r""" 356 | Inputs: 357 | **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: 358 | Indices of input sequence tokens in the vocabulary. 359 | The second dimension of the input (`num_choices`) indicates the number of choices to score. 360 | To match pre-training, RoBerta input sequence should be formatted with [CLS] and [SEP] tokens as follows: 361 | 362 | (a) For sequence pairs: 363 | 364 | ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]`` 365 | 366 | ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` 367 | 368 | (b) For single sequences: 369 | 370 | ``tokens: [CLS] the dog is hairy . [SEP]`` 371 | 372 | ``token_type_ids: 0 0 0 0 0 0 0`` 373 | 374 | Indices can be obtained using :class:`transformers.BertTokenizer`. 375 | See :func:`transformers.PreTrainedTokenizer.encode` and 376 | :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 377 | **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: 378 | Segment token indices to indicate first and second portions of the inputs. 379 | The second dimension of the input (`num_choices`) indicates the number of choices to score. 380 | Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` 381 | **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: 382 | Mask to avoid performing attention on padding token indices. 383 | The second dimension of the input (`num_choices`) indicates the number of choices to score. 384 | Mask values selected in ``[0, 1]``: 385 | ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 386 | **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: 387 | Mask to nullify selected heads of the self-attention modules. 388 | Mask values selected in ``[0, 1]``: 389 | ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 390 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 391 | Labels for computing the multiple choice classification loss. 392 | Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension 393 | of the input tensors. (see `input_ids` above) 394 | 395 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 396 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 397 | Classification loss. 398 | **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension 399 | of the input tensors. (see `input_ids` above). 400 | Classification scores (before SoftMax). 401 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 402 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 403 | of shape ``(batch_size, sequence_length, hidden_size)``: 404 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
405 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 406 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 407 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 408 | 409 | Examples:: 410 | 411 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 412 | model = RobertaForMultipleChoice.from_pretrained('roberta-base') 413 | choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] 414 | input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices 415 | labels = torch.tensor(1).unsqueeze(0) # Batch size 1 416 | outputs = model(input_ids, labels=labels) 417 | loss, classification_scores = outputs[:2] 418 | 419 | """ 420 | config_class = RobertaConfig 421 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 422 | base_model_prefix = "roberta" 423 | 424 | def __init__(self, config): 425 | super(RobertaForMultipleChoice, self).__init__(config) 426 | 427 | self.roberta = RobertaModel(config) 428 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 429 | self.classifier = nn.Linear(config.hidden_size, 1) 430 | 431 | self.init_weights() 432 | 433 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, 434 | position_ids=None, head_mask=None): 435 | num_choices = input_ids.shape[1] 436 | 437 | flat_input_ids = input_ids.view(-1, input_ids.size(-1)) 438 | flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None 439 | flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 440 | flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 441 | outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, 442 | attention_mask=flat_attention_mask, head_mask=head_mask) 443 | pooled_output = outputs[1] 444 | 445 | pooled_output = self.dropout(pooled_output) 446 | logits = self.classifier(pooled_output) 447 | reshaped_logits = logits.view(-1, num_choices) 448 | 449 | outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here 450 | 451 | if labels is not None: 452 | loss_fct = CrossEntropyLoss() 453 | loss = loss_fct(reshaped_logits, labels) 454 | outputs = (loss,) + outputs 455 | 456 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions) 457 | 458 | 459 | class RobertaClassificationHead(nn.Module): 460 | """Head for sentence-level classification tasks.""" 461 | 462 | def __init__(self, config): 463 | super(RobertaClassificationHead, self).__init__() 464 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 465 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 466 | self.out_proj = nn.Linear(config.hidden_size, config.num_labels) 467 | 468 | def forward(self, features, **kwargs): 469 | x = features[:, 0, :] # take token (equiv. 
to [CLS]) 470 | x = self.dropout(x) 471 | x = self.dense(x) 472 | x = torch.tanh(x) 473 | x = self.dropout(x) 474 | x = self.out_proj(x) 475 | return x 476 | -------------------------------------------------------------------------------- /run_glue.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa).""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | from sklearn.metrics import roc_curve, auc 20 | import math 21 | import argparse 22 | import glob 23 | import logging 24 | import os 25 | import random 26 | 27 | import numpy as np 28 | import torch 29 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 30 | TensorDataset) 31 | from torch.utils.data.distributed import DistributedSampler 32 | 33 | try: 34 | from torch.utils.tensorboard import SummaryWriter 35 | except: 36 | from tensorboardX import SummaryWriter 37 | 38 | from tqdm import tqdm, trange 39 | from file_utils1 import WEIGHTS_NAME 40 | from configuration_bert1 import BertConfig 41 | from modeling_bert1 import BertForMultiSequenceClassification 42 | from tokenization_bert1 import BertTokenizer 43 | from configuration_roberta1 import RobertaConfig 44 | from modeling_roberta1 import RobertaForSequenceClassification 45 | from tokenization_roberta1 import RobertaTokenizer 46 | # from transformers import (WEIGHTS_NAME, BertConfig, 47 | # BertForSequenceClassification, BertTokenizer, 48 | # RobertaConfig, 49 | # RobertaForSequenceClassification, 50 | # RobertaTokenizer, 51 | # XLMConfig, XLMForSequenceClassification, 52 | # XLMTokenizer, XLNetConfig, 53 | # XLNetForSequenceClassification, 54 | # XLNetTokenizer, 55 | # DistilBertConfig, 56 | # DistilBertForSequenceClassification, 57 | # DistilBertTokenizer) 58 | from optimization1 import AdamW,WarmupLinearSchedule 59 | # from transformers import AdamW, WarmupLinearSchedule 60 | from metrics1 import glue_compute_metrics as compute_metrics 61 | # from transformers import glue_compute_metrics as compute_metrics 62 | # from transformers import glue_output_modes as output_modes 63 | from glue1 import glue_output_modes as output_modes 64 | # from transformers import glue_processors as processors 65 | from glue1 import glue_processors as processors 66 | from glue1 import glue_convert_examples_to_features as convert_examples_to_features 67 | # from transformers import glue_convert_examples_to_features as convert_examples_to_features 68 | def sigmoid(x): 69 | return 1. 
/ (1 + np.exp(-x))
70 | logger = logging.getLogger(__name__)
71 | 
72 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,
73 | RobertaConfig)), ())
74 | 
75 | MODEL_CLASSES = {
76 | 'bert': (BertConfig, BertForMultiSequenceClassification, BertTokenizer),
77 | # 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
78 | # 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
79 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
80 | # 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
81 | }
82 | 
83 | 
84 | def set_seed(args):
85 | random.seed(args.seed)
86 | np.random.seed(args.seed)
87 | torch.manual_seed(args.seed)
88 | if args.n_gpu > 0:
89 | torch.cuda.manual_seed_all(args.seed)
90 | 
91 | # def softmax(inMatrix):
92 | # """
93 | # softmax function
94 | # :param inMatrix: input matrix
95 | # :return:
96 | # """
97 | # m,n = np.shape(inMatrix) # get m, n (rows, columns)
98 | # outMatrix = np.mat(np.zeros((m,n))) # np.mat builds the output matrix
99 | # for i in range(m):
100 | # soft_sum = 0
101 | # for idx in range(0,n):
102 | # outMatrix[i,idx] = math.exp(inMatrix[i,idx]) # exponentiate (base e) so every entry is non-negative
103 | # soft_sum +=outMatrix[i,idx] # accumulate the row sum
104 | # for idx in range(0,n):
105 | # outMatrix[i,idx] = outMatrix[i,idx] /soft_sum # normalize each entry by the row sum
106 | # return outMatrix
107 | 
108 | def train(args, train_dataset, model, tokenizer):
109 | """ Train the model """
110 | if args.local_rank in [-1, 0]:
111 | tb_writer = SummaryWriter()
112 | 
113 | # args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
114 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
115 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
116 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
117 | 
118 | if args.max_steps > 0:
119 | t_total = args.max_steps
120 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
121 | else:
122 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
123 | 
124 | # Prepare optimizer and schedule (linear warmup and decay)
125 | no_decay = ['bias', 'LayerNorm.weight']
126 | optimizer_grouped_parameters = [
127 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
128 | 'weight_decay': args.weight_decay},
129 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
130 | ]
131 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
132 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.06, t_total=t_total)
133 | if args.fp16:
134 | try:
135 | from apex import amp
136 | except ImportError:
137 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
138 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
139 | 
140 | # multi-gpu training (should be after apex fp16 initialization)
141 | if args.n_gpu > 1:
142 | model = torch.nn.DataParallel(model)
143 | 
144 | # Distributed training (should be after apex fp16 initialization)
145 | if args.local_rank != -1:
146 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
147 | output_device=args.local_rank,
148 | find_unused_parameters=True)
149 | 
150 | # Train!
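# The loop below works as follows: for each epoch, run a forward pass with labels to obtain the loss,
# average it across GPUs under DataParallel, divide it by gradient_accumulation_steps, and backpropagate
# (through amp.scale_loss when --fp16 is set). Every gradient_accumulation_steps batches the gradients are
# clipped to max_grad_norm, optimizer.step() and scheduler.step() are called, and the gradients are zeroed.
# Every logging_steps steps the learning rate and running loss (and, if --evaluate_during_training is set,
# the dev metrics) are written to TensorBoard, and every save_steps steps a checkpoint is saved to
# output_dir/checkpoint-<global_step>.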
151 | logger.info("***** Running training *****") 152 | logger.info(" Num examples = %d", len(train_dataset)) 153 | logger.info(" Num Epochs = %d", args.num_train_epochs) 154 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 155 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 156 | args.train_batch_size * args.gradient_accumulation_steps * ( 157 | torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 158 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 159 | logger.info(" Total optimization steps = %d", t_total) 160 | 161 | global_step = 0 162 | tr_loss, logging_loss = 0.0, 0.0 163 | model.zero_grad() 164 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 165 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 166 | 167 | for _ in train_iterator: 168 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) 169 | for step, batch in enumerate(epoch_iterator): 170 | model.train() 171 | batch = tuple(t.to(args.device) for t in batch) 172 | inputs = {'input_ids': batch[0], 173 | 'attention_mask': batch[1], 174 | 'labels': batch[3]} 175 | if args.model_type != 'distilbert': 176 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 177 | 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 178 | outputs = model(**inputs) 179 | loss = outputs[0] # model outputs are always tuple in transformers (see doc) 180 | 181 | if args.n_gpu > 1: 182 | loss = loss.mean() # mean() to average on multi-gpu parallel training 183 | if args.gradient_accumulation_steps > 1: 184 | loss = loss / args.gradient_accumulation_steps 185 | 186 | if args.fp16: 187 | with amp.scale_loss(loss, optimizer) as scaled_loss: 188 | scaled_loss.backward() 189 | else: 190 | loss.backward() 191 | tr_loss += loss.item() 192 | if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu: 193 | if args.fp16: 194 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 195 | else: 196 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 197 | 198 | optimizer.step() 199 | scheduler.step() # Update learning rate schedule 200 | model.zero_grad() 201 | global_step += 1 202 | 203 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 204 | # Log metrics 205 | if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well 206 | results = evaluate(args, model, tokenizer) 207 | for key, value in results.items(): 208 | tb_writer.add_scalar('eval_{}'.format(key), value, global_step) 209 | tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) 210 | tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) 211 | logging_loss = tr_loss 212 | 213 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: 214 | # Save model checkpoint 215 | output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) 216 | if not os.path.exists(output_dir): 217 | os.makedirs(output_dir) 218 | model_to_save = model.module if hasattr(model, 219 | 'module') else model # Take care of distributed/parallel training 220 | model_to_save.save_pretrained(output_dir) 221 | torch.save(args, os.path.join(output_dir, 'training_args.bin')) 
222 | logger.info("Saving model checkpoint to %s", output_dir) 223 | 224 | if args.tpu: 225 | args.xla_model.optimizer_step(optimizer, barrier=True) 226 | model.zero_grad() 227 | global_step += 1 228 | 229 | if args.max_steps > 0 and global_step > args.max_steps: 230 | epoch_iterator.close() 231 | break 232 | if args.max_steps > 0 and global_step > args.max_steps: 233 | train_iterator.close() 234 | break 235 | eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) 236 | eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else ( 237 | args.output_dir,) 238 | 239 | if args.local_rank in [-1, 0]: 240 | tb_writer.close() 241 | 242 | return global_step, tr_loss / global_step 243 | 244 | 245 | def evaluate(args, model, tokenizer, prefix=""): 246 | # Loop to handle MNLI double evaluation (matched, mis-matched) 247 | eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) 248 | eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) 249 | 250 | results = {} 251 | for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): 252 | eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True) 253 | 254 | if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: 255 | os.makedirs(eval_output_dir) 256 | 257 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 258 | # Note that DistributedSampler samples randomly 259 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 260 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) 261 | 262 | # Eval! 
263 | logger.info("***** Running evaluation {} *****".format(prefix)) 264 | logger.info(" Num examples = %d", len(eval_dataset)) 265 | logger.info(" Batch size = %d", args.eval_batch_size) 266 | eval_loss = 0.0 267 | nb_eval_steps = 0 268 | preds = None 269 | out_label_ids = None 270 | all_logits=[] 271 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 272 | model.eval() 273 | batch = tuple(t.to(args.device) for t in batch) 274 | 275 | with torch.no_grad(): 276 | inputs = {'input_ids': batch[0], 277 | 'attention_mask': batch[1], 278 | 'labels': batch[3]} 279 | if args.model_type != 'distilbert': 280 | inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 281 | 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids 282 | outputs = model(**inputs) 283 | tmp_eval_loss, logits = outputs[:2] 284 | tem_logits=logits 285 | Array_logits=np.array((tem_logits.cpu())) 286 | all_logits.append(Array_logits[0].tolist()) 287 | all_logits.append(Array_logits[1].tolist()) 288 | all_logits.append(Array_logits[2].tolist()) 289 | eval_loss += tmp_eval_loss.mean().item() 290 | nb_eval_steps += 1 291 | if preds is None: 292 | preds = logits.detach().cpu().numpy() 293 | out_label_ids = inputs['labels'].detach().cpu().numpy() 294 | else: 295 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 296 | out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) 297 | # for x in range(10,98): 298 | Q=0.9 299 | tem_logits_NEW=[] 300 | tem_list=[] 301 | all_logits=np.array(all_logits) 302 | for i in range(399): 303 | for j in range(16): 304 | tem_list.append(sigmoid(all_logits[i][j])) 305 | tem_logits_NEW.append(tem_list) 306 | tem_list=[] 307 | # tem_logits_NEW=np.array(tem_logits_NEW) 308 | # tem_logits_NEW=softmax(tem_logits_NEW) 309 | # tem_logits_NEW=tem_logits_NEW.getA().tolist() 310 | 311 | for i in range(399): 312 | for j in range(16): 313 | if tem_logits_NEW[i][j] > Q: 314 | tem_logits_NEW[i][j]=1 315 | else: tem_logits_NEW[i][j]=0 316 | tem_logits_NEW=np.array(tem_logits_NEW) 317 | count=0 318 | for i in range(399): 319 | tem_1=tem_logits_NEW[i] 320 | tem_2=out_label_ids[i] 321 | w=0 322 | k=0 323 | z=0 324 | for j in range(16): 325 | if tem_1[j]==1: 326 | w=w+1 327 | for j in range(16): 328 | if tem_2[j]==1: 329 | k=k+1 330 | for j in range(16): 331 | if tem_1[j]==1 and tem_2[j]==1: 332 | z=z+1 333 | N = z*1.0/(w+0.00000001) 334 | M = z*1.0/(k+0.00000001) 335 | if N+M == 0: 336 | H=0 337 | else: 338 | H = (2*N*M)/(N+M) 339 | count=count+H 340 | F1=count/399 341 | print("***********************************************") 342 | print(F1) 343 | print("***********************************************") 344 | 345 | # all_logits = np.array(all_logits) 346 | # eval_loss = eval_loss / nb_eval_steps 347 | # fpr = dict() 348 | # tpr = dict() 349 | # roc_auc = dict() 350 | # for i in range(16): 351 | # fpr[i], tpr[i], _ = roc_curve(out_label_ids[:, i], all_logits[:, i]) 352 | # roc_auc[i] = auc(fpr[i], tpr[i]) 353 | # fpr["micro"], tpr["micro"], _ = roc_curve(out_label_ids.ravel(), all_logits.ravel()) 354 | # roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 355 | # print(roc_auc) 356 | H = {'F1': F1} 357 | return H 358 | # if args.output_mode == "MultiLabelclassification": 359 | # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 360 | # with open(output_eval_file, "w") as writer: 361 | # logger.info("***** Eval results {} *****".format(prefix)) 362 | # # for key in sorted(result.keys()): 363 | # logger.info(" %s 
", str(eval_loss)) 364 | # writer.write("%s\n" % (str(eval_loss))) 365 | # return eval_loss 366 | # if args.output_mode == "classification": 367 | # preds = np.argmax(preds, axis=1) 368 | # elif args.output_mode == "regression": 369 | # preds = np.squeeze(preds) 370 | # result = compute_metrics(eval_task, preds, out_label_ids) 371 | # results.update(result) 372 | # 373 | # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") 374 | # with open(output_eval_file, "w") as writer: 375 | # logger.info("***** Eval results {} *****".format(prefix)) 376 | # for key in sorted(result.keys()): 377 | # logger.info(" %s = %s", key, str(result[key])) 378 | # writer.write("%s = %s\n" % (key, str(result[key]))) 379 | # 380 | # return results 381 | 382 | 383 | def load_and_cache_examples(args, task, tokenizer, evaluate=False): 384 | if args.local_rank not in [-1, 0] and not evaluate: 385 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 386 | 387 | processor = processors[task]() 388 | output_mode = output_modes[task] 389 | # Load data features from cache or dataset file 390 | cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( 391 | 'dev' if evaluate else 'train', 392 | list(filter(None, args.model_name_or_path.split('/'))).pop(), 393 | str(args.max_seq_length), 394 | str(task))) 395 | if os.path.exists(cached_features_file) and not args.overwrite_cache: 396 | logger.info("Loading features from cached file %s", cached_features_file) 397 | features = torch.load(cached_features_file) 398 | else: 399 | logger.info("Creating features from dataset file at %s", args.data_dir) 400 | label_list = processor.get_labels() 401 | if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: 402 | # HACK(label indices are swapped in RoBERTa pretrained model) 403 | label_list[1], label_list[2] = label_list[2], label_list[1] 404 | examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples( 405 | args.data_dir) 406 | features = convert_examples_to_features(examples, 407 | tokenizer, 408 | label_list=label_list, 409 | max_length=args.max_seq_length, 410 | output_mode=output_mode, 411 | pad_on_left=bool(args.model_type in ['xlnet']), 412 | # pad on the left for xlnet 413 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 414 | pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, 415 | ) 416 | if args.local_rank in [-1, 0]: 417 | logger.info("Saving features into cached file %s", cached_features_file) 418 | torch.save(features, cached_features_file) 419 | 420 | if args.local_rank == 0 and not evaluate: 421 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 422 | 423 | # Convert to Tensors and build dataset 424 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 425 | all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) 426 | all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) 427 | if output_mode == "classification": 428 | all_labels = torch.tensor([f.label for f in features], dtype=torch.long) 429 | elif output_mode == "regression": 430 | all_labels = torch.tensor([f.label for f in features], dtype=torch.float) 431 | elif output_mode == "MultiLabelclassification": 432 | all_labels = torch.tensor([f.label for f in 
features], dtype=torch.float) 433 | 434 | dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) 435 | return dataset 436 | 437 | 438 | def main(): 439 | parser = argparse.ArgumentParser() 440 | 441 | ## Required parameters 442 | parser.add_argument("--data_dir", default="/home/msqin/bert/bert1/data_mutil", type=str, 443 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 444 | parser.add_argument("--model_type", default="bert", type=str, 445 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 446 | parser.add_argument("--model_name_or_path", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/pytorch_model.bin", type=str, 447 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( 448 | ALL_MODELS)) 449 | parser.add_argument("--task_name", default="multilabel", type=str, 450 | help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) 451 | parser.add_argument("--output_dir", default="/home/msqin/bert/bert1/tmp/new_output", type=str, 452 | help="The output directory where the model predictions and checkpoints will be written.") 453 | 454 | ## Other parameters 455 | parser.add_argument("--config_name", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/bert_config.json", type=str, 456 | help="Pretrained config name or path if not the same as model_name") 457 | parser.add_argument("--tokenizer_name", default="/home/msqin/bert/bert1/uncased_L-12_H-768_A-12/vocab.txt", type=str, 458 | help="Pretrained tokenizer name or path if not the same as model_name") 459 | parser.add_argument("--cache_dir", default="", type=str, 460 | help="Where do you want to store the pre-trained models downloaded from s3") 461 | parser.add_argument("--max_seq_length", default=32, type=int, 462 | help="The maximum total input sequence length after tokenization. 
Sequences longer "
463 | "than this will be truncated, sequences shorter will be padded.")
464 | parser.add_argument("--do_train", default=True,
465 | help="Whether to run training.")
466 | parser.add_argument("--do_eval", default=True,
467 | help="Whether to run eval on the dev set.")
468 | parser.add_argument("--evaluate_during_training", default=True,
469 | help="Run evaluation during training at each logging step.")
470 | parser.add_argument("--do_lower_case", default=True,
471 | help="Set this flag if you are using an uncased model.")
472 | 
473 | parser.add_argument("--per_gpu_train_batch_size", default=32, type=int,
474 | help="Batch size per GPU/CPU for training.")
475 | parser.add_argument("--per_gpu_eval_batch_size", default=3, type=int,
476 | help="Batch size per GPU/CPU for evaluation.")
477 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
478 | help="Number of update steps to accumulate before performing a backward/update pass.")
479 | parser.add_argument("--learning_rate", default=2e-5, type=float,
480 | help="The initial learning rate for Adam.")
481 | parser.add_argument("--weight_decay", default=0.0, type=float,
482 | help="Weight decay if we apply some.")
483 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
484 | help="Epsilon for Adam optimizer.")
485 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
486 | help="Max gradient norm.")
487 | parser.add_argument("--num_train_epochs", default=20.0, type=float,
488 | help="Total number of training epochs to perform.")
489 | parser.add_argument("--max_steps", default=-1, type=int,
490 | help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
491 | parser.add_argument("--warmup_steps", default=0, type=int,
492 | help="Linear warmup over warmup_steps.")
493 | 
494 | parser.add_argument('--logging_steps', type=int, default=500,
495 | help="Log every X update steps.")
496 | parser.add_argument('--save_steps', type=int, default=500,
497 | help="Save checkpoint every X update steps.")
498 | parser.add_argument("--eval_all_checkpoints", default=True,
499 | help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
500 | parser.add_argument("--no_cuda", action='store_true',
501 | help="Avoid using CUDA when available")
502 | parser.add_argument('--overwrite_output_dir', action='store_true',
503 | help="Overwrite the content of the output directory")
504 | parser.add_argument('--overwrite_cache', action='store_true',
505 | help="Overwrite the cached training and evaluation sets")
506 | parser.add_argument('--seed', type=int, default=42,
507 | help="Random seed for initialization")
508 | 
509 | parser.add_argument('--tpu', action='store_true',
510 | help="Whether to run on the TPU defined in the environment variables")
511 | parser.add_argument('--tpu_ip_address', type=str, default='',
512 | help="TPU IP address if none are set in the environment variables")
513 | parser.add_argument('--tpu_name', type=str, default='',
514 | help="TPU name if none are set in the environment variables")
515 | parser.add_argument('--xrt_tpu_config', type=str, default='',
516 | help="XRT TPU config if none are set in the environment variables")
517 | 
518 | parser.add_argument('--fp16', action='store_true',
519 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
520 | parser.add_argument('--fp16_opt_level', type=str, default='O1',
521 | help="For fp16: Apex AMP optimization 
level selected in ['O0', 'O1', 'O2', and 'O3']." 522 | "See details at https://nvidia.github.io/apex/amp.html") 523 | parser.add_argument("--local_rank", type=int, default=-1, 524 | help="For distributed training: local_rank") 525 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 526 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 527 | args = parser.parse_args() 528 | 529 | if os.path.exists(args.output_dir) and os.listdir( 530 | args.output_dir) and args.do_train and not args.overwrite_output_dir: 531 | raise ValueError( 532 | "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( 533 | args.output_dir)) 534 | 535 | # Setup distant debugging if needed 536 | if args.server_ip and args.server_port: 537 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 538 | import ptvsd 539 | print("Waiting for debugger attach") 540 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 541 | ptvsd.wait_for_attach() 542 | 543 | # Setup CUDA, GPU & distributed training 544 | if args.local_rank == -1 or args.no_cuda: 545 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 546 | args.n_gpu = torch.cuda.device_count() 547 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 548 | torch.cuda.set_device(args.local_rank) 549 | device = torch.device("cuda", args.local_rank) 550 | torch.distributed.init_process_group(backend='nccl') 551 | args.n_gpu = 1 552 | args.device = device 553 | 554 | if args.tpu: 555 | if args.tpu_ip_address: 556 | os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address 557 | if args.tpu_name: 558 | os.environ["TPU_NAME"] = args.tpu_name 559 | if args.xrt_tpu_config: 560 | os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config 561 | 562 | assert "TPU_IP_ADDRESS" in os.environ 563 | assert "TPU_NAME" in os.environ 564 | assert "XRT_TPU_CONFIG" in os.environ 565 | 566 | import torch_xla 567 | import torch_xla.core.xla_model as xm 568 | args.device = xm.xla_device() 569 | args.xla_model = xm 570 | 571 | # Setup logging 572 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 573 | datefmt='%m/%d/%Y %H:%M:%S', 574 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 575 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 576 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 577 | 578 | # Set seed 579 | set_seed(args) 580 | 581 | # Prepare GLUE task 582 | args.task_name = args.task_name.lower() 583 | if args.task_name not in processors: 584 | raise ValueError("Task not found: %s" % (args.task_name)) 585 | processor = processors[args.task_name]() 586 | args.output_mode = output_modes[args.task_name] 587 | label_list = processor.get_labels() 588 | num_labels = len(label_list) 589 | 590 | # Load pretrained model and tokenizer 591 | if args.local_rank not in [-1, 0]: 592 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 593 | 594 | args.model_type = args.model_type.lower() 595 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 596 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 597 | num_labels=num_labels, 
finetuning_task=args.task_name) 598 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 599 | do_lower_case=args.do_lower_case) 600 | model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), 601 | config=config) 602 | 603 | if args.local_rank == 0: 604 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 605 | 606 | model.to(args.device) 607 | 608 | logger.info("Training/evaluation parameters %s", args) 609 | 610 | # Training 611 | if args.do_train: 612 | train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) 613 | global_step, tr_loss = train(args, train_dataset, model, tokenizer) 614 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 615 | 616 | # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() 617 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu: 618 | # Create output directory if needed 619 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 620 | os.makedirs(args.output_dir) 621 | 622 | logger.info("Saving model checkpoint to %s", args.output_dir) 623 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 624 | # They can then be reloaded using `from_pretrained()` 625 | model_to_save = model.module if hasattr(model, 626 | 'module') else model # Take care of distributed/parallel training 627 | model_to_save.save_pretrained(args.output_dir) 628 | tokenizer.save_pretrained(args.output_dir) 629 | 630 | # Good practice: save your training arguments together with the trained model 631 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) 632 | 633 | # Load a trained model and vocabulary that you have fine-tuned 634 | model = model_class.from_pretrained(args.output_dir) 635 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 636 | model.to(args.device) 637 | 638 | # Evaluation 639 | results = {} 640 | if args.do_eval and args.local_rank in [-1, 0]: 641 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 642 | checkpoints = [args.output_dir] 643 | if args.eval_all_checkpoints: 644 | checkpoints = list( 645 | os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) 646 | logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging 647 | logger.info("Evaluate the following checkpoints: %s", checkpoints) 648 | for checkpoint in checkpoints: 649 | # checkpoint="/home/msqin/bert/bert1/tmp/My_output" 650 | global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" 651 | prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" 652 | 653 | model = model_class.from_pretrained(checkpoint) 654 | model.to(args.device) 655 | result = evaluate(args, model, tokenizer, prefix=prefix) 656 | result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) 657 | results.update(result) 658 | print(results) 659 | return results 660 | 661 | 662 | if __name__ == "__main__": 663 | main() -------------------------------------------------------------------------------- /modeling_utils1.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 
Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import copy 22 | import json 23 | import logging 24 | import os 25 | from io import open 26 | 27 | import six 28 | import torch 29 | from torch import nn 30 | from torch.nn import CrossEntropyLoss 31 | from torch.nn import functional as F 32 | 33 | from configuration_utils1 import PretrainedConfig 34 | from file_utils1 import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | 39 | try: 40 | from torch.nn import Identity 41 | except ImportError: 42 | # Older PyTorch compatibility 43 | class Identity(nn.Module): 44 | r"""A placeholder identity operator that is argument-insensitive. 45 | """ 46 | def __init__(self, *args, **kwargs): 47 | super(Identity, self).__init__() 48 | 49 | def forward(self, input): 50 | return input 51 | 52 | class PreTrainedModel(nn.Module): 53 | r""" Base class for all models. 54 | 55 | :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models 56 | as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. 57 | 58 | Class attributes (overridden by derived classes): 59 | - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. 60 | - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. 61 | - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: 62 | 63 | - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, 64 | - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, 65 | - ``path``: a path (string) to the TensorFlow checkpoint. 66 | 67 | - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. 68 | """ 69 | config_class = None 70 | pretrained_model_archive_map = {} 71 | load_tf_weights = lambda model, config, path: None 72 | base_model_prefix = "" 73 | 74 | def __init__(self, config, *inputs, **kwargs): 75 | super(PreTrainedModel, self).__init__() 76 | if not isinstance(config, PretrainedConfig): 77 | raise ValueError( 78 | "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. 
" 79 | "To create a model from a pretrained model use " 80 | "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( 81 | self.__class__.__name__, self.__class__.__name__ 82 | )) 83 | # Save config in model 84 | self.config = config 85 | 86 | def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): 87 | """ Build a resized Embedding Module from a provided token Embedding Module. 88 | Increasing the size will add newly initialized vectors at the end 89 | Reducing the size will remove vectors from the end 90 | 91 | Args: 92 | new_num_tokens: (`optional`) int 93 | New number of tokens in the embedding matrix. 94 | Increasing the size will add newly initialized vectors at the end 95 | Reducing the size will remove vectors from the end 96 | If not provided or None: return the provided token Embedding Module. 97 | Return: ``torch.nn.Embeddings`` 98 | Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None 99 | """ 100 | if new_num_tokens is None: 101 | return old_embeddings 102 | 103 | old_num_tokens, old_embedding_dim = old_embeddings.weight.size() 104 | if old_num_tokens == new_num_tokens: 105 | return old_embeddings 106 | 107 | # Build new embeddings 108 | new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) 109 | new_embeddings.to(old_embeddings.weight.device) 110 | 111 | # initialize all new embeddings (in particular added tokens) 112 | self._init_weights(new_embeddings) 113 | 114 | # Copy word embeddings from the previous weights 115 | num_tokens_to_copy = min(old_num_tokens, new_num_tokens) 116 | new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] 117 | 118 | return new_embeddings 119 | 120 | def _tie_or_clone_weights(self, first_module, second_module): 121 | """ Tie or clone module weights depending of weither we are using TorchScript or not 122 | """ 123 | if self.config.torchscript: 124 | first_module.weight = nn.Parameter(second_module.weight.clone()) 125 | else: 126 | first_module.weight = second_module.weight 127 | 128 | if hasattr(first_module, 'bias') and first_module.bias is not None: 129 | first_module.bias.data = torch.nn.functional.pad( 130 | first_module.bias.data, 131 | (0, first_module.weight.shape[0] - first_module.bias.shape[0]), 132 | 'constant', 133 | 0 134 | ) 135 | 136 | def resize_token_embeddings(self, new_num_tokens=None): 137 | """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. 138 | Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. 139 | 140 | Arguments: 141 | 142 | new_num_tokens: (`optional`) int: 143 | New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. 144 | If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. 
145 | 146 | Return: ``torch.nn.Embeddings`` 147 | Pointer to the input tokens Embeddings Module of the model 148 | """ 149 | base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed 150 | model_embeds = base_model._resize_token_embeddings(new_num_tokens) 151 | if new_num_tokens is None: 152 | return model_embeds 153 | 154 | # Update base model and current model config 155 | self.config.vocab_size = new_num_tokens 156 | base_model.vocab_size = new_num_tokens 157 | 158 | # Tie weights again if needed 159 | if hasattr(self, 'tie_weights'): 160 | self.tie_weights() 161 | 162 | return model_embeds 163 | 164 | def init_weights(self): 165 | """ Initialize and prunes weights if needed. """ 166 | # Initialize weights 167 | self.apply(self._init_weights) 168 | 169 | # Prune heads if needed 170 | if self.config.pruned_heads: 171 | self.prune_heads(self.config.pruned_heads) 172 | 173 | def prune_heads(self, heads_to_prune): 174 | """ Prunes heads of the base model. 175 | 176 | Arguments: 177 | 178 | heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). 179 | E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. 180 | """ 181 | base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed 182 | 183 | # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads 184 | for layer, heads in heads_to_prune.items(): 185 | union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) 186 | self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON 187 | 188 | base_model._prune_heads(heads_to_prune) 189 | 190 | def save_pretrained(self, save_directory): 191 | """ Save a model and its configuration file to a directory, so that it 192 | can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. 193 | """ 194 | assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" 195 | 196 | # Only save the model it-self if we are using distributed training 197 | model_to_save = self.module if hasattr(self, 'module') else self 198 | 199 | # Save configuration file 200 | model_to_save.config.save_pretrained(save_directory) 201 | 202 | # If we save using the predefined names, we can load using `from_pretrained` 203 | output_model_file = os.path.join(save_directory, WEIGHTS_NAME) 204 | torch.save(model_to_save.state_dict(), output_model_file) 205 | logger.info("Model weights saved in {}".format(output_model_file)) 206 | 207 | @classmethod 208 | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): 209 | r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. 210 | 211 | The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) 212 | To train the model, you should first set it back in training mode with ``model.train()`` 213 | 214 | The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. 215 | It is up to you to train those weights with a downstream fine-tuning task. 216 | 217 | The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. 
218 | 219 | Parameters: 220 | pretrained_model_name_or_path: either: 221 | 222 | - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 223 | - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 224 | - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 225 | - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) 226 | 227 | model_args: (`optional`) Sequence of positional arguments: 228 | All remaning positional arguments will be passed to the underlying model's ``__init__`` method 229 | 230 | config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: 231 | Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: 232 | 233 | - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or 234 | - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 235 | - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. 236 | 237 | state_dict: (`optional`) dict: 238 | an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 239 | This option can be used if you want to create a model from a pretrained configuration but load your own weights. 240 | In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. 241 | 242 | cache_dir: (`optional`) string: 243 | Path to a directory in which a downloaded pre-trained model 244 | configuration should be cached if the standard cache should not be used. 245 | 246 | force_download: (`optional`) boolean, default False: 247 | Force to (re-)download the model weights and configuration files and override the cached versions if they exists. 248 | 249 | proxies: (`optional`) dict, default None: 250 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 251 | The proxies are used on each request. 252 | 253 | output_loading_info: (`optional`) boolean: 254 | Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. 255 | 256 | kwargs: (`optional`) Remaining dictionary of keyword arguments: 257 | Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: 258 | 259 | - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) 260 | - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 261 | 262 | Examples:: 263 | 264 | model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. 265 | model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 266 | model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading 267 | assert model.config.output_attention == True 268 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 269 | config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') 270 | model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) 271 | 272 | """ 273 | config = kwargs.pop('config', None) 274 | state_dict = kwargs.pop('state_dict', None) 275 | cache_dir = kwargs.pop('cache_dir', None) 276 | from_tf = kwargs.pop('from_tf', False) 277 | force_download = kwargs.pop('force_download', False) 278 | proxies = kwargs.pop('proxies', None) 279 | output_loading_info = kwargs.pop('output_loading_info', False) 280 | 281 | # Load config 282 | if config is None: 283 | config, model_kwargs = cls.config_class.from_pretrained( 284 | pretrained_model_name_or_path, *model_args, 285 | cache_dir=cache_dir, return_unused_kwargs=True, 286 | force_download=force_download, 287 | **kwargs 288 | ) 289 | else: 290 | model_kwargs = kwargs 291 | 292 | # Load model 293 | if pretrained_model_name_or_path is not None: 294 | if pretrained_model_name_or_path in cls.pretrained_model_archive_map: 295 | archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] 296 | elif os.path.isdir(pretrained_model_name_or_path): 297 | if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): 298 | # Load from a TF 1.0 checkpoint 299 | archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") 300 | elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): 301 | # Load from a TF 2.0 checkpoint 302 | archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) 303 | elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): 304 | # Load from a PyTorch checkpoint 305 | archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) 306 | else: 307 | raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( 308 | [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], 309 | pretrained_model_name_or_path)) 310 | elif os.path.isfile(pretrained_model_name_or_path): 311 | archive_file = pretrained_model_name_or_path 312 | else: 313 | assert from_tf, "Error finding file {}, no file or TF 
1.X checkpoint found".format(pretrained_model_name_or_path) 314 | archive_file = pretrained_model_name_or_path + ".index" 315 | 316 | # redirect to the cache, if necessary 317 | try: 318 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 319 | except EnvironmentError: 320 | if pretrained_model_name_or_path in cls.pretrained_model_archive_map: 321 | msg = "Couldn't reach server at '{}' to download pretrained weights.".format( 322 | archive_file) 323 | else: 324 | msg = "Model name '{}' was not found in model name list ({}). " \ 325 | "We assumed '{}' was a path or url to model weight files named one of {} but " \ 326 | "couldn't find any such file at this path or url.".format( 327 | pretrained_model_name_or_path, 328 | ', '.join(cls.pretrained_model_archive_map.keys()), 329 | archive_file, 330 | [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME]) 331 | raise EnvironmentError(msg) 332 | 333 | if resolved_archive_file == archive_file: 334 | logger.info("loading weights file {}".format(archive_file)) 335 | else: 336 | logger.info("loading weights file {} from cache at {}".format( 337 | archive_file, resolved_archive_file)) 338 | else: 339 | resolved_archive_file = None 340 | 341 | # Instantiate model. 342 | model = cls(config, *model_args, **model_kwargs) 343 | 344 | if state_dict is None and not from_tf: 345 | state_dict = torch.load(resolved_archive_file, map_location='cpu') 346 | 347 | missing_keys = [] 348 | unexpected_keys = [] 349 | error_msgs = [] 350 | 351 | if from_tf: 352 | if resolved_archive_file.endswith('.index'): 353 | # Load from a TensorFlow 1.X checkpoint - provided by original authors 354 | model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' 355 | else: 356 | # Load from our TensorFlow 2.0 checkpoints 357 | try: 358 | from transformers import load_tf2_checkpoint_in_pytorch_model 359 | model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) 360 | except ImportError as e: 361 | logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. 
Please see " 362 | "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") 363 | raise e 364 | else: 365 | # Convert old format to new format if needed from a PyTorch state_dict 366 | old_keys = [] 367 | new_keys = [] 368 | for key in state_dict.keys(): 369 | new_key = None 370 | if 'gamma' in key: 371 | new_key = key.replace('gamma', 'weight') 372 | if 'beta' in key: 373 | new_key = key.replace('beta', 'bias') 374 | if new_key: 375 | old_keys.append(key) 376 | new_keys.append(new_key) 377 | for old_key, new_key in zip(old_keys, new_keys): 378 | state_dict[new_key] = state_dict.pop(old_key) 379 | 380 | # copy state_dict so _load_from_state_dict can modify it 381 | metadata = getattr(state_dict, '_metadata', None) 382 | state_dict = state_dict.copy() 383 | if metadata is not None: 384 | state_dict._metadata = metadata 385 | 386 | def load(module, prefix=''): 387 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) 388 | module._load_from_state_dict( 389 | state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) 390 | for name, child in module._modules.items(): 391 | if child is not None: 392 | load(child, prefix + name + '.') 393 | 394 | # Make sure we are able to load base models as well as derived models (with heads) 395 | start_prefix = '' 396 | model_to_load = model 397 | if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): 398 | start_prefix = cls.base_model_prefix + '.' 399 | if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): 400 | model_to_load = getattr(model, cls.base_model_prefix) 401 | 402 | load(model_to_load, prefix=start_prefix) 403 | if len(missing_keys) > 0: 404 | logger.info("Weights of {} not initialized from pretrained model: {}".format( 405 | model.__class__.__name__, missing_keys)) 406 | if len(unexpected_keys) > 0: 407 | logger.info("Weights from pretrained model not used in {}: {}".format( 408 | model.__class__.__name__, unexpected_keys)) 409 | if len(error_msgs) > 0: 410 | raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( 411 | model.__class__.__name__, "\n\t".join(error_msgs))) 412 | 413 | if hasattr(model, 'tie_weights'): 414 | model.tie_weights() # make sure word embedding weights are still tied 415 | 416 | # Set model in evaluation mode to desactivate DropOut modules by default 417 | model.eval() 418 | 419 | if output_loading_info: 420 | loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} 421 | return model, loading_info 422 | 423 | return model 424 | 425 | 426 | class Conv1D(nn.Module): 427 | def __init__(self, nf, nx): 428 | """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) 429 | Basically works like a Linear layer but the weights are transposed 430 | """ 431 | super(Conv1D, self).__init__() 432 | self.nf = nf 433 | w = torch.empty(nx, nf) 434 | nn.init.normal_(w, std=0.02) 435 | self.weight = nn.Parameter(w) 436 | self.bias = nn.Parameter(torch.zeros(nf)) 437 | 438 | def forward(self, x): 439 | size_out = x.size()[:-1] + (self.nf,) 440 | x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) 441 | x = x.view(*size_out) 442 | return x 443 | 444 | 445 | class PoolerStartLogits(nn.Module): 446 | """ Compute SQuAD start_logits from sequence hidden states. 
""" 447 | def __init__(self, config): 448 | super(PoolerStartLogits, self).__init__() 449 | self.dense = nn.Linear(config.hidden_size, 1) 450 | 451 | def forward(self, hidden_states, p_mask=None): 452 | """ Args: 453 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` 454 | invalid position mask such as query and special symbols (PAD, SEP, CLS) 455 | 1.0 means token should be masked. 456 | """ 457 | x = self.dense(hidden_states).squeeze(-1) 458 | 459 | if p_mask is not None: 460 | if next(self.parameters()).dtype == torch.float16: 461 | x = x * (1 - p_mask) - 65500 * p_mask 462 | else: 463 | x = x * (1 - p_mask) - 1e30 * p_mask 464 | 465 | return x 466 | 467 | 468 | class PoolerEndLogits(nn.Module): 469 | """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. 470 | """ 471 | def __init__(self, config): 472 | super(PoolerEndLogits, self).__init__() 473 | self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) 474 | self.activation = nn.Tanh() 475 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 476 | self.dense_1 = nn.Linear(config.hidden_size, 1) 477 | 478 | def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): 479 | """ Args: 480 | One of ``start_states``, ``start_positions`` should be not None. 481 | If both are set, ``start_positions`` overrides ``start_states``. 482 | 483 | **start_states**: ``torch.LongTensor`` of shape identical to hidden_states 484 | hidden states of the first tokens for the labeled span. 485 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 486 | position of the first token for the labeled span: 487 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` 488 | Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 489 | 1.0 means token should be masked. 490 | """ 491 | assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" 492 | if start_positions is not None: 493 | slen, hsz = hidden_states.shape[-2:] 494 | start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) 495 | start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) 496 | start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) 497 | 498 | x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) 499 | x = self.activation(x) 500 | x = self.LayerNorm(x) 501 | x = self.dense_1(x).squeeze(-1) 502 | 503 | if p_mask is not None: 504 | if next(self.parameters()).dtype == torch.float16: 505 | x = x * (1 - p_mask) - 65500 * p_mask 506 | else: 507 | x = x * (1 - p_mask) - 1e30 * p_mask 508 | 509 | return x 510 | 511 | 512 | class PoolerAnswerClass(nn.Module): 513 | """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ 514 | def __init__(self, config): 515 | super(PoolerAnswerClass, self).__init__() 516 | self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) 517 | self.activation = nn.Tanh() 518 | self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) 519 | 520 | def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): 521 | """ 522 | Args: 523 | One of ``start_states``, ``start_positions`` should be not None. 524 | If both are set, ``start_positions`` overrides ``start_states``. 525 | 526 | **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. 
527 | hidden states of the first tokens for the labeled span. 528 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 529 | position of the first token for the labeled span. 530 | **cls_index**: torch.LongTensor of shape ``(batch_size,)`` 531 | position of the CLS token. If None, take the last token. 532 | 533 | note(Original repo): 534 | no dependency on end_feature so that we can obtain one single `cls_logits` 535 | for each sample 536 | """ 537 | hsz = hidden_states.shape[-1] 538 | assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" 539 | if start_positions is not None: 540 | start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) 541 | start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) 542 | 543 | if cls_index is not None: 544 | cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) 545 | cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) 546 | else: 547 | cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) 548 | 549 | x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) 550 | x = self.activation(x) 551 | x = self.dense_1(x).squeeze(-1) 552 | 553 | return x 554 | 555 | 556 | class SQuADHead(nn.Module): 557 | r""" A SQuAD head inspired by XLNet. 558 | 559 | Parameters: 560 | config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. 561 | 562 | Inputs: 563 | **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` 564 | hidden states of sequence tokens 565 | **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 566 | position of the first token for the labeled span. 567 | **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` 568 | position of the last token for the labeled span. 569 | **cls_index**: torch.LongTensor of shape ``(batch_size,)`` 570 | position of the CLS token. If None, take the last token. 571 | **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` 572 | Whether the question has a possible answer in the paragraph or not. 573 | **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` 574 | Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 575 | 1.0 means token should be masked. 576 | 577 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 578 | **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: 579 | Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. 580 | **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 581 | ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` 582 | Log probabilities for the top config.start_n_top start token possibilities (beam-search). 583 | **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 584 | ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` 585 | Indices for the top config.start_n_top start token possibilities (beam-search). 
586 | **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 587 | ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` 588 | Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 589 | **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 590 | ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` 591 | Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 592 | **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) 593 | ``torch.FloatTensor`` of shape ``(batch_size,)`` 594 | Log probabilities for the ``is_impossible`` label of the answers. 595 | """ 596 | def __init__(self, config): 597 | super(SQuADHead, self).__init__() 598 | self.start_n_top = config.start_n_top 599 | self.end_n_top = config.end_n_top 600 | 601 | self.start_logits = PoolerStartLogits(config) 602 | self.end_logits = PoolerEndLogits(config) 603 | self.answer_class = PoolerAnswerClass(config) 604 | 605 | def forward(self, hidden_states, start_positions=None, end_positions=None, 606 | cls_index=None, is_impossible=None, p_mask=None): 607 | outputs = () 608 | 609 | start_logits = self.start_logits(hidden_states, p_mask=p_mask) 610 | 611 | if start_positions is not None and end_positions is not None: 612 | # If we are on multi-GPU, let's remove the dimension added by batch splitting 613 | for x in (start_positions, end_positions, cls_index, is_impossible): 614 | if x is not None and x.dim() > 1: 615 | x.squeeze_(-1) 616 | 617 | # during training, compute the end logits based on the ground truth of the start position 618 | end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) 619 | 620 | loss_fct = CrossEntropyLoss() 621 | start_loss = loss_fct(start_logits, start_positions) 622 | end_loss = loss_fct(end_logits, end_positions) 623 | total_loss = (start_loss + end_loss) / 2 624 | 625 | if cls_index is not None and is_impossible is not None: 626 | # Predict answerability from the representation of CLS and START 627 | cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) 628 | loss_fct_cls = nn.BCEWithLogitsLoss() 629 | cls_loss = loss_fct_cls(cls_logits, is_impossible) 630 | 631 | # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss 632 | total_loss += cls_loss * 0.5 633 | 634 | outputs = (total_loss,) + outputs 635 | 636 | else: 637 | # during inference, compute the end logits based on beam search 638 | bsz, slen, hsz = hidden_states.size() 639 | start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) 640 | 641 | start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) 642 | start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) 643 | start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) 644 | start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) 645 | 646 | hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) 647 | p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None 648 | end_logits = 
self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
649 |             end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
650 | 
651 |             end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
652 |             end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
653 |             end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
654 | 
655 |             start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
656 |             cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
657 | 
658 |             outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
659 | 
660 |         # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
661 |         # or (if labels are provided) (total_loss,)
662 |         return outputs
663 | 
664 | 
665 | class SequenceSummary(nn.Module):
666 |     r""" Compute a single vector summary of a sequence's hidden states according to various possibilities:
667 |         Args of the config class:
668 |             summary_type:
669 |                 - 'last' => [default] take the last token hidden state (like XLNet)
670 |                 - 'first' => take the first token hidden state (like Bert)
671 |                 - 'mean' => take the mean of all tokens hidden states
672 |                 - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
673 |                 - 'attn' => Not implemented now, use multi-head attention
674 |             summary_use_proj: Add a projection after the vector extraction
675 |             summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
676 |             summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default: no activation.
677 |             summary_first_dropout: Add a dropout before the projection and activation
678 |             summary_last_dropout: Add a dropout after the projection and activation
679 |     """
680 |     def __init__(self, config):
681 |         super(SequenceSummary, self).__init__()
682 | 
683 |         self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
684 |         if self.summary_type == 'attn':
685 |             # We should use a standard multi-head attention module with absolute positional embedding for that.
686 |             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
687 |             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
688 |             raise NotImplementedError
689 | 
690 |         self.summary = Identity()
691 |         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
692 |             if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
693 |                 num_classes = config.num_labels
694 |             else:
695 |                 num_classes = config.hidden_size
696 |             self.summary = nn.Linear(config.hidden_size, num_classes)
697 | 
698 |         self.activation = Identity()
699 |         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
700 |             self.activation = nn.Tanh()
701 | 
702 |         self.first_dropout = Identity()
703 |         if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
704 |             self.first_dropout = nn.Dropout(config.summary_first_dropout)
705 | 
706 |         self.last_dropout = Identity()
707 |         if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
708 |             self.last_dropout = nn.Dropout(config.summary_last_dropout)
709 | 
710 |     def forward(self, hidden_states, cls_index=None):
711 |         """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
712 |             cls_index: [optional] position of the classification token if summary_type == 'cls_index',
713 |                 shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
714 |                 if summary_type == 'cls_index' and cls_index is None:
715 |                     we take the last token of the sequence as classification token
716 |         """
717 |         if self.summary_type == 'last':
718 |             output = hidden_states[:, -1]
719 |         elif self.summary_type == 'first':
720 |             output = hidden_states[:, 0]
721 |         elif self.summary_type == 'mean':
722 |             output = hidden_states.mean(dim=1)
723 |         elif self.summary_type == 'cls_index':
724 |             if cls_index is None:
725 |                 cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
726 |             else:
727 |                 cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
728 |                 cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
729 |             # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
730 |             output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
731 |         elif self.summary_type == 'attn':
732 |             raise NotImplementedError
733 | 
734 |         output = self.first_dropout(output)
735 |         output = self.summary(output)
736 |         output = self.activation(output)
737 |         output = self.last_dropout(output)
738 | 
739 |         return output
740 | 
741 | 
742 | def prune_linear_layer(layer, index, dim=0):
743 |     """ Prune a linear layer (a model parameter) to keep only entries in index.
744 |         Return the pruned layer as a new layer with requires_grad=True.
745 |         Used to remove heads.
746 |     """
747 |     index = index.to(layer.weight.device)
748 |     W = layer.weight.index_select(dim, index).clone().detach()
749 |     if layer.bias is not None:
750 |         if dim == 1:
751 |             b = layer.bias.clone().detach()
752 |         else:
753 |             b = layer.bias[index].clone().detach()
754 |     new_size = list(layer.weight.size())
755 |     new_size[dim] = len(index)
756 |     new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
757 |     new_layer.weight.requires_grad = False
758 |     new_layer.weight.copy_(W.contiguous())
759 |     new_layer.weight.requires_grad = True
760 |     if layer.bias is not None:
761 |         new_layer.bias.requires_grad = False
762 |         new_layer.bias.copy_(b.contiguous())
763 |         new_layer.bias.requires_grad = True
764 |     return new_layer
765 | 
766 | 
767 | def prune_conv1d_layer(layer, index, dim=1):
768 |     """ Prune a Conv1D layer (a model parameter) to keep only entries in index.
769 |         A Conv1D works like a Linear layer (see e.g. BERT) but the weights are transposed.
770 |         Return the pruned layer as a new layer with requires_grad=True.
771 |         Used to remove heads.
772 |     """
773 |     index = index.to(layer.weight.device)
774 |     W = layer.weight.index_select(dim, index).clone().detach()
775 |     if dim == 0:
776 |         b = layer.bias.clone().detach()
777 |     else:
778 |         b = layer.bias[index].clone().detach()
779 |     new_size = list(layer.weight.size())
780 |     new_size[dim] = len(index)
781 |     new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
782 |     new_layer.weight.requires_grad = False
783 |     new_layer.weight.copy_(W.contiguous())
784 |     new_layer.weight.requires_grad = True
785 |     new_layer.bias.requires_grad = False
786 |     new_layer.bias.copy_(b.contiguous())
787 |     new_layer.bias.requires_grad = True
788 |     return new_layer
789 | 
790 | 
791 | def prune_layer(layer, index, dim=None):
792 |     """ Prune a Conv1D or nn.Linear layer (a model parameter) to keep only entries in index.
793 |         Return the pruned layer as a new layer with requires_grad=True.
794 |         Used to remove heads.
795 |     """
796 |     if isinstance(layer, nn.Linear):
797 |         return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
798 |     elif isinstance(layer, Conv1D):
799 |         return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
800 |     else:
801 |         raise ValueError("Can't prune layer of class {}".format(layer.__class__))
802 | 
--------------------------------------------------------------------------------
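Usage note (added for illustration; not part of the original repository): the sketch below shows how the SequenceSummary head and the prune_linear_layer helper defined above can be exercised on their own, assuming modeling_utils1.py and the modules it imports are on the Python path. The SimpleNamespace config and every value in it are hypothetical stand-ins; only the attribute names that the hasattr() checks in SequenceSummary.__init__ probe are taken from the code above.

import torch
from types import SimpleNamespace

from modeling_utils1 import SequenceSummary, prune_linear_layer

# Hypothetical stand-in for a model config; only the attributes that
# SequenceSummary.__init__ checks with hasattr() are provided here.
config = SimpleNamespace(
    hidden_size=768,
    num_labels=16,                # e.g. one logit per classification label
    summary_type='first',         # use the first ([CLS]) token's hidden state
    summary_use_proj=True,        # add a projection after extraction
    summary_proj_to_labels=True,  # project to num_labels instead of hidden_size
    summary_activation='tanh',
    summary_first_dropout=0.1,
    summary_last_dropout=0.0,
)

summary = SequenceSummary(config)
hidden_states = torch.randn(2, 10, config.hidden_size)  # (bsz, seq_len, hidden_size)
logits = summary(hidden_states)
print(logits.shape)  # torch.Size([2, 16])

# prune_linear_layer returns a *new* nn.Linear that keeps only the selected
# output rows (dim=0); the original layer is left untouched.
layer = torch.nn.Linear(config.hidden_size, config.num_labels)
keep = torch.tensor([0, 3, 7, 9])
pruned = prune_linear_layer(layer, keep, dim=0)
print(pruned.weight.shape)  # torch.Size([4, 768])

Because prune_linear_layer copies the selected weights into a freshly constructed layer, it can be applied after pretrained weights are loaded; the same pattern applies to prune_conv1d_layer for GPT-2-style Conv1D layers.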