├── data
│   ├── requirements.txt
│   ├── read_me.md
│   └── code
│       ├── models
│       │   ├── __pycache__
│       │   │   └── nezha.cpython-37.pyc
│       │   └── nezha.py
│       ├── util
│       │   ├── others
│       │   │   ├── __pycache__
│       │   │   │   ├── hanzi.cpython-37.pyc
│       │   │   │   └── label2id.cpython-37.pyc
│       │   │   ├── label2id.py
│       │   │   └── hanzi.py
│       │   ├── tools
│       │   │   ├── __pycache__
│       │   │   │   ├── predict_tools.cpython-37.pyc
│       │   │   │   └── finetune_tools.cpython-37.pyc
│       │   │   ├── predict_tools.py
│       │   │   └── finetune_tools.py
│       │   ├── pretrain_utils
│       │   │   ├── __pycache__
│       │   │   │   ├── trainer.cpython-37.pyc
│       │   │   │   └── trainer_args.cpython-37.pyc
│       │   │   └── trainer_args.py
│       │   └── modeling
│       │       └── modeling_nezha
│       │           ├── __pycache__
│       │           │   ├── modeling.cpython-37.pyc
│       │           │   └── configuration.cpython-37.pyc
│       │           ├── configuration.py
│       │           └── modeling.py
│       ├── fusion_code
│       │   └── run_fusion.py
│       ├── predict_code
│       │   └── run_predictor.py
│       ├── build_vocab
│       │   └── build_vocab.py
│       ├── process_data
│       │   └── process_data.py
│       ├── finetune_code
│       │   └── run_classify.py
│       └── pretrain_code
│           └── run_pretrain.py
├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── daguancup_end2end.iml
├── .gitattributes
└── READ_ME.md

/data/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.7.1
2 | transformers==4.3.0.rc1
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
--------------------------------------------------------------------------------
/data/read_me.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/read_me.md
--------------------------------------------------------------------------------
/data/code/models/__pycache__/nezha.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/models/__pycache__/nezha.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/others/__pycache__/hanzi.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/others/__pycache__/hanzi.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/others/__pycache__/label2id.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/others/__pycache__/label2id.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/tools/__pycache__/predict_tools.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/tools/__pycache__/predict_tools.cpython-37.pyc
-------------------------------------------------------------------------------- /data/code/util/tools/__pycache__/finetune_tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/tools/__pycache__/finetune_tools.cpython-37.pyc -------------------------------------------------------------------------------- /data/code/util/pretrain_utils/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/pretrain_utils/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /data/code/util/pretrain_utils/__pycache__/trainer_args.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/pretrain_utils/__pycache__/trainer_args.cpython-37.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/modeling/modeling_nezha/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/__pycache__/configuration.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/modeling/modeling_nezha/__pycache__/configuration.cpython-37.pyc -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/daguancup_end2end.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /data/code/fusion_code/run_fusion.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import sys 5 | import csv 6 | import numpy as np 7 | import pandas as pd 8 | 9 | sys.path.append('../../../data') 10 | from argparse import ArgumentParser 11 | from data.code.util.others.label2id import id2label 12 | 13 | 14 | def fusion(args): 15 | k, predictions = 0, 0 16 | 17 | tmp = pd.read_csv(os.path.join(args.result_path, 'output_result', 
'full_logit.csv')) 18 | tmp = tmp.values 19 | predictions += tmp 20 | predictions = np.argmax(predictions, axis=-1) 21 | result = [] 22 | for i in predictions: 23 | result.append((k, id2label[str(i)])) 24 | k += 1 25 | write2tsv(args.submit_path, result) 26 | 27 | 28 | def write2tsv(output_path, data): 29 | with open(output_path, 'w', newline='') as f: 30 | tsv_w = csv.writer(f, delimiter=',') 31 | tsv_w.writerow(['id', 'label']) 32 | tsv_w.writerows(data) 33 | 34 | 35 | def main(): 36 | parser = ArgumentParser() 37 | parser.add_argument('--result_path', type=str, default="../../user_data") 38 | parser.add_argument('--submit_path', type=str, default=f'../../prediction_result/result.csv') 39 | 40 | args = parser.parse_args() 41 | 42 | fusion(args) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /data/code/util/others/label2id.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | label2id = { 4 | '1-1': 0, 5 | '1-10': 1, 6 | '1-4': 2, 7 | '1-9': 3, 8 | '10-26': 4, 9 | '2-11': 5, 10 | '2-14': 6, 11 | '2-17': 7, 12 | '2-2': 8, 13 | '2-25': 9, 14 | '2-3': 10, 15 | '2-33': 11, 16 | '2-6': 12, 17 | '3-5': 13, 18 | '4-7': 14, 19 | '5-12': 15, 20 | '5-22': 16, 21 | '5-24': 17, 22 | '5-30': 18, 23 | '5-35': 19, 24 | '6-13': 20, 25 | '6-15': 21, 26 | '6-19': 22, 27 | '6-20': 23, 28 | '6-21': 24, 29 | '6-28': 25, 30 | '6-29': 26, 31 | '6-31': 27, 32 | '6-32': 28, 33 | '6-34': 29, 34 | '6-8': 30, 35 | '7-16': 31, 36 | '8-18': 32, 37 | '8-27': 33, 38 | '9-23': 34 39 | } 40 | 41 | id2label = { 42 | '0': '1-1', 43 | '1': '1-10', 44 | '2': '1-4', 45 | '3': '1-9', 46 | '4': '10-26', 47 | '5': '2-11', 48 | '6': '2-14', 49 | '7': '2-17', 50 | '8': '2-2', 51 | '9': '2-25', 52 | '10': '2-3', 53 | '11': '2-33', 54 | '12': '2-6', 55 | '13': '3-5', 56 | '14': '4-7', 57 | '15': '5-12', 58 | '16': '5-22', 59 | '17': '5-24', 60 | '18': '5-30', 61 | '19': '5-35', 62 | '20': '6-13', 63 | '21': '6-15', 64 | '22': '6-19', 65 | '23': '6-20', 66 | '24': '6-21', 67 | '25': '6-28', 68 | '26': '6-29', 69 | '27': '6-31', 70 | '28': '6-32', 71 | '29': '6-34', 72 | '30': '6-8', 73 | '31': '7-16', 74 | '32': '8-18', 75 | '33': '8-27', 76 | '34': '9-23' 77 | } 78 | 79 | # print(label2id['9-23']) 80 | # print(id2label['0']) -------------------------------------------------------------------------------- /data/code/predict_code/run_predictor.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | import warnings 5 | from argparse import ArgumentParser 6 | 7 | sys.path.append('../../../data') 8 | from data.code.util.tools.predict_tools import * 9 | 10 | 11 | def main(): 12 | parser = ArgumentParser() 13 | 14 | parser.add_argument('--vocab_path', type=str, default='../../user_data/tokenizer/vocab.txt') 15 | parser.add_argument('--output_result_path', type=str, default='../../user_data/output_result') 16 | parser.add_argument('--data_cache_path', type=str, default='../../user_data/process_data/pkl') 17 | parser.add_argument('--test_path', type=str, default='../../user_data/process_data/test.txt') 18 | parser.add_argument('--load_model_path', type=str, default='../../user_data/output_model') 19 | parser.add_argument('--batch_size', type=int, default=128 * 8) 20 | parser.add_argument('--max_seq_len', type=int, default=128) 21 | parser.add_argument('--device', type=str, default='cuda') 22 | 23 | args = parser.parse_args() 
24 |     warnings.filterwarnings('ignore')
25 | 
26 |     os.makedirs(args.output_result_path, exist_ok=True)
27 |     tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
28 | 
29 |     if not os.path.exists(os.path.join(args.data_cache_path, 'test.pkl')):
30 |         read_data(args, tokenizer)
31 | 
32 |     test_dataloader = load_data(args, tokenizer)
33 | 
34 |     model = NeZhaSequenceClassification_P.from_pretrained(os.path.join(args.load_model_path, f'last-checkpoint'))
35 |     model.to(args.device)
36 |     model.eval()
37 | 
38 |     final_res = predict(test_dataloader, model, args)
39 |     final_res.tolist()
40 |     save2csv(args, final_res)
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     main()
45 | 
--------------------------------------------------------------------------------
/READ_ME.md:
--------------------------------------------------------------------------------
1 | # 0. The 5th Daguan Cup. Team name: XiaoChuan Sun. 4th on the A leaderboard, 7th on the B leaderboard, with a single model throughout.
2 | ##### Competition address: https://www.datafountain.cn/competitions/512/ranking?isRedance=0&sch=1804
3 | 
4 | # 1. Data processing details
5 | 
6 | ##### 1.1. The maximum sentence length is limited to 128 tokens; any sentence longer than 128 is truncated by keeping the first 32 and the last 96 tokens.
7 | 
8 | 
9 | # 2. Pretraining details
10 | 
11 | ##### 2.1. The pretraining corpus is the first 180K JSON records (title + content) of the unlabeled data, about 360K texts in total. The training-set and test-set texts are not used (I simply forgot to include them).
12 | 
13 | ##### 2.2. The base model is nezha-cn-base. The pretraining tasks are ALBERT-style n-gram masking plus a Word Structural Objective borrowed from StructBERT: at masking time a randomly chosen trigram is shuffled, and the model must both predict the original tokens and restore their original order, which amounts to an improved version of the StructBERT task.
14 | 
15 | 
16 | # 3. Fine-tuning details
17 | 
18 | ##### 3.1. The usual tricks: PGD, Lookahead, EMA, stratified (layer-wise) learning rates, TSA, etc.
19 | ##### 3.2. The model architecture is customized as follows.
20 | ###### 3.2.1. Concatenating the [CLS] vectors of the last five hidden layers works best (many other structures were tried, e.g. an appended CNN/LSTM, MSD, mean pooling).
21 | ###### 3.2.2. Because each sample carries two levels of labels, the labels are split (primary: 10 classes, secondary: 35 classes) and the two losses are computed separately: the output hidden state goes through two linear layers, each with an output dimension matching its number of labels.
22 | ###### 3.2.3. A self-devised trick: during fine-tuning, in every batch the model also predicts on the training samples, and the loss between those predictions and the true labels is fed back, pulling the predicted labels toward the true labels. It brought a small improvement here and on other datasets I tested; I have not investigated it deeply, but it is a novel point.
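
The Word Structural Objective described in 2.2 above is left as a `complete by yourself` placeholder inside `DGDataCollator.mask_tokens` in `run_pretrain.py`. The sketch below is only an illustration of how such a trigram shuffle could be added on top of the existing n-gram MLM labels; the function name, its signature, and the assumption that `[PAD]` has id 0 are mine, not part of this repository.

```python
import random

import torch


def word_structural_shuffle(input_ids: torch.Tensor,
                            labels: torch.Tensor,
                            special_token_ids: set,
                            pad_id: int = 0) -> None:
    """Illustrative sketch (not the repository's code): for each sequence pick one
    trigram that is not already part of the MLM objective, shuffle it in place,
    and set its labels to the original tokens so the model must restore the order."""
    for row in range(input_ids.size(0)):
        ids = input_ids[row].tolist()
        # candidates: ordinary tokens that are not special tokens, not padding, and
        # not already supervised by the n-gram mask (their label is still -100)
        cand = {i for i, t in enumerate(ids)
                if t not in special_token_ids and t != pad_id and labels[row, i].item() == -100}
        starts = [i for i in cand if i + 1 in cand and i + 2 in cand]
        if not starts:
            continue
        start = random.choice(starts)
        pos = torch.tensor([start, start + 1, start + 2])
        original = input_ids[row, pos].clone()
        input_ids[row, pos] = original[torch.randperm(3)]  # disrupt the trigram
        labels[row, pos] = original                        # supervise restoration
```

One way to wire it in would be to call `word_structural_shuffle(inputs, labels, self.special_token_ids)` right before `mask_tokens` returns `(inputs, labels)`, i.e. after the regular `[MASK]`/random-token replacement, so that nothing later overwrites the restoration labels.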
23 | -------------------------------------------------------------------------------- /data/code/build_vocab/build_vocab.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import torch 5 | import random 6 | import logging 7 | import warnings 8 | import numpy as np 9 | from argparse import ArgumentParser 10 | 11 | from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer 12 | from transformers import BertTokenizer 13 | 14 | logging.basicConfig() 15 | logger = logging.getLogger('build vocab') 16 | logger.setLevel(logging.INFO) 17 | 18 | 19 | def seed_everything(seed): 20 | random.seed(seed) 21 | os.environ['PYTHONHASHSEED'] = str(seed) 22 | np.random.seed(seed) 23 | torch.manual_seed(seed) 24 | torch.cuda.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | torch.backends.cudnn.benchmark = False 27 | torch.backends.cudnn.deterministic = True 28 | 29 | 30 | def train_tokenizer(args): 31 | tokenizer = BertWordPieceTokenizer( 32 | clean_text=False, 33 | handle_chinese_chars=True, 34 | strip_accents=False, 35 | lowercase=False 36 | ) 37 | special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] 38 | 39 | # for i in range(100): 40 | # special_tokens.append(f"[unused{i}]") 41 | 42 | tokenizer.train( 43 | files=[args.file_path, args.unlabeled_file_path], 44 | vocab_size=args.vocab_size, 45 | min_frequency=1, 46 | special_tokens=special_tokens, 47 | limit_alphabet=args.vocab_size, 48 | wordpieces_prefix="##" 49 | ) 50 | os.makedirs(args.out_path, exist_ok=True) 51 | tokenizer.save_model(args.out_path) 52 | tokenizer = BertTokenizer.from_pretrained(args.out_path, 53 | do_lower_case=False, 54 | strip_accents=False) 55 | tokenizer.save_pretrained(args.out_path) 56 | logger.info(f'save tokenizer, with vocab_size: {tokenizer.vocab_size}') 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = ArgumentParser() 61 | 62 | parser.add_argument('--seed', type=int, default=2021) 63 | parser.add_argument('--vocab_size', type=int, default=21128) 64 | parser.add_argument('--file_path', type=str, default='../../user_data/process_data/pretrain.txt') 65 | parser.add_argument('--unlabeled_file_path', type=str, 66 | default='../../user_data/process_data/unlabeled_pretrain.txt') 67 | parser.add_argument('--out_path', type=str, default='../../user_data/tokenizer') 68 | 69 | warnings.filterwarnings('ignore') 70 | args = parser.parse_args() 71 | 72 | seed_everything(args.seed) 73 | 74 | train_tokenizer(args) 75 | 76 | logger.info(f'vocab creation completed .') 77 | -------------------------------------------------------------------------------- /data/code/util/others/hanzi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Constants for working with Chinese characters.""" 3 | 4 | from __future__ import unicode_literals 5 | import sys 6 | 7 | #: Character code ranges for pertinent CJK ideograph Unicode blocks. 
8 | characters = cjk_ideographs = ( 9 | '\u3007' # Ideographic number zero, see issue #17 10 | '\u4E00-\u9FFF' # CJK Unified Ideographs 11 | '\u3400-\u4DBF' # CJK Unified Ideographs Extension A 12 | '\uF900-\uFAFF' # CJK Compatibility Ideographs 13 | ) 14 | if sys.maxunicode > 0xFFFF: 15 | characters += ( 16 | '\U00020000-\U0002A6DF' # CJK Unified Ideographs Extension B 17 | '\U0002A700-\U0002B73F' # CJK Unified Ideographs Extension C 18 | '\U0002B740-\U0002B81F' # CJK Unified Ideographs Extension D 19 | '\U0002F800-\U0002FA1F' # CJK Compatibility Ideographs Supplement 20 | ) 21 | 22 | #: Character code ranges for the Kangxi radicals and CJK Radicals Supplement. 23 | radicals = ( 24 | '\u2F00-\u2FD5' # Kangxi Radicals 25 | '\u2E80-\u2EF3' # CJK Radicals Supplement 26 | ) 27 | 28 | #: A string containing Chinese punctuation marks (non-stops). 29 | non_stops = ( 30 | # Fullwidth ASCII variants 31 | '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D' 32 | '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F' 33 | '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60' 34 | 35 | # Halfwidth CJK punctuation 36 | '\uFF62\uFF63\uFF64' 37 | 38 | # CJK symbols and punctuation 39 | '\u3000\u3001\u3003' 40 | 41 | # CJK angle and corner brackets 42 | '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011' 43 | 44 | # CJK brackets and symbols/punctuation 45 | '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F' 46 | 47 | # Other CJK symbols 48 | '\u3030' 49 | 50 | # Special CJK indicators 51 | '\u303E\u303F' 52 | 53 | # Dashes 54 | '\u2013\u2014' 55 | 56 | # Quotation marks and apostrophe 57 | '\u2018\u2019\u201B\u201C\u201D\u201E\u201F' 58 | 59 | # General punctuation 60 | '\u2026\u2027' 61 | 62 | # Overscores and underscores 63 | '\uFE4F' 64 | 65 | # Small form variants 66 | '\uFE51\uFE54' 67 | 68 | # Latin punctuation 69 | '\u00B7' 70 | ) 71 | 72 | #: A string of Chinese stops. 73 | stops = ( 74 | '\uFF01' # Fullwidth exclamation mark 75 | '\uFF1F' # Fullwidth question mark 76 | '\uFF61' # Halfwidth ideographic full stop 77 | '\u3002' # Ideographic full stop 78 | ) 79 | 80 | #: A string containing all Chinese punctuation. 81 | punctuation = non_stops + stops 82 | 83 | # A sentence end is defined by a stop followed by zero or more 84 | # container-closing marks (e.g. quotation or brackets). 85 | _sentence_end = '[{stops}][」﹂”』’》)]}〕〗〙〛〉】]*'.format(stops=stops) 86 | 87 | #: A regular expression pattern for a Chinese sentence. A sentence is defined 88 | #: as a series of characters and non-stop punctuation marks followed by a stop 89 | #: and zero or more container-closing punctuation marks (e.g. apostrophe or 90 | # brackets). 
91 | sent = sentence = '[{characters}{radicals}{non_stops}]*{sentence_end}'.format( 92 | characters=characters, radicals=radicals, non_stops=non_stops, 93 | sentence_end=_sentence_end) -------------------------------------------------------------------------------- /data/code/models/nezha.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from data.code.util.modeling.modeling_nezha.modeling import NeZhaPreTrainedModel, NeZhaModel 4 | 5 | 6 | class NeZhaSequenceClassification_F(NeZhaPreTrainedModel): 7 | def __init__(self, config): 8 | super().__init__(config) 9 | self.level1_num_labels = 10 10 | self.num_labels = 35 11 | self.bert = NeZhaModel(config) 12 | self.level1_classifier = nn.Linear(config.hidden_size * 5, self.level1_num_labels) 13 | self.classifier = nn.Linear(config.hidden_size * 5, self.num_labels) 14 | self.init_weights() 15 | 16 | def forward( 17 | self, 18 | input_ids=None, 19 | attention_mask=None, 20 | token_type_ids=None, 21 | labels=None, 22 | level1_labels=None 23 | ): 24 | attention_mask = torch.ne(input_ids, 0) 25 | encoder_out, pooled_out, all_hidden_outputs = self.bert( 26 | input_ids=input_ids, 27 | attention_mask=attention_mask, 28 | token_type_ids=token_type_ids 29 | ) 30 | 31 | last_hidden = torch.cat( 32 | ( 33 | all_hidden_outputs[-1][:, 0], 34 | all_hidden_outputs[-2][:, 0], 35 | all_hidden_outputs[-3][:, 0], 36 | all_hidden_outputs[-4][:, 0], 37 | all_hidden_outputs[-5][:, 0] 38 | ), 39 | 1 40 | ) 41 | 42 | logits = self.classifier(last_hidden) 43 | outputs = (logits,) + (pooled_out,) 44 | 45 | if labels is not None: 46 | loss_fct = nn.CrossEntropyLoss() 47 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 48 | 49 | if level1_labels is not None: 50 | level1_logits = self.level1_classifier(last_hidden) 51 | level1_loss = loss_fct(level1_logits.view(-1, self.level1_num_labels), 52 | level1_labels.view(-1)) 53 | loss = loss + 0.5 * level1_loss 54 | outputs = (loss,) + outputs 55 | 56 | return outputs 57 | 58 | 59 | class NeZhaSequenceClassification_P(NeZhaPreTrainedModel): 60 | def __init__(self, config): 61 | super().__init__(config) 62 | self.level1_num_labels = 10 63 | self.num_labels = 35 64 | self.bert = NeZhaModel(config) 65 | self.level1_classifier = nn.Linear(config.hidden_size * 5, self.level1_num_labels) 66 | self.classifier = nn.Linear(config.hidden_size * 5, self.num_labels) 67 | self.init_weights() 68 | 69 | def forward( 70 | self, 71 | input_ids=None, 72 | attention_mask=None, 73 | token_type_ids=None 74 | ): 75 | attention_mask = torch.ne(input_ids, 0) 76 | encoder_out, pooled_out, all_hidden_outputs = self.bert( 77 | input_ids=input_ids, 78 | attention_mask=attention_mask, 79 | token_type_ids=token_type_ids 80 | ) 81 | 82 | last_hidden = torch.cat( 83 | ( 84 | all_hidden_outputs[-1][:, 0], 85 | all_hidden_outputs[-2][:, 0], 86 | all_hidden_outputs[-3][:, 0], 87 | all_hidden_outputs[-4][:, 0], 88 | all_hidden_outputs[-5][:, 0] 89 | ), 90 | 1 91 | ) 92 | 93 | logits = self.classifier(last_hidden) 94 | outputs = (logits,) + (pooled_out,) 95 | 96 | return outputs 97 | -------------------------------------------------------------------------------- /data/code/process_data/process_data.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import sys 5 | import json 6 | import logging 7 | import warnings 8 | import pandas as pd 9 | from tqdm import tqdm 10 | from argparse import 
ArgumentParser 11 | from data.code.util.others.label2id import label2id 12 | 13 | sys.path.append('../../../data') 14 | 15 | logging.basicConfig() 16 | logger = logging.getLogger('第五届达观杯') 17 | logger.setLevel(logging.INFO) 18 | 19 | 20 | def cut_text(text, args): 21 | char = [i for i in text.split(' ')] 22 | length = len(char) 23 | if length > args.max_length: 24 | head = char[:32] 25 | tail = char[-96:] 26 | new_char = head + tail 27 | new_text = '' 28 | for i in new_char: 29 | new_text += i + ' ' 30 | new_text = new_text.strip() 31 | return new_text 32 | else: 33 | return text.strip() 34 | 35 | 36 | def process_unlabeled_data(args): 37 | text = [] 38 | with open(args.unlabeled_path, 'r') as f, open(args.out_unlabeled_path, 'w', encoding='utf-8') as w: 39 | for i in tqdm(range(args.number_unlabeled), desc='processing unlabeled data'): 40 | line_data = f.readline() 41 | if line_data: 42 | data = json.loads(line_data) 43 | title = data['title'] 44 | content = data['content'] 45 | if title == '' or content == '': 46 | continue 47 | else: 48 | text.append(title) 49 | text.append(content) 50 | for j in text: 51 | w.writelines(j + '\n') 52 | text = [] 53 | else: 54 | break 55 | 56 | 57 | def process_text(args): 58 | train = pd.read_csv(args.train_path) 59 | test = pd.read_csv(args.test_path) 60 | 61 | train_text = train['text'].tolist() 62 | test_text = test['text'].tolist() 63 | pretrain_text = train_text + test_text 64 | 65 | label = train['label'].tolist() 66 | 67 | pretrain_sentence, train_sentence, train_sentence1, test_sentence = [], [], [], [] 68 | for i in pretrain_text: 69 | pretrain_sentence.append(i.strip()) 70 | 71 | pretrain_sentence = list(set(pretrain_sentence)) 72 | 73 | logger.info(f'total pretrain data : {len(pretrain_sentence)}.') 74 | 75 | for i in train_text: 76 | train_sentence.append(cut_text(i, args)) 77 | 78 | for i in range(len(train_sentence)): 79 | tgt_level1, tgt_level2 = label[i].split('-') 80 | tgt = label2id[label[i]] 81 | line = train_sentence[i] + '\t' + str(tgt) + '\t' + str(int(tgt_level1) - 1) 82 | train_sentence1.append(line) 83 | 84 | logger.info(f'total train data : {len(train_sentence)}.') 85 | 86 | for i in test_text: 87 | test_sentence.append(cut_text(i, args)) 88 | 89 | logger.info(f'total test data : {len(test_sentence)}.') 90 | 91 | return pretrain_sentence, train_sentence1, test_sentence 92 | 93 | 94 | def write(text_list, out_path): 95 | with open(out_path, 'w', encoding='utf-8') as f: 96 | for i in text_list: 97 | f.writelines(i + '\n') 98 | 99 | logger.info(f'process data has been written to {out_path}.') 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = ArgumentParser() 104 | 105 | parser.add_argument('--max_length', type=int, default=128) 106 | parser.add_argument('--number_unlabeled', type=int, default=180000) 107 | parser.add_argument('--unlabeled_path', type=str, default='../../raw_data/datagrand_2021_unlabeled_data.json') 108 | parser.add_argument('--train_path', type=str, default='../../raw_data/datagrand_2021_train.csv') 109 | parser.add_argument('--test_path', type=str, default='../../raw_data/datagrand_2021_test.csv') 110 | parser.add_argument('--out_path', type=str, default='../../user_data/process_data/') 111 | parser.add_argument('--out_unlabeled_path', type=str, 112 | default='../../user_data/process_data/unlabeled_pretrain.txt') 113 | 114 | warnings.filterwarnings('ignore') 115 | args = parser.parse_args() 116 | 117 | os.makedirs(args.out_path, exist_ok=True) 118 | 119 | out_pretrain_path = os.path.join(args.out_path, 
'pretrain.txt') 120 | out_train_path = os.path.join(args.out_path, 'train.txt') 121 | out_test_path = os.path.join(args.out_path, 'test.txt') 122 | 123 | process_unlabeled_data(args) 124 | pretrain, train, test = process_text(args) 125 | 126 | write(pretrain, out_pretrain_path) 127 | write(train, out_train_path) 128 | write(test, out_test_path) 129 | 130 | logger.info(f'data processing completed .') 131 | -------------------------------------------------------------------------------- /data/code/util/tools/predict_tools.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import sys 5 | import pickle 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from collections import defaultdict 10 | from transformers import BertTokenizer 11 | from torch.utils.data import Dataset, DataLoader 12 | 13 | sys.path.append('../../../../data') 14 | from data.code.models.nezha import * 15 | 16 | 17 | def build_model_and_tokenizer_nezha(args): 18 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path) 19 | model = NeZhaSequenceClassification_P.from_pretrained(os.path.join(args.load_model_path, f'last-checkpoint')) 20 | model.to(args.device) 21 | model.eval() 22 | 23 | return tokenizer, model 24 | 25 | 26 | def read_data(args, tokenizer): 27 | test_df = pd.read_csv(args.test_path, header=None, sep='\t') 28 | 29 | inputs = defaultdict(list) 30 | for i, row in tqdm(test_df.iterrows(), desc=f'Preprocessing test data', total=len(test_df)): 31 | sentence = row[0] 32 | build_bert_inputs(inputs, sentence, tokenizer) 33 | 34 | data_cache_path = args.data_cache_path 35 | if not os.path.exists(data_cache_path): 36 | os.makedirs(data_cache_path) 37 | 38 | cache_pkl_path = os.path.join(data_cache_path, 'test.pkl') 39 | with open(cache_pkl_path, 'wb') as f: 40 | pickle.dump(inputs, f) 41 | 42 | return cache_pkl_path 43 | 44 | 45 | def build_bert_inputs(inputs, sentence, tokenizer): 46 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, 47 | return_token_type_ids=True, return_attention_mask=True) 48 | inputs['input_ids'].append(inputs_dict['input_ids']) 49 | inputs['token_type_ids'].append(inputs_dict['token_type_ids']) 50 | inputs['attention_mask'].append(inputs_dict['attention_mask']) 51 | 52 | 53 | class DGDataset(Dataset): 54 | def __init__(self, data_dict: dict): 55 | super(DGDataset, self).__init__() 56 | self.data_dict = data_dict 57 | 58 | def __getitem__(self, index: int) -> tuple: 59 | data = ( 60 | self.data_dict['input_ids'][index], 61 | self.data_dict['token_type_ids'][index], 62 | self.data_dict['attention_mask'][index] 63 | ) 64 | return data 65 | 66 | def __len__(self) -> int: 67 | return len(self.data_dict['input_ids']) 68 | 69 | 70 | class Collator: 71 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer): 72 | self.max_seq_len = max_seq_len 73 | self.tokenizer = tokenizer 74 | 75 | def pad_and_truncate(self, input_ids_list, token_type_ids_list, 76 | attention_mask_list, max_seq_len): 77 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) 78 | token_type_ids = torch.zeros_like(input_ids) 79 | attention_mask = torch.zeros_like(input_ids) 80 | for i in range(len(input_ids_list)): 81 | seq_len = len(input_ids_list[i]) 82 | if seq_len <= max_seq_len: 83 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) 84 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) 85 | attention_mask[i, 
:seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) 86 | else: 87 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], 88 | dtype=torch.long) 89 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) 90 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) 91 | 92 | return input_ids, token_type_ids, attention_mask 93 | 94 | def __call__(self, examples: list) -> dict: 95 | input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples)) 96 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) 97 | max_seq_len = min(cur_max_seq_len, self.max_seq_len) 98 | 99 | input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list, token_type_ids_list, 100 | attention_mask_list, max_seq_len) 101 | 102 | data_dict = { 103 | 'input_ids': input_ids, 104 | 'token_type_ids': token_type_ids, 105 | 'attention_mask': attention_mask 106 | } 107 | 108 | return data_dict 109 | 110 | 111 | def load_data(args, tokenizer): 112 | cache_pkl_path = os.path.join(args.data_cache_path, 'test.pkl') 113 | 114 | with open(cache_pkl_path, 'rb') as f: 115 | test_data = pickle.load(f) 116 | 117 | collate_fn = Collator(args.max_seq_len, tokenizer) 118 | test_dataset = DGDataset(test_data) 119 | test_dataloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, 120 | num_workers=0, collate_fn=collate_fn) 121 | return test_dataloader 122 | 123 | 124 | def save2csv(args, p_logit): 125 | logit_path = os.path.join(args.output_result_path, 'full_logit.csv') 126 | result = pd.DataFrame(p_logit, columns=["label%d" % i for i in range(p_logit.shape[-1])]) 127 | result.to_csv(logit_path, index=False) 128 | 129 | print(f"result hace save in :{logit_path} .") 130 | 131 | 132 | def batch2cuda(args, batch): 133 | return {item: value.to(args.device) for item, value in list(batch.items())} 134 | 135 | 136 | def predict(test_dataloader, pre_model, args): 137 | p_logit = [] 138 | 139 | val_iterator = tqdm(test_dataloader, desc='Predict', total=len(test_dataloader)) 140 | 141 | with torch.no_grad(): 142 | for batch in val_iterator: 143 | batch_cuda = batch2cuda(args, batch) 144 | logits = pre_model(**batch_cuda)[0] 145 | p_logit.extend(torch.softmax(logits, -1).cpu().numpy()) 146 | 147 | return np.vstack(p_logit) 148 | 149 | 150 | def create_dirs(path_list): 151 | for path in path_list: 152 | os.makedirs(path, exist_ok=True) 153 | -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/configuration.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import PretrainedConfig 3 | 4 | NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 5 | 6 | class NeZhaConfig(PretrainedConfig): 7 | r""" 8 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 9 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 10 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 11 | the ALBERT `xxlarge `__ architecture. 12 | 13 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 14 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 15 | for more information. 
16 | 17 | 18 | Args: 19 | vocab_size (:obj:`int`, optional, defaults to 30000): 20 | Vocabulary size of the ALBERT model. Defines the different tokens that 21 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 22 | embedding_size (:obj:`int`, optional, defaults to 128): 23 | Dimensionality of vocabulary embeddings. 24 | hidden_size (:obj:`int`, optional, defaults to 4096): 25 | Dimensionality of the encoder layers and the pooler layer. 26 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 27 | Number of hidden layers in the Transformer encoder. 28 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 29 | Number of groups for the hidden layers, parameters in the same group are shared. 30 | num_attention_heads (:obj:`int`, optional, defaults to 64): 31 | Number of attention heads for each attention layer in the Transformer encoder. 32 | intermediate_size (:obj:`int`, optional, defaults to 16384): 33 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 34 | inner_group_num (:obj:`int`, optional, defaults to 1): 35 | The number of inner repetition of attention and ffn. 36 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 37 | The non-linear activation function (function or string) in the encoder and pooler. 38 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 39 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 40 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 41 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 42 | The dropout ratio for the attention probabilities. 43 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 44 | The maximum sequence length that this model might ever be used with. Typically set this to something 45 | large (e.g., 512 or 1024 or 2048). 46 | type_vocab_size (:obj:`int`, optional, defaults to 2): 47 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 48 | initializer_range (:obj:`float`, optional, defaults to 0.02): 49 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 50 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 51 | The epsilon used by the layer normalization layers. 52 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 53 | The dropout ratio for attached classifiers. 54 | 55 | Example:: 56 | 57 | from transformers import AlbertConfig, AlbertModel 58 | # Initializing an ALBERT-xxlarge style configuration 59 | albert_xxlarge_configuration = AlbertConfig() 60 | 61 | # Initializing an ALBERT-base style configuration 62 | albert_base_configuration = AlbertConfig( 63 | hidden_size=768, 64 | num_attention_heads=12, 65 | intermediate_size=3072, 66 | ) 67 | 68 | # Initializing a model from the ALBERT-base style configuration 69 | model = AlbertModel(albert_xxlarge_configuration) 70 | 71 | # Accessing the model configuration 72 | configuration = model.config 73 | 74 | Attributes: 75 | pretrained_config_archive_map (Dict[str, str]): 76 | A dictionary containing all the available pre-trained checkpoints. 
77 | """ 78 | 79 | pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP 80 | model_type = "nezha" 81 | 82 | def __init__( 83 | self, 84 | vocab_size=30000, 85 | embedding_size=128, 86 | hidden_size=4096, 87 | num_hidden_layers=12, 88 | num_hidden_groups=1, 89 | num_attention_heads=64, 90 | intermediate_size=16384, 91 | inner_group_num=1, 92 | hidden_act="gelu_new", 93 | hidden_dropout_prob=0, 94 | attention_probs_dropout_prob=0, 95 | max_position_embeddings=512, 96 | max_relative_position=64, 97 | type_vocab_size=2, 98 | initializer_range=0.02, 99 | layer_norm_eps=1e-12, 100 | classifier_dropout_prob=0.1, 101 | use_relative_position=True, 102 | pad_token_id=0, 103 | bos_token_id=2, 104 | eos_token_id=3, 105 | **kwargs 106 | ): 107 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 108 | 109 | self.vocab_size = vocab_size 110 | self.embedding_size = embedding_size 111 | self.hidden_size = hidden_size 112 | self.num_hidden_layers = num_hidden_layers 113 | self.num_hidden_groups = num_hidden_groups 114 | self.num_attention_heads = num_attention_heads 115 | self.inner_group_num = inner_group_num 116 | self.hidden_act = hidden_act 117 | self.intermediate_size = intermediate_size 118 | self.hidden_dropout_prob = hidden_dropout_prob 119 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 120 | self.max_position_embeddings = max_position_embeddings 121 | self.max_relative_position = max_relative_position 122 | self.type_vocab_size = type_vocab_size 123 | self.initializer_range = initializer_range 124 | self.layer_norm_eps = layer_norm_eps 125 | self.use_relative_position=use_relative_position 126 | self.classifier_dropout_prob = classifier_dropout_prob 127 | -------------------------------------------------------------------------------- /data/code/finetune_code/run_classify.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import gc 4 | import sys 5 | import warnings 6 | from torch import multiprocessing 7 | from argparse import ArgumentParser 8 | 9 | sys.path.append('../../../data') 10 | from data.code.util.tools.finetune_tools import * 11 | 12 | multiprocessing.set_sharing_strategy('file_system') 13 | 14 | 15 | class PGD: 16 | def __init__(self, args, model): 17 | self.model = model 18 | self.emb_backup = {} 19 | self.grad_backup = {} 20 | self.epsilon = args.epsilon 21 | self.emb_name = args.emb_name 22 | self.alpha = args.alpha 23 | 24 | def attack(self, is_first_attack=False): 25 | for name, param in self.model.bert.named_parameters(): 26 | if param.requires_grad and self.emb_name in name: 27 | if is_first_attack: 28 | self.emb_backup[name] = param.data.clone() 29 | norm = torch.norm(param.grad) 30 | if norm != 0 and not torch.isnan(norm): 31 | r_at = self.alpha * param.grad / norm 32 | param.data.add_(r_at) 33 | param.data = self.project(name, param.data, self.epsilon) 34 | 35 | def restore(self): 36 | for name, param in self.model.bert.named_parameters(): 37 | if param.requires_grad and self.emb_name in name: 38 | assert name in self.emb_backup 39 | param.data = self.emb_backup[name] 40 | self.emb_backup = {} 41 | 42 | def project(self, param_name, param_data, epsilon): 43 | r = param_data - self.emb_backup[param_name] 44 | if torch.norm(r) > epsilon: 45 | r = epsilon * r / torch.norm(r) 46 | return self.emb_backup[param_name] + r 47 | 48 | def backup_grad(self): 49 | for name, param in self.model.bert.named_parameters(): 50 | if 
param.requires_grad and param.grad is not None: 51 | self.grad_backup[name] = param.grad.clone() 52 | 53 | def restore_grad(self): 54 | for name, param in self.model.bert.named_parameters(): 55 | if param.requires_grad and param.grad is not None: 56 | param.grad = self.grad_backup[name] 57 | 58 | 59 | def train(args): 60 | tokenizer, model = build_model_and_tokenizer(args) 61 | 62 | if not os.path.exists(os.path.join(args.data_cache_path, 'train.pkl')): 63 | read_data(args, tokenizer) 64 | 65 | train_dataloader = load_data(args, tokenizer) 66 | 67 | total_steps = args.num_epochs * len(train_dataloader) 68 | 69 | optimizer, scheduler = build_optimizer(args, model, total_steps) 70 | 71 | total_loss, cur_avg_loss, global_steps = 0., 0., 0 72 | 73 | for epoch in range(1, args.num_epochs + 1): 74 | 75 | train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader)) 76 | 77 | model.train() 78 | 79 | for batch in train_iterator: 80 | batch_cuda = batch2cuda(args, batch) 81 | loss, logits = model(**batch_cuda)[:2] 82 | 83 | # TSA, 仅 backward loss 小于 阈值的 loss 84 | start, end = 1. / logits.shape[-1], 1 85 | tsa_thresh = get_tsa_thresh(args, global_steps, total_steps, start, end) 86 | larger_than_threshold = torch.exp(-loss) > tsa_thresh 87 | loss_mask = torch.ones_like(batch_cuda['labels'], dtype=torch.float32) * (1 - larger_than_threshold. 88 | type(torch.float32)) 89 | loss = torch.sum(loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), 90 | torch.tensor(1.).to(args.device)) 91 | 92 | total_loss += loss.item() 93 | cur_avg_loss += loss.item() 94 | 95 | loss.backward() 96 | 97 | if args.adv == 'pgd': 98 | pgd = PGD(args, model) 99 | K = args.adv_k 100 | pgd.backup_grad() 101 | for t in range(K): 102 | pgd.attack(is_first_attack=(t == 0)) 103 | if t != K - 1: 104 | model.zero_grad() 105 | else: 106 | pgd.restore_grad() 107 | adv_loss, adv_logits = model(**batch_cuda)[:2] 108 | adv_loss.backward() 109 | pgd.restore() 110 | 111 | optimizer.step() 112 | scheduler.step() 113 | optimizer.zero_grad() 114 | 115 | if args.ema_start: 116 | ema.update() 117 | 118 | if epoch >= args.ema_start_epoch: 119 | args.ema_start = True 120 | ema = EMA(model.module if hasattr(model, 'module') else model, decay=0.999) 121 | 122 | if (global_steps + 1) % args.logging_step == 0: 123 | epoch_avg_loss = cur_avg_loss / args.logging_step 124 | global_avg_loss = total_loss / (global_steps + 1) 125 | 126 | print(f"\n>> epoch - {epoch}, global steps - {global_steps + 1}, " 127 | f"epoch avg loss - {epoch_avg_loss:.4f}, global avg loss - {global_avg_loss:.4f}.") 128 | 129 | cur_avg_loss = 0.0 130 | 131 | global_steps += 1 132 | 133 | if epoch >= args.ema_start_epoch: 134 | ema.apply_shadow() 135 | 136 | save_model(args, model, tokenizer) 137 | 138 | del model, tokenizer, optimizer, scheduler 139 | torch.cuda.empty_cache() 140 | gc.collect() 141 | 142 | 143 | def main(): 144 | parser = ArgumentParser() 145 | parser.add_argument('--output_path', type=str, 146 | default='../../user_data/output_model') 147 | parser.add_argument('--train_path', type=str, 148 | default='../../user_data/process_data/train.txt') 149 | parser.add_argument('--data_cache_path', type=str, 150 | default='../../user_data/process_data/pkl') 151 | parser.add_argument('--vocab_path', type=str, 152 | default='../../user_data/tokenizer/vocab.txt') 153 | parser.add_argument('--model_path', type=str, 154 | default='../../user_data/saved_pretrain_model_record/checkpoint-240000') 155 | 156 | parser.add_argument('--num_epochs', 
type=int, default=4) 157 | parser.add_argument('--batch_size', type=int, default=32) 158 | parser.add_argument('--max_seq_len', type=int, default=128) 159 | 160 | parser.add_argument('--learning_rate', type=float, default=2e-5) 161 | parser.add_argument('--downstream_learning_rate', type=float, default=1e-4) 162 | parser.add_argument('--eps', type=float, default=1e-8) 163 | 164 | parser.add_argument('--adv_k', type=int, default=10) 165 | parser.add_argument('--alpha', type=float, default=0.3) 166 | parser.add_argument('--epsilon', type=float, default=0.5) 167 | parser.add_argument('--emb_name', type=str, default='word_embeddings.') 168 | parser.add_argument('--adv', type=str, default='pgd', choices=['', 'pgd']) 169 | 170 | parser.add_argument('--lookahead_k', type=int, default=5) 171 | parser.add_argument('--lookahead_alpha', type=int, default=1) 172 | 173 | parser.add_argument('--ema_start', type=bool, default=False) 174 | parser.add_argument('--ema_start_epoch', type=int, default=3) 175 | 176 | parser.add_argument('--schedule', type=str, default='log', choices=['linear', 'exp', 'log']) 177 | 178 | parser.add_argument('--warmup_ratio', type=float, default=0.1) 179 | parser.add_argument('--weight_decay', type=float, default=0.01) 180 | 181 | parser.add_argument('--logging_step', type=int, default=100) 182 | 183 | parser.add_argument('--seed', type=int, default=2021) 184 | 185 | parser.add_argument('--device', type=str, default='cuda') 186 | 187 | warnings.filterwarnings('ignore') 188 | args = parser.parse_args() 189 | 190 | os.makedirs(os.path.dirname(args.output_path), exist_ok=True) 191 | 192 | seed_everything(args.seed) 193 | train(args) 194 | 195 | 196 | if __name__ == '__main__': 197 | main() 198 | -------------------------------------------------------------------------------- /data/code/pretrain_code/run_pretrain.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import re 5 | import sys 6 | import random 7 | import warnings 8 | import numpy as np 9 | import pandas as pd 10 | from tqdm import tqdm 11 | from typing import List, Tuple 12 | from collections import defaultdict 13 | from argparse import ArgumentParser 14 | 15 | import torch 16 | from torch.utils.data import Dataset 17 | from transformers import BertTokenizer, TrainingArguments 18 | 19 | sys.path.append('../../../data') 20 | from data.code.util.others.hanzi import punctuation 21 | from data.code.util.pretrain_utils.trainer import Trainer 22 | from data.code.util.modeling.modeling_nezha.modeling import NeZhaConfig, NeZhaForMaskedLM 23 | 24 | warnings.filterwarnings('ignore') 25 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 26 | os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1' 27 | 28 | 29 | def seed_everything(seed): 30 | random.seed(seed) 31 | np.random.seed(seed) 32 | torch.manual_seed(seed) 33 | torch.cuda.manual_seed_all(seed) 34 | return seed 35 | 36 | 37 | def read_data(pretrain_file_path, tokenizer: BertTokenizer) -> dict: 38 | pretrain_df = pd.read_csv(pretrain_file_path, header=None, sep='\t') 39 | inputs = defaultdict(list) 40 | for i, row in tqdm(pretrain_df.iterrows(), desc='', total=len(pretrain_df)): 41 | sentence = row[0].strip() 42 | sentence = re.sub(r"[%s]+" % punctuation, '[SEP]', sentence) 43 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, 44 | return_token_type_ids=True, return_attention_mask=True) 45 | inputs['input_ids'].append(inputs_dict['input_ids']) 46 | 
inputs['token_type_ids'].append(inputs_dict['token_type_ids']) 47 | inputs['attention_mask'].append(inputs_dict['attention_mask']) 48 | 49 | return inputs 50 | 51 | 52 | class DGDataset(Dataset): 53 | def __init__(self, data_dict: dict): 54 | super(Dataset, self).__init__() 55 | self.data_dict = data_dict 56 | 57 | def __getitem__(self, index: int) -> tuple: 58 | data = (self.data_dict['input_ids'][index], 59 | self.data_dict['token_type_ids'][index], 60 | self.data_dict['attention_mask'][index]) 61 | 62 | return data 63 | 64 | def __len__(self) -> int: 65 | return len(self.data_dict['input_ids']) 66 | 67 | 68 | class DGDataCollator: 69 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer, mlm_probability=0.15): 70 | self.max_seq_len = max_seq_len 71 | self.tokenizer = tokenizer 72 | self.mlm_probability = mlm_probability 73 | self.special_token_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id} 74 | 75 | def pad_and_truncate(self, input_ids_list, token_type_ids_list, 76 | attention_mask_list, max_seq_len): 77 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) 78 | token_type_ids = torch.zeros_like(input_ids) 79 | attention_mask = torch.zeros_like(input_ids) 80 | for i in range(len(input_ids_list)): 81 | seq_len = len(input_ids_list[i]) 82 | if seq_len <= max_seq_len: 83 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) 84 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) 85 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) 86 | else: 87 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], 88 | dtype=torch.long) 89 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) 90 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) 91 | return input_ids, token_type_ids, attention_mask 92 | 93 | def _ngram_mask(self, input_ids, max_seq_len): 94 | cand_indexes = [] 95 | for (i, id_) in enumerate(input_ids): 96 | if id_ in self.special_token_ids: 97 | continue 98 | cand_indexes.append([i]) 99 | num_to_predict = max(1, int(round(len(input_ids) * self.mlm_probability))) 100 | 101 | max_ngram = 3 102 | ngrams = np.arange(1, max_ngram + 1, dtype=np.int64) 103 | pvals = 1. 
/ np.arange(1, max_ngram + 1) 104 | pvals /= pvals.sum(keepdims=True) 105 | 106 | ngram_indexes = [] 107 | for idx in range(len(cand_indexes)): 108 | ngram_index = [] 109 | for n in ngrams: 110 | ngram_index.append(cand_indexes[idx:idx + n]) 111 | ngram_indexes.append(ngram_index) 112 | np.random.shuffle(ngram_indexes) 113 | 114 | covered_indexes = set() 115 | 116 | for cand_index_set in ngram_indexes: 117 | if len(covered_indexes) >= num_to_predict: 118 | break 119 | if not cand_index_set: 120 | continue 121 | for index_set in cand_index_set[0]: 122 | for index in index_set: 123 | if index in covered_indexes: 124 | continue 125 | n = np.random.choice(ngrams[:len(cand_index_set)], 126 | p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) 127 | index_set = sum(cand_index_set[n - 1], []) 128 | n -= 1 129 | while len(covered_indexes) + len(index_set) > num_to_predict: 130 | if n == 0: 131 | break 132 | index_set = sum(cand_index_set[n - 1], []) 133 | n -= 1 134 | if len(covered_indexes) + len(index_set) > num_to_predict: 135 | continue 136 | is_any_index_covered = False 137 | for index in index_set: 138 | if index in covered_indexes: 139 | is_any_index_covered = True 140 | break 141 | if is_any_index_covered: 142 | continue 143 | for index in index_set: 144 | covered_indexes.add(index) 145 | 146 | mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_ids))] 147 | mask_labels += [0] * (max_seq_len - len(mask_labels)) 148 | 149 | return torch.tensor(mask_labels[:max_seq_len]) 150 | 151 | def ngram_mask(self, input_ids_list: List[list], max_seq_len: int): 152 | mask_labels = [] 153 | for i, input_ids in enumerate(input_ids_list): 154 | mask_label = self._ngram_mask(input_ids, max_seq_len) 155 | mask_labels.append(mask_label) 156 | return torch.stack(mask_labels, dim=0) 157 | 158 | def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> \ 159 | Tuple[torch.Tensor, torch.Tensor]: 160 | 161 | labels = inputs.clone() 162 | probability_matrix = mask_labels 163 | 164 | # word struct prediction 165 | 166 | ''' 167 | complete by yourself 168 | ''' 169 | 170 | special_tokens_mask = [ 171 | self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() 172 | ] 173 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 174 | masked_indices = probability_matrix.bool() 175 | labels[~masked_indices] = -100 176 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 177 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) 178 | indices_random = torch.bernoulli( 179 | torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 180 | random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) 181 | inputs[indices_random] = random_words[indices_random] 182 | return inputs, labels 183 | 184 | def __call__(self, examples: list) -> dict: 185 | input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples)) 186 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) 187 | max_seq_len = min(cur_max_seq_len, self.max_seq_len) 188 | 189 | input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list, 190 | token_type_ids_list, 191 | attention_mask_list, 192 | max_seq_len) 193 | batch_mask = self.ngram_mask(input_ids_list, max_seq_len) 194 | input_ids, mlm_labels = self.mask_tokens(input_ids, batch_mask) 195 | 
data_dict = { 196 | 'input_ids': input_ids, 197 | 'attention_mask': attention_mask, 198 | 'token_type_ids': token_type_ids, 199 | 'labels': mlm_labels 200 | } 201 | 202 | return data_dict 203 | 204 | 205 | def main(): 206 | parser = ArgumentParser() 207 | parser.add_argument('--pretrain_data_path', type=str, default='../../user_data/process_data/unlabeled_pretrain.txt') 208 | parser.add_argument('--pretrain_model_path', type=str, default='../../user_data/pretrain_model/nezha-cn-base') 209 | parser.add_argument('--vocab_path', type=str, default='../../user_data/tokenizer/vocab.txt') 210 | parser.add_argument('--save_path', type=str, default='../../user_data/saved_pretrain_model') 211 | parser.add_argument('--record_save_path', type=str, default='../../user_data/saved_pretrain_model_record') 212 | parser.add_argument('--mlm_probability', type=float, default=0.15) 213 | parser.add_argument('--num_train_epochs', type=int, default=100) 214 | parser.add_argument('--seq_length', type=int, default=128) 215 | parser.add_argument('--batch_size', type=int, default=64) 216 | parser.add_argument('--learning_rate', type=float, default=6e-5) 217 | parser.add_argument('--save_steps', type=int, default=10000) 218 | parser.add_argument('--ckpt_save_limit', type=int, default=6) 219 | parser.add_argument('--logging_steps', type=int, default=2000) 220 | parser.add_argument('--seed', type=int, default=2021) 221 | parser.add_argument('--fp16', type=str, default=True) 222 | parser.add_argument('--fp16_backend', type=str, default='amp') 223 | 224 | warnings.filterwarnings('ignore') 225 | args = parser.parse_args() 226 | 227 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True) 228 | os.makedirs(os.path.dirname(args.record_save_path), exist_ok=True) 229 | 230 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path) 231 | model_config = NeZhaConfig.from_pretrained(args.pretrain_model_path) 232 | 233 | data = read_data(args.pretrain_data_path, tokenizer) 234 | 235 | data_collator = DGDataCollator(max_seq_len=args.seq_length, 236 | tokenizer=tokenizer, 237 | mlm_probability=args.mlm_probability) 238 | model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=args.pretrain_model_path, 239 | config=model_config) 240 | model.resize_token_embeddings(tokenizer.vocab_size) 241 | dataset = DGDataset(data) 242 | 243 | training_args = TrainingArguments( 244 | seed=args.seed, 245 | fp16=args.fp16, 246 | fp16_backend=args.fp16_backend, 247 | save_steps=args.save_steps, 248 | prediction_loss_only=True, 249 | logging_steps=args.logging_steps, 250 | output_dir=args.record_save_path, 251 | learning_rate=args.learning_rate, 252 | save_total_limit=args.ckpt_save_limit, 253 | num_train_epochs=args.num_train_epochs, 254 | per_device_train_batch_size=args.batch_size 255 | ) 256 | 257 | trainer = Trainer( 258 | model=model, 259 | args=training_args, 260 | train_dataset=dataset, 261 | data_collator=data_collator 262 | ) 263 | 264 | trainer.train() 265 | trainer.save_model(args.save_path) 266 | tokenizer.save_pretrained(args.save_path) 267 | 268 | 269 | if __name__ == '__main__': 270 | main() 271 | -------------------------------------------------------------------------------- /data/code/util/tools/finetune_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import random 5 | import numpy as np 6 | import pandas as pd 7 | from torch.optim import AdamW 8 | from torch.utils.data import Dataset, DataLoader 9 | from tqdm import 
tqdm 10 | from transformers import BertTokenizer 11 | from collections import defaultdict 12 | 13 | from torch.optim import Optimizer 14 | from torch.optim.lr_scheduler import LambdaLR 15 | 16 | sys.path.append('../../../../data') 17 | from data.code.models.nezha import * 18 | 19 | 20 | def seed_everything(seed): 21 | torch.manual_seed(seed) 22 | torch.cuda.manual_seed(seed) 23 | torch.cuda.manual_seed_all(seed) 24 | torch.backends.cudnn.benchmark = False 25 | torch.backends.cudnn.deterministic = True 26 | random.seed(seed) 27 | np.random.seed(seed) 28 | os.environ['PYTHONHASHSEED'] = str(seed) 29 | 30 | 31 | def batch2cuda(args, batch): 32 | return {item: value.to(args.device) for item, value in list(batch.items())} 33 | 34 | 35 | def build_model_and_tokenizer(args): 36 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path) 37 | model = NeZhaSequenceClassification_F.from_pretrained(args.model_path) 38 | model.to(args.device) 39 | 40 | return tokenizer, model 41 | 42 | 43 | class PGD: 44 | def __init__(self, args, model): 45 | self.model = model 46 | self.emb_backup = {} 47 | self.grad_backup = {} 48 | self.epsilon = args.epsilon 49 | self.emb_name = args.emb_name 50 | self.alpha = args.alpha 51 | 52 | def attack(self, is_first_attack=False): 53 | for name, param in self.model.bert.named_parameters(): 54 | if param.requires_grad and self.emb_name in name: 55 | if is_first_attack: 56 | self.emb_backup[name] = param.data.clone() 57 | norm = torch.norm(param.grad) 58 | if norm != 0 and not torch.isnan(norm): 59 | r_at = self.alpha * param.grad / norm 60 | param.data.add_(r_at) 61 | param.data = self.project(name, param.data, self.epsilon) 62 | 63 | def restore(self): 64 | for name, param in self.model.bert.named_parameters(): 65 | if param.requires_grad and self.emb_name in name: 66 | assert name in self.emb_backup 67 | param.data = self.emb_backup[name] 68 | self.emb_backup = {} 69 | 70 | def project(self, param_name, param_data, epsilon): 71 | r = param_data - self.emb_backup[param_name] 72 | if torch.norm(r) > epsilon: 73 | r = epsilon * r / torch.norm(r) 74 | return self.emb_backup[param_name] + r 75 | 76 | def backup_grad(self): 77 | for name, param in self.model.bert.named_parameters(): 78 | if param.requires_grad and param.grad is not None: 79 | self.grad_backup[name] = param.grad.clone() 80 | 81 | def restore_grad(self): 82 | for name, param in self.model.bert.named_parameters(): 83 | if param.requires_grad and param.grad is not None: 84 | param.grad = self.grad_backup[name] 85 | 86 | 87 | class Lookahead(Optimizer): 88 | def __init__(self, optimizer, k=5, alpha=0.5): 89 | self.optimizer = optimizer 90 | self.k = k 91 | self.alpha = alpha 92 | self.param_groups = self.optimizer.param_groups 93 | self.state = defaultdict(dict) 94 | self.fast_state = self.optimizer.state 95 | for group in self.param_groups: 96 | group["counter"] = 0 97 | 98 | def update(self, group): 99 | for fast in group["params"]: 100 | param_state = self.state[fast] 101 | if "slow_param" not in param_state: 102 | param_state["slow_param"] = torch.zeros_like(fast.data) 103 | param_state["slow_param"].copy_(fast.data) 104 | slow = param_state["slow_param"] 105 | slow += (fast.data - slow) * self.alpha 106 | fast.data.copy_(slow) 107 | 108 | def update_lookahead(self): 109 | for group in self.param_groups: 110 | self.update(group) 111 | 112 | def step(self, closure=None): 113 | loss = self.optimizer.step(closure) 114 | for group in self.param_groups: 115 | if group["counter"] == 0: 116 | self.update(group) 117 | 
group["counter"] += 1 118 | if group["counter"] >= self.k: 119 | group["counter"] = 0 120 | return loss 121 | 122 | def state_dict(self): 123 | fast_state_dict = self.optimizer.state_dict() 124 | slow_state = { 125 | (id(k) if isinstance(k, torch.Tensor) else k): v 126 | for k, v in self.state.items() 127 | } 128 | fast_state = fast_state_dict["state"] 129 | param_groups = fast_state_dict["param_groups"] 130 | return { 131 | "fast_state": fast_state, 132 | "slow_state": slow_state, 133 | "param_groups": param_groups, 134 | } 135 | 136 | def load_state_dict(self, state_dict): 137 | slow_state_dict = { 138 | "state": state_dict["slow_state"], 139 | "param_groups": state_dict["param_groups"], 140 | } 141 | fast_state_dict = { 142 | "state": state_dict["fast_state"], 143 | "param_groups": state_dict["param_groups"], 144 | } 145 | super(Lookahead, self).load_state_dict(slow_state_dict) 146 | self.optimizer.load_state_dict(fast_state_dict) 147 | self.fast_state = self.optimizer.state 148 | 149 | def add_param_group(self, param_group): 150 | param_group["counter"] = 0 151 | self.optimizer.add_param_group(param_group) 152 | 153 | 154 | class EMA: 155 | def __init__(self, model, decay): 156 | self.model = model 157 | self.decay = decay 158 | self.shadow = {} 159 | self.backup = {} 160 | self.register() 161 | 162 | def register(self): 163 | for name, param in self.model.named_parameters(): 164 | if param.requires_grad: 165 | self.shadow[name] = param.data.clone() 166 | 167 | def update(self): 168 | for name, param in self.model.named_parameters(): 169 | if param.requires_grad: 170 | assert name in self.shadow 171 | new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name] 172 | self.shadow[name] = new_average.clone() 173 | 174 | def apply_shadow(self): 175 | for name, param in self.model.named_parameters(): 176 | if param.requires_grad: 177 | assert name in self.shadow 178 | self.backup[name] = param.data 179 | param.data = self.shadow[name] 180 | 181 | def restore(self): 182 | for name, param in self.model.named_parameters(): 183 | if param.requires_grad: 184 | assert name in self.backup 185 | param.data = self.backup[name] 186 | self.backup = {} 187 | 188 | 189 | class WarmupLinearSchedule(LambdaLR): 190 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): 191 | self.warmup_steps = warmup_steps 192 | self.t_total = t_total 193 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 194 | 195 | def lr_lambda(self, step): 196 | if step < self.warmup_steps: 197 | return float(step) / float(max(1, self.warmup_steps)) 198 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) 199 | 200 | 201 | def build_optimizer(args, model, train_steps): 202 | no_decay = ['bias', 'LayerNorm.weight'] 203 | 204 | bert_model_param, bert_downstream_param = [], [] 205 | 206 | for items in model.named_parameters(): 207 | if "bert" in items: 208 | bert_model_param.append(items) 209 | else: 210 | bert_downstream_param.append(items) 211 | 212 | optimizer_grouped_parameters = [ 213 | {"params": [p for n, p in bert_model_param if 214 | not any(nd in n for nd in no_decay)], 215 | 'weight_decay_rate': args.weight_decay, "lr": args.learning_rate}, 216 | {'params': [p for n, p in bert_model_param if 217 | any(nd in n for nd in no_decay)], 218 | 'weight_decay_rate': 0.0, 'lr': args.learning_rate}, 219 | 220 | {"params": [p for n, p in bert_downstream_param if 221 | not any(nd in n for nd in no_decay)], 222 | 
'weight_decay_rate': args.weight_decay, "lr": args.downstream_learning_rate}, 223 | {'params': [p for n, p in bert_downstream_param if 224 | any(nd in n for nd in no_decay)], 225 | 'weight_decay_rate': 0.0, 'lr': args.downstream_learning_rate} 226 | ] 227 | 228 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.eps) 229 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup_ratio, 230 | t_total=train_steps) 231 | optimizer = Lookahead(optimizer, args.lookahead_k, args.lookahead_alpha) 232 | 233 | return optimizer, scheduler 234 | 235 | 236 | def save_model(args, model, tokenizer): 237 | model_to_save = model.module if hasattr(model, 'module') else model 238 | model_save_path = os.path.join(args.output_path, f'last-checkpoint') 239 | model_to_save.save_pretrained(model_save_path) 240 | tokenizer.save_vocabulary(model_save_path) 241 | 242 | print(f'model saved in : {model_save_path} .') 243 | 244 | 245 | def get_tsa_thresh(args, global_step, num_train_steps, start, end): 246 | training_progress = torch.tensor(float(global_step) / float(num_train_steps)) 247 | 248 | if args.schedule == 'linear': 249 | threshold = training_progress 250 | elif args.schedule == 'exp': 251 | scale = 5 252 | threshold = torch.exp((training_progress - 1) * scale) 253 | elif args.schedule == 'log': 254 | scale = 5 255 | threshold = 1 - torch.exp((-training_progress) * scale) 256 | 257 | output = threshold * (end - start) + start 258 | 259 | return output.to(args.device) 260 | 261 | 262 | def read_data(args, tokenizer): 263 | train_df = pd.read_csv(args.train_path, header=None, sep='\t') 264 | 265 | inputs = defaultdict(list) 266 | for i, row in tqdm(train_df.iterrows(), desc=f'Preprocessing train data', total=len(train_df)): 267 | sentence, label, level1_label = row 268 | build_bert_inputs(inputs, label, level1_label, sentence, tokenizer) 269 | 270 | data_cache_path = args.data_cache_path 271 | if not os.path.exists(data_cache_path): 272 | os.makedirs(data_cache_path) 273 | 274 | cache_pkl_path = os.path.join(data_cache_path, 'train.pkl') 275 | with open(cache_pkl_path, 'wb') as f: 276 | pickle.dump(inputs, f) 277 | 278 | return cache_pkl_path 279 | 280 | 281 | def build_bert_inputs(inputs, label, level1_label, sentence, tokenizer): 282 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, 283 | return_token_type_ids=True, return_attention_mask=True) 284 | inputs['input_ids'].append(inputs_dict['input_ids']) 285 | inputs['token_type_ids'].append(inputs_dict['token_type_ids']) 286 | inputs['attention_mask'].append(inputs_dict['attention_mask']) 287 | inputs['labels'].append(label) 288 | inputs['level1_labels'].append(level1_label) 289 | 290 | 291 | class DGDataset(Dataset): 292 | def __init__(self, data_dict: dict, tokenizer: BertTokenizer): 293 | super(DGDataset, self).__init__() 294 | self.data_dict = data_dict 295 | self.tokenizer = tokenizer 296 | 297 | def __getitem__(self, index: int) -> tuple: 298 | data = ( 299 | self.data_dict['input_ids'][index], 300 | self.data_dict['token_type_ids'][index], 301 | self.data_dict['attention_mask'][index], 302 | self.data_dict['labels'][index], 303 | self.data_dict['level1_labels'][index] 304 | ) 305 | 306 | return data 307 | 308 | def __len__(self) -> int: 309 | return len(self.data_dict['input_ids']) 310 | 311 | 312 | class Collator: 313 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer): 314 | self.max_seq_len = max_seq_len 315 | self.tokenizer = tokenizer 316 | 317 | def 
pad_and_truncate(self, input_ids_list, token_type_ids_list, 318 | attention_mask_list, labels_list, level1_labels_list, max_seq_len): 319 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) 320 | token_type_ids = torch.zeros_like(input_ids) 321 | attention_mask = torch.zeros_like(input_ids) 322 | for i in range(len(input_ids_list)): 323 | seq_len = len(input_ids_list[i]) 324 | if seq_len <= max_seq_len: 325 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) 326 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) 327 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) 328 | else: 329 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], 330 | dtype=torch.long) 331 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) 332 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) 333 | 334 | labels = torch.tensor(labels_list, dtype=torch.long) 335 | level1_labels = torch.tensor(level1_labels_list, dtype=torch.long) 336 | return input_ids, token_type_ids, attention_mask, labels, level1_labels 337 | 338 | def __call__(self, examples: list) -> dict: 339 | input_ids_list, token_type_ids_list, attention_mask_list, labels_list, level1_labels_list = list(zip(*examples)) 340 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) 341 | max_seq_len = min(cur_max_seq_len, self.max_seq_len) 342 | 343 | input_ids, token_type_ids, attention_mask, labels, level1_labels = \ 344 | self.pad_and_truncate(input_ids_list, token_type_ids_list, attention_mask_list, 345 | labels_list, level1_labels_list, max_seq_len) 346 | 347 | data_dict = { 348 | 'input_ids': input_ids, 349 | 'token_type_ids': token_type_ids, 350 | 'attention_mask': attention_mask, 351 | 'labels': labels, 352 | 'level1_labels': level1_labels 353 | } 354 | 355 | return data_dict 356 | 357 | 358 | def load_data(args, tokenizer): 359 | cache_pkl_path = os.path.join(args.data_cache_path, 'train.pkl') 360 | 361 | with open(cache_pkl_path, 'rb') as f: 362 | train_data = pickle.load(f) 363 | 364 | collate_fn = Collator(args.max_seq_len, tokenizer) 365 | train_dataset = DGDataset(train_data, tokenizer) 366 | train_dataloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, 367 | num_workers=0, collate_fn=collate_fn) 368 | return train_dataloader 369 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 317 | -------------------------------------------------------------------------------- /data/code/util/pretrain_utils/trainer_args.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import json 4 | import os 5 | from dataclasses import asdict, dataclass, field 6 | from enum import Enum 7 | from typing import Any, Dict, List, Optional 8 | 9 | from transformers.file_utils import ( 10 | cached_property, 11 | is_torch_available, 12 | is_torch_tpu_available, 13 | torch_required, 14 | ) 15 | from transformers.trainer_utils import EvaluationStrategy, SchedulerType 16 | from transformers.utils import logging 17 | 18 | 19 | if is_torch_available(): 20 | import torch 21 | 22 | if is_torch_tpu_available(): 23 | import torch_xla.core.xla_model as xm 24 | 25 | 26 | 
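Taken together, the helpers in `data/code/util/tools/finetune_tools.py` above (seeding, `build_optimizer` with a Lookahead-wrapped AdamW and linear warmup, `EMA`, and the `Collator`-based dataloader) are usually wired up roughly as in the sketch below. This is not the repository's actual `run_classify.py`; the `args` fields it touches (`num_epochs`, `ema_decay`, ...) are assumptions, and it presumes the classification model returns the loss as its first output. Note also that `build_optimizer` tests `"bert" in items` against the whole `(name, parameter)` tuple yielded by `named_parameters()`, which is tuple membership rather than a substring match on the parameter name; matching on `items[0]` is presumably the intent.

```python
def finetune(args):
    # hedged sketch of a fine-tuning loop; evaluation, metrics and checkpoint selection omitted
    seed_everything(args.seed)
    tokenizer, model = build_model_and_tokenizer(args)

    read_data(args, tokenizer)                           # tokenize once and cache train.pkl
    train_dataloader = load_data(args, tokenizer)

    total_steps = args.num_epochs * len(train_dataloader)
    optimizer, scheduler = build_optimizer(args, model, total_steps)
    ema = EMA(model, decay=args.ema_decay)               # assumed field; shadow weights for saving

    model.train()
    for _ in range(args.num_epochs):
        for batch in train_dataloader:
            batch = batch2cuda(args, batch)
            loss = model(**batch)[0]                     # assumes loss is the first model output
            loss.backward()
            optimizer.step()                             # Lookahead-wrapped AdamW
            scheduler.step()                             # linear warmup/decay on the inner optimizer
            model.zero_grad()
            ema.update()

    ema.apply_shadow()                                   # export the EMA weights
    save_model(args, model, tokenizer)
    ema.restore()
```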
logger = logging.get_logger(__name__) 27 | 28 | 29 | def default_logdir() -> str: 30 | """ 31 | Same default as PyTorch 32 | """ 33 | import socket 34 | from datetime import datetime 35 | 36 | current_time = datetime.now().strftime("%b%d_%H-%M-%S") 37 | return os.path.join("runs", current_time + "_" + socket.gethostname()) 38 | 39 | 40 | @dataclass 41 | class TrainingArguments: 42 | """ 43 | TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop 44 | itself**. 45 | 46 | Using :class:`~transformers.HfArgumentParser` we can turn this class into `argparse 47 | `__ arguments that can be specified on the command 48 | line. 49 | 50 | 51 | 52 | 53 | Parameters: 54 | output_dir (:obj:`str`): 55 | The output directory where the model predictions and checkpoints will be written. 56 | overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`): 57 | If :obj:`True`, overwrite the content of the output directory. Use this to continue training if 58 | :obj:`output_dir` points to a checkpoint directory. 59 | do_train (:obj:`bool`, `optional`, defaults to :obj:`False`): 60 | Whether to run training or not. This argument is not directly used by :class:`~transformers.Trainer`, it's 61 | intended to be used by your training/evaluation scripts instead. See the `example scripts 62 | `__ for more details. 63 | do_eval (:obj:`bool`, `optional`): 64 | Whether to run evaluation on the validation set or not. Will be set to :obj:`True` if 65 | :obj:`evaluation_strategy` is different from :obj:`"no"`. This argument is not directly used by 66 | :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See 67 | the `example scripts `__ for more 68 | details. 69 | do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`): 70 | Whether to run predictions on the test set or not. This argument is not directly used by 71 | :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See 72 | the `example scripts `__ for more 73 | details. 74 | evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`): 75 | The evaluation strategy to adopt during training. Possible values are: 76 | 77 | * :obj:`"no"`: No evaluation is done during training. 78 | * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`. 79 | * :obj:`"epoch"`: Evaluation is done at the end of each epoch. 80 | 81 | prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`): 82 | When performing evaluation and generating predictions, only returns the loss. 83 | per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8): 84 | The batch size per GPU/TPU core/CPU for training. 85 | per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8): 86 | The batch size per GPU/TPU core/CPU for evaluation. 87 | gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1): 88 | Number of updates steps to accumulate the gradients for, before performing a backward/update pass. 89 | 90 | .. warning:: 91 | 92 | When using gradient accumulation, one step is counted as one step with backward pass. Therefore, 93 | logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training 94 | examples. 95 | eval_accumulation_steps (:obj:`int`, `optional`): 96 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. 
If 97 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but 98 | requires more memory). 99 | learning_rate (:obj:`float`, `optional`, defaults to 5e-5): 100 | The initial learning rate for :class:`~transformers.AdamW` optimizer. 101 | weight_decay (:obj:`float`, `optional`, defaults to 0): 102 | The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in 103 | :class:`~transformers.AdamW` optimizer. 104 | adam_beta1 (:obj:`float`, `optional`, defaults to 0.9): 105 | The beta1 hyperparameter for the :class:`~transformers.AdamW` optimizer. 106 | adam_beta2 (:obj:`float`, `optional`, defaults to 0.999): 107 | The beta2 hyperparameter for the :class:`~transformers.AdamW` optimizer. 108 | adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8): 109 | The epsilon hyperparameter for the :class:`~transformers.AdamW` optimizer. 110 | max_grad_norm (:obj:`float`, `optional`, defaults to 1.0): 111 | Maximum gradient norm (for gradient clipping). 112 | num_train_epochs(:obj:`float`, `optional`, defaults to 3.0): 113 | Total number of training epochs to perform (if not an integer, will perform the decimal part percents of 114 | the last epoch before stopping training). 115 | max_steps (:obj:`int`, `optional`, defaults to -1): 116 | If set to a positive number, the total number of training steps to perform. Overrides 117 | :obj:`num_train_epochs`. 118 | lr_scheduler_type (:obj:`str` or :class:`~transformers.SchedulerType`, `optional`, defaults to :obj:`"linear"`): 119 | The scheduler type to use. See the documentation of :class:`~transformers.SchedulerType` for all possible 120 | values. 121 | warmup_steps (:obj:`int`, `optional`, defaults to 0): 122 | Number of steps used for a linear warmup from 0 to :obj:`learning_rate`. 123 | logging_dir (:obj:`str`, `optional`): 124 | `TensorBoard `__ log directory. Will default to 125 | `runs/**CURRENT_DATETIME_HOSTNAME**`. 126 | logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`): 127 | Whether to log and evaluate the first :obj:`global_step` or not. 128 | logging_steps (:obj:`int`, `optional`, defaults to 500): 129 | Number of update steps between two logs. 130 | save_steps (:obj:`int`, `optional`, defaults to 500): 131 | Number of updates steps before two checkpoint saves. 132 | save_total_limit (:obj:`int`, `optional`): 133 | If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in 134 | :obj:`output_dir`. 135 | no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`): 136 | Whether to not use CUDA even when it is available or not. 137 | seed (:obj:`int`, `optional`, defaults to 42): 138 | Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the 139 | :func:`~transformers.Trainer.model_init` function to instantiate the model if it has some randomly 140 | initialized parameters. 141 | fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`): 142 | Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training. 143 | fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'): 144 | For :obj:`fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details 145 | on the `Apex documentation `__. 146 | fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`): 147 | The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or 148 | :obj:`"apex"`. 
:obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the 149 | other choices will force the requested backend. 150 | local_rank (:obj:`int`, `optional`, defaults to -1): 151 | Rank of the process during distributed training. 152 | tpu_num_cores (:obj:`int`, `optional`): 153 | When training on TPU, the number of TPU cores (automatically passed by launcher script). 154 | debug (:obj:`bool`, `optional`, defaults to :obj:`False`): 155 | When training on TPU, whether to print debug metrics or not. 156 | dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`): 157 | Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) 158 | or not. 159 | eval_steps (:obj:`int`, `optional`): 160 | Number of update steps between two evaluations if :obj:`evaluation_strategy="steps"`. Will default to the 161 | same value as :obj:`logging_steps` if not set. 162 | dataloader_num_workers (:obj:`int`, `optional`, defaults to 0): 163 | Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the 164 | main process. 165 | past_index (:obj:`int`, `optional`, defaults to -1): 166 | Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc`XLNet <../model_doc/xlnet>` can 167 | make use of the past hidden states for their predictions. If this argument is set to a positive int, the 168 | ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model 169 | at the next training step under the keyword argument ``mems``. 170 | run_name (:obj:`str`, `optional`): 171 | A descriptor for the run. Typically used for `wandb `_ logging. 172 | disable_tqdm (:obj:`bool`, `optional`): 173 | Whether or not to disable the tqdm progress bars and table of metrics produced by 174 | :class:`~transformers.notebook.NotebookTrainingTracker` in Jupyter Notebooks. Will default to :obj:`True` 175 | if the logging level is set to warn or lower (default), :obj:`False` otherwise. 176 | remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`): 177 | If using :obj:`datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the 178 | model forward method. 179 | 180 | (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.) 181 | label_names (:obj:`List[str]`, `optional`): 182 | The list of keys in your dictionary of inputs that correspond to the labels. 183 | 184 | Will eventually default to :obj:`["labels"]` except if the model used is one of the 185 | :obj:`XxxForQuestionAnswering` in which case it will default to :obj:`["start_positions", 186 | "end_positions"]`. 187 | load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`): 188 | Whether or not to load the best model found during training at the end of training. 189 | 190 | .. note:: 191 | 192 | When set to :obj:`True`, the parameters :obj:`save_steps` will be ignored and the model will be saved 193 | after each evaluation. 194 | metric_for_best_model (:obj:`str`, `optional`): 195 | Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different 196 | models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`. 197 | Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation 198 | loss). 
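As a concrete aside to the arguments documented above, the way per-device batch size, gradient accumulation, `num_train_epochs`/`max_steps` and warmup interact is easiest to see with numbers; everything below is illustrative and not taken from this repository's configuration.

```python
import math

num_examples = 14_009                 # hypothetical training-set size
per_device_bs, n_gpu = 8, 1
grad_accum_steps = 4
num_train_epochs = 3.0
warmup_ratio = 0.1                    # warmup_steps could equally be given as an absolute count

samples_per_update = per_device_bs * n_gpu * grad_accum_steps        # effective batch size per optimizer step
updates_per_epoch = math.ceil(num_examples / samples_per_update)
t_total = int(updates_per_epoch * num_train_epochs)                  # what a positive max_steps would override
warmup_steps = int(t_total * warmup_ratio)                           # length of the linear warmup phase
print(samples_per_update, updates_per_epoch, t_total, warmup_steps)  # 32 438 1314 131
```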
199 | 200 | If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to 201 | :obj:`False` if your metric is better when lower. 202 | greater_is_better (:obj:`bool`, `optional`): 203 | Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better 204 | models should have a greater metric or not. Will default to: 205 | 206 | - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or 207 | :obj:`"eval_loss"`. 208 | - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`. 209 | ignore_skip_data (:obj:`bool`, `optional`, defaults to :obj:`False`): 210 | When resuming training, whether or not to skip the epochs and batches to get the data loading at the same 211 | stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping 212 | step can take a long time) but will not yield the same results as the interrupted training would have. 213 | sharded_ddp (:obj:`bool`, `optional`, defaults to :obj:`False`): 214 | Use Sharded DDP training from `FairScale `__ (in distributed 215 | training only). This is an experimental feature. 216 | deepspeed (:obj:`str`, `optional`): 217 | Use `Deepspeed `__. This is an experimental feature and its API may 218 | evolve in the future. The value is the location of its json config file (usually ``ds_config.json``). 219 | label_smoothing_factor (:obj:`float`, `optional`, defaults to 0.0): 220 | The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded 221 | labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 - 222 | label_smoothing_factor + label_smoothing_factor/num_labels` respectively. 223 | adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`): 224 | Whether or not to use the :class:`~transformers.Adafactor` optimizer instead of 225 | :class:`~transformers.AdamW`. 226 | group_by_length (:obj:`bool`, `optional`, defaults to :obj:`False`): 227 | Whether or not to group together samples of roughly the same legnth in the training dataset (to minimize 228 | padding applied and be more efficient). Only useful if applying dynamic padding. 229 | report_to (:obj:`List[str]`, `optional`, defaults to the list of integrations platforms installed): 230 | The list of integrations to report the results and logs to. Supported platforms are :obj:`"azure_ml"`, 231 | :obj:`"comet_ml"`, :obj:`"mlflow"`, :obj:`"tensorboard"` and :obj:`"wandb"`. 232 | ddp_find_unused_parameters (:obj:`bool`, `optional`): 233 | When using distributed training, the value of the flag :obj:`find_unused_parameters` passed to 234 | :obj:`DistributedDataParallel`. Will default to :obj:`False` if gradient checkpointing is used, :obj:`True` 235 | otherwise. 236 | dataloader_pin_memory (:obj:`bool`, `optional`, defaults to :obj:`True`)): 237 | Whether you want to pin memory in data loaders or not. Will default to :obj:`True`. 238 | """ 239 | 240 | output_dir: Optional[str] = field( 241 | default=None, 242 | metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, 243 | ) 244 | overwrite_output_dir: bool = field( 245 | default=False, 246 | metadata={ 247 | "help": ( 248 | "Overwrite the content of the output directory." 249 | "Use this to continue training if output_dir points to a checkpoint directory." 
250 | ) 251 | }, 252 | ) 253 | 254 | do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) 255 | do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."}) 256 | do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) 257 | evaluation_strategy: EvaluationStrategy = field( 258 | default="no", 259 | metadata={"help": "The evaluation strategy to use."}, 260 | ) 261 | prediction_loss_only: bool = field( 262 | default=False, 263 | metadata={"help": "When performing evaluation and predictions, only returns the loss."}, 264 | ) 265 | 266 | per_device_train_batch_size: int = field( 267 | default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."} 268 | ) 269 | per_device_eval_batch_size: int = field( 270 | default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."} 271 | ) 272 | 273 | per_gpu_train_batch_size: Optional[int] = field( 274 | default=None, 275 | metadata={ 276 | "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. " 277 | "Batch size per GPU/TPU core/CPU for training." 278 | }, 279 | ) 280 | per_gpu_eval_batch_size: Optional[int] = field( 281 | default=None, 282 | metadata={ 283 | "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred." 284 | "Batch size per GPU/TPU core/CPU for evaluation." 285 | }, 286 | ) 287 | 288 | gradient_accumulation_steps: int = field( 289 | default=1, 290 | metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, 291 | ) 292 | eval_accumulation_steps: Optional[int] = field( 293 | default=None, 294 | metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."}, 295 | ) 296 | 297 | learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) 298 | weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."}) 299 | adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) 300 | adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) 301 | adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) 302 | max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) 303 | 304 | num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."}) 305 | max_steps: int = field( 306 | default=-1, 307 | metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, 308 | ) 309 | lr_scheduler_type: SchedulerType = field( 310 | default="linear", 311 | metadata={"help": "The scheduler type to use."}, 312 | ) 313 | warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) 314 | 315 | logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."}) 316 | logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) 317 | logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) 318 | save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) 319 | save_total_limit: Optional[int] = field( 320 | default=None, 321 | metadata={ 322 | "help": ( 323 | "Limit the total amount of checkpoints." 
324 | "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" 325 | ) 326 | }, 327 | ) 328 | no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) 329 | seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) 330 | 331 | fp16: bool = field( 332 | default=False, 333 | metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA Apex) instead of 32-bit"}, 334 | ) 335 | fp16_opt_level: str = field( 336 | default="O1", 337 | metadata={ 338 | "help": ( 339 | "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 340 | "See details at https://nvidia.github.io/apex/amp.html" 341 | ) 342 | }, 343 | ) 344 | fp16_backend: str = field( 345 | default="auto", 346 | metadata={"help": "The backend to be used for mixed precision.", "choices": ["auto", "amp", "apex"]}, 347 | ) 348 | local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) 349 | 350 | tpu_num_cores: Optional[int] = field( 351 | default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"} 352 | ) 353 | tpu_metrics_debug: bool = field( 354 | default=False, 355 | metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"}, 356 | ) 357 | debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"}) 358 | 359 | dataloader_drop_last: bool = field( 360 | default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} 361 | ) 362 | eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."}) 363 | dataloader_num_workers: int = field( 364 | default=0, 365 | metadata={ 366 | "help": "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process." 367 | }, 368 | ) 369 | 370 | past_index: int = field( 371 | default=-1, 372 | metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."}, 373 | ) 374 | 375 | run_name: Optional[str] = field( 376 | default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."} 377 | ) 378 | disable_tqdm: Optional[bool] = field( 379 | default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."} 380 | ) 381 | 382 | remove_unused_columns: Optional[bool] = field( 383 | default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} 384 | ) 385 | label_names: Optional[List[str]] = field( 386 | default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."} 387 | ) 388 | 389 | load_best_model_at_end: Optional[bool] = field( 390 | default=False, 391 | metadata={"help": "Whether or not to load the best model found during training at the end of training."}, 392 | ) 393 | metric_for_best_model: Optional[str] = field( 394 | default=None, metadata={"help": "The metric to use to compare two different models."} 395 | ) 396 | greater_is_better: Optional[bool] = field( 397 | default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."} 398 | ) 399 | ignore_data_skip: bool = field( 400 | default=False, 401 | metadata={ 402 | "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data." 
403 | }, 404 | ) 405 | sharded_ddp: bool = field( 406 | default=False, 407 | metadata={"help": "Whether or not to use sharded DDP training (in distributed training only)."}, 408 | ) 409 | deepspeed: Optional[str] = field( 410 | default=None, 411 | metadata={"help": "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json)"}, 412 | ) 413 | label_smoothing_factor: float = field( 414 | default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."} 415 | ) 416 | adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."}) 417 | group_by_length: bool = field( 418 | default=False, 419 | metadata={"help": "Whether or not to group samples of roughly the same length together when batching."}, 420 | ) 421 | report_to: Optional[List[str]] = field( 422 | default=None, metadata={"help": "The list of integrations to report the results and logs to."} 423 | ) 424 | ddp_find_unused_parameters: Optional[bool] = field( 425 | default=None, 426 | metadata={ 427 | "help": "When using distributed training, the value of the flag `find_unused_parameters` passed to " 428 | "`DistributedDataParallel`." 429 | }, 430 | ) 431 | dataloader_pin_memory: bool = field( 432 | default=True, metadata={"help": "Whether or not to pin memory for DataLoader."} 433 | ) 434 | _n_gpu: int = field(init=False, repr=False, default=-1) 435 | 436 | def __post_init__(self): 437 | if self.output_dir is None and os.getenv("SM_OUTPUT_DATA_DIR") is None: 438 | raise ValueError( 439 | "`output_dir` is only optional if it can get inferred from the environment. Please set a value for " 440 | "`output_dir`." 441 | ) 442 | elif os.getenv("SM_OUTPUT_DATA_DIR") is not None: 443 | if self.output_dir is not None: 444 | logger.warn( 445 | "`output_dir` is overwritten by the env variable 'SM_OUTPUT_DATA_DIR' " 446 | f"({os.getenv('SM_OUTPUT_DATA_DIR')})." 447 | ) 448 | self.output_dir = os.getenv("SM_OUTPUT_DATA_DIR") 449 | if self.disable_tqdm is None: 450 | self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN 451 | self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy) 452 | self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) 453 | if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO: 454 | self.do_eval = True 455 | if self.eval_steps is None: 456 | self.eval_steps = self.logging_steps 457 | 458 | if self.load_best_model_at_end and self.metric_for_best_model is None: 459 | self.metric_for_best_model = "loss" 460 | if self.greater_is_better is None and self.metric_for_best_model is not None: 461 | self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"] 462 | if self.run_name is None: 463 | self.run_name = self.output_dir 464 | 465 | if is_torch_available() and self.device.type != "cuda" and self.fp16: 466 | raise ValueError("Mixed precision training with AMP or APEX (`--fp16`) can only be used on CUDA devices.") 467 | if self.report_to is None: 468 | # Import at runtime to avoid a circular import. 469 | from transformers.integrations import get_available_reporting_integrations 470 | 471 | self.report_to = get_available_reporting_integrations() 472 | 473 | def __repr__(self): 474 | # We override the default repr to remove deprecated arguments from the repr. This method should be removed once 475 | # those deprecated arguments are removed form TrainingArguments. 
(TODO: v5) 476 | self_as_dict = asdict(self) 477 | del self_as_dict["per_gpu_train_batch_size"] 478 | del self_as_dict["per_gpu_eval_batch_size"] 479 | attrs_as_str = [f"{k}={v}" for k, v in self_as_dict.items()] 480 | return f"{self.__class__.__name__}({', '.join(attrs_as_str)})" 481 | 482 | @property 483 | def train_batch_size(self) -> int: 484 | """ 485 | The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training). 486 | """ 487 | if self.per_gpu_train_batch_size: 488 | logger.warning( 489 | "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " 490 | "version. Using `--per_device_train_batch_size` is preferred." 491 | ) 492 | per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size 493 | train_batch_size = per_device_batch_size * max(1, self.n_gpu) 494 | return train_batch_size 495 | 496 | @property 497 | def eval_batch_size(self) -> int: 498 | """ 499 | The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training). 500 | """ 501 | if self.per_gpu_eval_batch_size: 502 | logger.warning( 503 | "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " 504 | "version. Using `--per_device_eval_batch_size` is preferred." 505 | ) 506 | per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size 507 | eval_batch_size = per_device_batch_size * max(1, self.n_gpu) 508 | return eval_batch_size 509 | 510 | @cached_property 511 | @torch_required 512 | def _setup_devices(self) -> "torch.device": 513 | logger.info("PyTorch: setting up devices") 514 | if self.no_cuda: 515 | device = torch.device("cpu") 516 | self._n_gpu = 0 517 | elif is_torch_tpu_available(): 518 | device = xm.xla_device() 519 | self._n_gpu = 0 520 | elif self.deepspeed: 521 | # deepspeed performs its own DDP internally, and requires the program to be started with: 522 | # deepspeed ./program.py 523 | # rather than: 524 | # python -m torch.distributed.launch --nproc_per_node=2 ./program.py 525 | from transformers.integrations import is_deepspeed_available 526 | 527 | if not is_deepspeed_available(): 528 | raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.") 529 | import deepspeed 530 | 531 | deepspeed.init_distributed() 532 | device = torch.device("cuda", self.local_rank) 533 | self._n_gpu = 1 534 | elif self.local_rank == -1: 535 | # if n_gpu is > 1 we'll use nn.DataParallel. 536 | # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` 537 | # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will 538 | # trigger an error that a device index is missing. Index 0 takes into account the 539 | # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` 540 | # will use the first GPU in that env, i.e. GPU#1 541 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 542 | # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at 543 | # the default value. 544 | self._n_gpu = torch.cuda.device_count() 545 | else: 546 | # Here, we'll use torch.distributed. 
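One consequence of the `train_batch_size`/`eval_batch_size` properties above is that the deprecated `per_gpu_*` arguments, when set, still take precedence over the `per_device_*` ones, with only a logged warning. A quick check, assuming the repository root is on `sys.path`:

```python
from data.code.util.pretrain_utils.trainer_args import TrainingArguments

args = TrainingArguments(output_dir="tmp",                 # placeholder output directory
                         per_device_train_batch_size=16,
                         per_gpu_train_batch_size=8)
# the deprecated per_gpu value wins and a deprecation warning is logged
print(args.train_batch_size)                               # 8 * max(1, n_gpu)
```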
547 | # Initializes the distributed backend which will take care of synchronizing nodes/GPUs 548 | torch.distributed.init_process_group(backend="nccl") 549 | device = torch.device("cuda", self.local_rank) 550 | self._n_gpu = 1 551 | 552 | if device.type == "cuda": 553 | torch.cuda.set_device(device) 554 | 555 | return device 556 | 557 | @property 558 | @torch_required 559 | def device(self) -> "torch.device": 560 | """ 561 | The device used by this process. 562 | """ 563 | return self._setup_devices 564 | 565 | @property 566 | @torch_required 567 | def n_gpu(self): 568 | """ 569 | The number of GPUs used by this process. 570 | 571 | Note: 572 | This will only be greater than one when you have multiple GPUs available but are not using distributed 573 | training. For distributed training, it will always be 1. 574 | """ 575 | # Make sure `self._n_gpu` is properly setup. 576 | _ = self._setup_devices 577 | return self._n_gpu 578 | 579 | @property 580 | @torch_required 581 | def parallel_mode(self): 582 | """ 583 | The current mode used for parallelism if multiple GPUs/TPU cores are available. One of: 584 | 585 | - :obj:`ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU). 586 | - :obj:`ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses :obj:`torch.nn.DataParallel`). 587 | - :obj:`ParallelMode.DISTRIBUTED`: several GPUs, each ahving its own process (uses 588 | :obj:`torch.nn.DistributedDataParallel`). 589 | - :obj:`ParallelMode.TPU`: several TPU cores. 590 | """ 591 | if is_torch_tpu_available(): 592 | return ParallelMode.TPU 593 | elif self.local_rank != -1: 594 | return ParallelMode.DISTRIBUTED 595 | elif self.n_gpu > 1: 596 | return ParallelMode.NOT_DISTRIBUTED 597 | else: 598 | return ParallelMode.NOT_PARALLEL 599 | 600 | def to_dict(self): 601 | """ 602 | Serializes this instance while replace `Enum` by their values (for JSON serialization support). 603 | """ 604 | d = asdict(self) 605 | for k, v in d.items(): 606 | if isinstance(v, Enum): 607 | d[k] = v.value 608 | return d 609 | 610 | def to_json_string(self): 611 | """ 612 | Serializes this instance to a JSON string. 
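A minimal sketch of instantiating this local `TrainingArguments` copy directly and reading the derived properties shown above; the field values are illustrative and the import assumes the repository root is on `sys.path`:

```python
from data.code.util.pretrain_utils.trainer_args import TrainingArguments

args = TrainingArguments(
    output_dir="output/pretrain",          # placeholder path
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=3.0,
    logging_steps=100,
)
print(args.device, args.n_gpu, args.parallel_mode)   # resolved by _setup_devices above
print(args.train_batch_size)                         # per-device size scaled by max(1, n_gpu)
print(args.to_json_string())                         # JSON view of the (non-deprecated) fields
```

In the pre-training scripts these arguments would normally be populated from the command line via `transformers.HfArgumentParser`, as the class docstring notes.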
613 | """ 614 | return json.dumps(self.to_dict(), indent=2) 615 | 616 | def to_sanitized_dict(self) -> Dict[str, Any]: 617 | """ 618 | Sanitized serialization to use with TensorBoard’s hparams 619 | """ 620 | d = self.to_dict() 621 | d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}} 622 | 623 | valid_types = [bool, int, float, str] 624 | if is_torch_available(): 625 | valid_types.append(torch.Tensor) 626 | 627 | return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} 628 | 629 | 630 | class ParallelMode(Enum): 631 | NOT_PARALLEL = "not_parallel" 632 | NOT_DISTRIBUTED = "not_distributed" 633 | DISTRIBUTED = "distributed" 634 | SAGEMAKER_DISTRIBUTED = "sm_distributed" 635 | TPU = "tpu" 636 | -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/modeling.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import logging 4 | import torch 5 | 6 | from torch import nn 7 | from torch.nn import CrossEntropyLoss, MSELoss 8 | 9 | from .configuration import NeZhaConfig 10 | from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward 11 | from transformers.modeling_utils import PreTrainedModel, prune_linear_layer 12 | from transformers.models.bert.modeling_bert import ( 13 | BertOutput, 14 | BertPooler, 15 | BertSelfOutput, 16 | BertIntermediate, 17 | BertOnlyMLMHead, 18 | BertOnlyNSPHead, 19 | BertLMPredictionHead, 20 | BERT_START_DOCSTRING, 21 | BERT_INPUTS_DOCSTRING, 22 | ) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | _CONFIG_FOR_DOC = "NeZhaConfig" 27 | _TOKENIZER_FOR_DOC = "NeZhaTokenizer" 28 | 29 | NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] 30 | NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} 31 | 32 | 33 | def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): 34 | """Load tf checkpoints in a pytorch model.""" 35 | try: 36 | import re 37 | import numpy as np 38 | import tensorflow as tf 39 | except ImportError: 40 | logger.error( 41 | "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " 42 | "https://www.tensorflow.org/install/ for installation instructions." 
43 | ) 44 | raise 45 | 46 | tf_path = os.path.abspath(tf_checkpoint_path) 47 | logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) 48 | # Load weights from TF model 49 | init_vars = tf.train.list_variables(tf_path) 50 | names = [] 51 | arrays = [] 52 | for name, shape in init_vars: 53 | # logger.info("Loading TF weight {} with shape {}".format(name, shape)) 54 | array = tf.train.load_variable(tf_path, name) 55 | names.append(name) 56 | arrays.append(array) 57 | 58 | for name, array in zip(names, arrays): 59 | name = name.split("/") 60 | # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v 61 | # which are not required for using pretrained model 62 | if any( 63 | n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", 64 | "global_step", "good_steps", "loss_scale", 'bad_steps'] 65 | for n in name 66 | ): 67 | logger.info("Skipping {}".format("/".join(name))) 68 | continue 69 | pointer = model 70 | for m_name in name: 71 | if re.fullmatch(r"[A-Za-z]+_\d+", m_name): 72 | scope_names = re.split(r"_(\d+)", m_name) 73 | else: 74 | scope_names = [m_name] 75 | if scope_names[0] == "kernel" or scope_names[0] == "gamma": 76 | pointer = getattr(pointer, "weight") 77 | elif scope_names[0] == "output_bias" or scope_names[0] == "beta": 78 | pointer = getattr(pointer, "bias") 79 | elif scope_names[0] == "output_weights": 80 | pointer = getattr(pointer, "weight") 81 | elif scope_names[0] == "squad": 82 | pointer = getattr(pointer, "classifier") 83 | else: 84 | try: 85 | pointer = getattr(pointer, scope_names[0]) 86 | except AttributeError: 87 | logger.info("Skipping {}".format("/".join(name))) 88 | continue 89 | if len(scope_names) >= 2: 90 | num = int(scope_names[1]) 91 | pointer = pointer[num] 92 | if m_name[-11:] == "_embeddings": 93 | pointer = getattr(pointer, "weight") 94 | elif m_name == "kernel": 95 | array = np.transpose(array) 96 | try: 97 | assert ( 98 | pointer.shape == array.shape 99 | ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" 100 | except AssertionError as e: 101 | e.args += (pointer.shape, array.shape) 102 | raise 103 | logger.info("Initialize PyTorch weight {}".format(name)) 104 | pointer.data = torch.from_numpy(array) 105 | return model 106 | 107 | 108 | class NeZhaEmbeddings(nn.Module): 109 | """ 110 | Construct the embeddings from word, position and token_type embeddings. 
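`load_tf_weights_in_nezha` above maps a TensorFlow NEZHA checkpoint onto the PyTorch module tree. A hedged sketch of a one-off conversion; the file names are placeholders, `NeZhaModel` is defined further down in this file, and `NeZhaConfig` is assumed to behave like a standard `transformers` configuration class:

```python
import torch

from data.code.util.modeling.modeling_nezha.configuration import NeZhaConfig
from data.code.util.modeling.modeling_nezha.modeling import NeZhaModel, load_tf_weights_in_nezha

config = NeZhaConfig.from_json_file("nezha-tf/bert_config.json")   # placeholder path
model = NeZhaModel(config)
load_tf_weights_in_nezha(model, config, "nezha-tf/model.ckpt")     # TF checkpoint prefix (placeholder)
torch.save(model.state_dict(), "nezha-pytorch/pytorch_model.bin")  # placeholder output path
```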
111 | """ 112 | 113 | def __init__(self, config): 114 | super().__init__() 115 | self.use_relative_position = config.use_relative_position 116 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) 117 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) 118 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load 119 | # any TensorFlow checkpoint file 120 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 121 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 122 | 123 | def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): 124 | if input_ids is not None: 125 | input_shape = input_ids.size() 126 | else: 127 | input_shape = inputs_embeds.size()[:-1] 128 | device = input_ids.device if input_ids is not None else inputs_embeds.device 129 | if token_type_ids is None: 130 | token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) 131 | if inputs_embeds is None: 132 | inputs_embeds = self.word_embeddings(input_ids) 133 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 134 | 135 | embeddings = inputs_embeds + token_type_embeddings 136 | 137 | # embeddings = inputs_embeds + token_type_embeddings 138 | embeddings = self.LayerNorm(embeddings) 139 | embeddings = self.dropout(embeddings) 140 | return embeddings 141 | 142 | 143 | def relative_position_encoding(depth, max_length=512, max_relative_position=127): 144 | vocab_size = max_relative_position * 2 + 1 145 | range_vec = torch.arange(max_length) 146 | range_mat = range_vec.repeat(max_length).view(max_length, max_length) 147 | distance_mat = range_mat - torch.t(range_mat) 148 | distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) 149 | final_mat = distance_mat_clipped + max_relative_position 150 | 151 | embeddings_table = torch.zeros(vocab_size, depth) 152 | position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) 153 | div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) 154 | embeddings_table[:, 0::2] = torch.sin(position * div_term) 155 | embeddings_table[:, 1::2] = torch.cos(position * div_term) 156 | embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) 157 | 158 | flat_relative_positions_matrix = final_mat.view(-1) 159 | one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, 160 | num_classes=vocab_size).float() 161 | positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) 162 | my_shape = list(final_mat.size()) 163 | my_shape.append(depth) 164 | positions_encoding = positions_encoding.view(my_shape) 165 | return positions_encoding 166 | 167 | 168 | class NeZhaSelfAttention(nn.Module): 169 | def __init__(self, config): 170 | super().__init__() 171 | if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): 172 | raise ValueError( 173 | "The hidden size (%d) is not a multiple of the number of attention " 174 | "heads (%d)" % (config.hidden_size, config.num_attention_heads) 175 | ) 176 | self.output_attentions = config.output_attentions 177 | 178 | self.num_attention_heads = config.num_attention_heads 179 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 180 | self.all_head_size = self.num_attention_heads * self.attention_head_size 181 | 182 | self.query = 
nn.Linear(config.hidden_size, self.all_head_size) 183 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 184 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 185 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 186 | 187 | self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, 188 | depth=self.attention_head_size, 189 | max_relative_position=config.max_relative_position) 190 | 191 | def transpose_for_scores(self, x): 192 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) 193 | x = x.view(*new_x_shape) 194 | return x.permute(0, 2, 1, 3) 195 | 196 | def forward( 197 | self, 198 | hidden_states, 199 | attention_mask=None, 200 | head_mask=None, 201 | encoder_hidden_states=None, 202 | encoder_attention_mask=None, 203 | ): 204 | 205 | mixed_query_layer = self.query(hidden_states) 206 | 207 | # If this is instantiated as a cross-attention module, the keys 208 | # and values come from an encoder; the attention mask needs to be 209 | # such that the encoder's padding tokens are not attended to. 210 | if encoder_hidden_states is not None: 211 | mixed_key_layer = self.key(encoder_hidden_states) 212 | mixed_value_layer = self.value(encoder_hidden_states) 213 | attention_mask = encoder_attention_mask 214 | else: 215 | mixed_key_layer = self.key(hidden_states) 216 | mixed_value_layer = self.value(hidden_states) 217 | 218 | query_layer = self.transpose_for_scores(mixed_query_layer) 219 | key_layer = self.transpose_for_scores(mixed_key_layer) 220 | value_layer = self.transpose_for_scores(mixed_value_layer) 221 | 222 | # Take the dot product between "query" and "key" to get the raw attention scores. 223 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) 224 | 225 | batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() 226 | 227 | relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device) 228 | query_layer_t = query_layer.permute(2, 0, 1, 3) 229 | 230 | query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, 231 | self.attention_head_size) 232 | key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) 233 | key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, 234 | num_attention_heads, from_seq_length) 235 | key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) 236 | attention_scores = attention_scores + key_position_scores_r_t 237 | 238 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 239 | if attention_mask is not None: 240 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 241 | attention_scores = attention_scores + attention_mask 242 | 243 | # Normalize the attention scores to probabilities. 244 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 245 | 246 | # This is actually dropping out entire tokens to attend to, which might 247 | # seem a bit unusual, but is taken from the original Transformer paper. 
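The relative-position bias used above comes from `relative_position_encoding`, whose table depends only on the clipped offset j − i; the self-attention adds it to both the key scores and the value aggregation. A quick shape and behaviour check (import assumes the repository root is on `sys.path`):

```python
import torch

from data.code.util.modeling.modeling_nezha.modeling import relative_position_encoding

# pe[i, j] is the sinusoidal embedding of clamp(j - i, -64, +64)
pe = relative_position_encoding(depth=64, max_length=128, max_relative_position=64)
print(pe.shape)                               # torch.Size([128, 128, 64])
print(torch.allclose(pe[0, 10], pe[5, 15]))   # True: only the relative offset matters
print(torch.allclose(pe[0, 64], pe[0, 127]))  # True: offsets beyond +/-64 are clipped
```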
248 | attention_probs = self.dropout(attention_probs) 249 | 250 | # Mask heads if we want to 251 | if head_mask is not None: 252 | attention_probs = attention_probs * head_mask 253 | 254 | context_layer = torch.matmul(attention_probs, value_layer) 255 | 256 | relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device) 257 | attention_probs_t = attention_probs.permute(2, 0, 1, 3) 258 | attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, 259 | to_seq_length) 260 | value_position_scores = torch.matmul(attentions_probs_r, relations_values) 261 | value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, 262 | num_attention_heads, self.attention_head_size) 263 | value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) 264 | context_layer = context_layer + value_position_scores_r_t 265 | 266 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 267 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 268 | context_layer = context_layer.view(*new_context_layer_shape) 269 | 270 | outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) 271 | return outputs 272 | 273 | 274 | class NeZhaAttention(nn.Module): 275 | def __init__(self, config): 276 | super().__init__() 277 | self.self = NeZhaSelfAttention(config) 278 | self.output = BertSelfOutput(config) 279 | self.pruned_heads = set() 280 | 281 | def prune_heads(self, heads): 282 | if len(heads) == 0: 283 | return 284 | mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) 285 | heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads 286 | for head in heads: 287 | # Compute how many pruned heads are before the head and move the index accordingly 288 | head = head - sum(1 if h < head else 0 for h in self.pruned_heads) 289 | mask[head] = 0 290 | mask = mask.view(-1).contiguous().eq(1) 291 | index = torch.arange(len(mask))[mask].long() 292 | # Prune linear layers 293 | self.self.query = prune_linear_layer(self.self.query, index) 294 | self.self.key = prune_linear_layer(self.self.key, index) 295 | self.self.value = prune_linear_layer(self.self.value, index) 296 | self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) 297 | # Update hyper params and store pruned heads 298 | self.self.num_attention_heads = self.self.num_attention_heads - len(heads) 299 | self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads 300 | self.pruned_heads = self.pruned_heads.union(heads) 301 | 302 | def forward( 303 | self, 304 | hidden_states, 305 | attention_mask=None, 306 | head_mask=None, 307 | encoder_hidden_states=None, 308 | encoder_attention_mask=None, 309 | ): 310 | self_outputs = self.self( 311 | hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask 312 | ) 313 | attention_output = self.output(self_outputs[0], hidden_states) 314 | outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them 315 | return outputs 316 | 317 | 318 | class NeZhaLayer(nn.Module): 319 | def __init__(self, config): 320 | super().__init__() 321 | self.attention = NeZhaAttention(config) 322 | self.is_decoder = config.is_decoder 323 | if self.is_decoder: 324 | self.crossattention = NeZhaAttention(config) 325 | self.intermediate = BertIntermediate(config) 326 | self.output = BertOutput(config) 327 | 328 | def forward( 329 | 
self, 330 | hidden_states, 331 | attention_mask=None, 332 | head_mask=None, 333 | encoder_hidden_states=None, 334 | encoder_attention_mask=None, 335 | ): 336 | self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) 337 | attention_output = self_attention_outputs[0] 338 | outputs = self_attention_outputs[1:] # add self attentions if we output attention weights 339 | 340 | if self.is_decoder and encoder_hidden_states is not None: 341 | cross_attention_outputs = self.crossattention( 342 | attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask 343 | ) 344 | attention_output = cross_attention_outputs[0] 345 | outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights 346 | 347 | intermediate_output = self.intermediate(attention_output) 348 | layer_output = self.output(intermediate_output, attention_output) 349 | outputs = (layer_output,) + outputs 350 | return outputs 351 | 352 | 353 | class NeZhaEncoder(nn.Module): 354 | def __init__(self, config): 355 | super().__init__() 356 | self.output_attentions = config.output_attentions 357 | # self.output_hidden_states = config.output_hidden_states 358 | self.output_hidden_states = True 359 | self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) 360 | 361 | def forward( 362 | self, 363 | hidden_states, 364 | attention_mask=None, 365 | head_mask=None, 366 | encoder_hidden_states=None, 367 | encoder_attention_mask=None, 368 | ): 369 | all_hidden_states = () 370 | all_attentions = () 371 | for i, layer_module in enumerate(self.layer): 372 | if self.output_hidden_states: 373 | all_hidden_states = all_hidden_states + (hidden_states,) 374 | layer_outputs = layer_module( 375 | hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask 376 | ) 377 | hidden_states = layer_outputs[0] 378 | if self.output_attentions: 379 | all_attentions = all_attentions + (layer_outputs[1],) 380 | # Add last layer 381 | if self.output_hidden_states: 382 | all_hidden_states = all_hidden_states + (hidden_states,) 383 | 384 | outputs = (hidden_states,) 385 | if self.output_hidden_states: 386 | outputs = outputs + (all_hidden_states,) 387 | if self.output_attentions: 388 | outputs = outputs + (all_attentions,) 389 | return outputs # last-layer hidden state, (all hidden states), (all attentions) 390 | 391 | 392 | class NeZhaPreTrainedModel(PreTrainedModel): 393 | """ An abstract class to handle weights initialization and 394 | a simple interface for downloading and loading pretrained models. 
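Two details of `NeZhaEncoder` above are easy to miss: `output_hidden_states` is hard-coded to `True`, and `head_mask` is indexed per layer, so callers must pass one entry per layer even when no heads are masked. A minimal smoke test, assuming `NeZhaConfig` accepts the usual BERT-style keyword arguments and provides defaults for the relative-position fields:

```python
import torch

from data.code.util.modeling.modeling_nezha.configuration import NeZhaConfig
from data.code.util.modeling.modeling_nezha.modeling import NeZhaEncoder

# small config purely for a shape check; the keyword names are assumed to match
config = NeZhaConfig(hidden_size=128, num_hidden_layers=2,
                     num_attention_heads=4, intermediate_size=256)
encoder = NeZhaEncoder(config)

hidden = torch.randn(2, 16, config.hidden_size)
outputs = encoder(hidden, attention_mask=None,
                  head_mask=[None] * config.num_hidden_layers)   # one (empty) mask per layer
last_hidden, all_hidden = outputs[0], outputs[1]
print(last_hidden.shape, len(all_hidden))                        # (2, 16, 128), num_hidden_layers + 1
```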
395 | """ 396 | config_class = NeZhaConfig 397 | pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP 398 | load_tf_weights = load_tf_weights_in_nezha 399 | base_model_prefix = "bert" 400 | 401 | def _init_weights(self, module): 402 | """ Initialize the weights """ 403 | if isinstance(module, (nn.Linear, nn.Embedding)): 404 | # Slightly different from the TF version which uses truncated_normal for initialization 405 | # cf https://github.com/pytorch/pytorch/pull/5617 406 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 407 | elif isinstance(module, nn.LayerNorm): 408 | module.bias.data.zero_() 409 | module.weight.data.fill_(1.0) 410 | if isinstance(module, nn.Linear) and module.bias is not None: 411 | module.bias.data.zero_() 412 | 413 | 414 | @add_start_docstrings( 415 | "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", 416 | BERT_START_DOCSTRING, 417 | ) 418 | class NeZhaModel(NeZhaPreTrainedModel): 419 | """ 420 | The model can behave as an encoder (with only self-attention) as well 421 | as a decoder, in which case a layer of cross-attention is added between 422 | the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, 423 | Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. 424 | 425 | To behave as an decoder the model needs to be initialized with the 426 | :obj:`is_decoder` argument of the configuration set to :obj:`True`; an 427 | :obj:`encoder_hidden_states` is expected as an input to the forward pass. 428 | 429 | .. _`Attention is all you need`: 430 | https://arxiv.org/abs/1706.03762 431 | 432 | """ 433 | 434 | def __init__(self, config): 435 | super().__init__(config) 436 | self.config = config 437 | self.embeddings = NeZhaEmbeddings(config) 438 | self.encoder = NeZhaEncoder(config) 439 | self.pooler = BertPooler(config) 440 | self.init_weights() 441 | 442 | def get_input_embeddings(self): 443 | return self.embeddings.word_embeddings 444 | 445 | def set_input_embeddings(self, value): 446 | self.embeddings.word_embeddings = value 447 | 448 | def _prune_heads(self, heads_to_prune): 449 | """ Prunes heads of the model. 450 | heads_to_prune: dict of {layer_num: list of heads to prune in this layer} 451 | See base class PreTrainedModel 452 | """ 453 | for layer, heads in heads_to_prune.items(): 454 | self.encoder.layer[layer].attention.prune_heads(heads) 455 | 456 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 457 | def forward( 458 | self, 459 | input_ids=None, 460 | attention_mask=None, 461 | token_type_ids=None, 462 | head_mask=None, 463 | position_ids=None, 464 | inputs_embeds=None, 465 | encoder_hidden_states=None, 466 | encoder_attention_mask=None, 467 | ): 468 | r""" 469 | Return: 470 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 471 | last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): 472 | Sequence of hidden-states at the output of the last layer of the model. 473 | pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): 474 | Last layer hidden-state of the first token of the sequence (classification token) 475 | further processed by a Linear layer and a Tanh activation function. 
The Linear 476 | layer weights are trained from the next sentence prediction (classification) 477 | objective during pre-training. 478 | 479 | This output is usually *not* a good summary 480 | of the semantic content of the input, you're often better with averaging or pooling 481 | the sequence of hidden-states for the whole input sequence. 482 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 483 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 484 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 485 | 486 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 487 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 488 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 489 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 490 | 491 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 492 | heads. 493 | 494 | Examples:: 495 | 496 | from transformers import BertModel, BertTokenizer 497 | import torch 498 | 499 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 500 | model = BertModel.from_pretrained('bert-base-uncased') 501 | 502 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 503 | outputs = model(input_ids) 504 | 505 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 506 | 507 | """ 508 | 509 | if input_ids is not None and inputs_embeds is not None: 510 | raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") 511 | elif input_ids is not None: 512 | input_shape = input_ids.size() 513 | elif inputs_embeds is not None: 514 | input_shape = inputs_embeds.size()[:-1] 515 | else: 516 | raise ValueError("You have to specify either input_ids or inputs_embeds") 517 | 518 | device = input_ids.device if input_ids is not None else inputs_embeds.device 519 | 520 | if attention_mask is None: 521 | attention_mask = torch.ones(input_shape, device=device) 522 | if token_type_ids is None: 523 | token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) 524 | 525 | # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] 526 | # ourselves in which case we just need to make it broadcastable to all heads. 
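        # (Editor's note, not part of the original file.) Illustrative shape sketch, assuming the
        # standard `get_extended_attention_mask` behaviour inherited from `PreTrainedModel`: a 2D
        # padding mask such as
        #     attention_mask = torch.tensor([[1, 1, 1, 0]])        # (batch_size, seq_length)
        # is expanded to shape (batch_size, 1, 1, seq_length) holding 0.0 for kept positions and a
        # large negative value (e.g. -10000.0) for padded ones, so that it can simply be added to
        # the raw attention scores before the softmax.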
527 | extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( 528 | attention_mask, input_shape, self.device 529 | ) 530 | 531 | # If a 2D or 3D attention mask is provided for the cross-attention 532 | # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length] 533 | if self.config.is_decoder and encoder_hidden_states is not None: 534 | encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() 535 | encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) 536 | if encoder_attention_mask is None: 537 | encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) 538 | encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) 539 | else: 540 | encoder_extended_attention_mask = None 541 | 542 | # Prepare head mask if needed 543 | # 1.0 in head_mask indicates we keep the head 544 | # attention_probs has shape bsz x n_heads x N x N 545 | # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] 546 | # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] 547 | head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) 548 | 549 | embedding_output = self.embeddings( 550 | input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds 551 | ) 552 | encoder_outputs = self.encoder( 553 | embedding_output, 554 | attention_mask=extended_attention_mask, 555 | head_mask=head_mask, 556 | encoder_hidden_states=encoder_hidden_states, 557 | encoder_attention_mask=encoder_extended_attention_mask, 558 | ) 559 | sequence_output = encoder_outputs[0] 560 | pooled_output = self.pooler(sequence_output) 561 | 562 | outputs = (sequence_output, pooled_output,) + encoder_outputs[ 563 | 1: 564 | ] # add hidden_states and attentions if they are here 565 | return outputs # sequence_output, pooled_output, (hidden_states), (attentions) 566 | 567 | 568 | class BertPreTrainingHeads(nn.Module): 569 | def __init__(self, config): 570 | super().__init__() 571 | self.predictions = BertLMPredictionHead(config) 572 | self.seq_relationship = nn.Linear(config.hidden_size, 2) 573 | 574 | def forward(self, sequence_output, pooled_output): 575 | prediction_scores = self.predictions(sequence_output) 576 | seq_relationship_score = self.seq_relationship(pooled_output) 577 | return prediction_scores, seq_relationship_score 578 | 579 | 580 | @add_start_docstrings( 581 | """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and 582 | a `next sentence prediction (classification)` head.
""", 583 | BERT_START_DOCSTRING, 584 | ) 585 | class NeZhaForPreTraining(NeZhaPreTrainedModel): 586 | def __init__(self, config): 587 | super().__init__(config) 588 | self.bert = NeZhaModel(config) 589 | self.cls = BertPreTrainingHeads(config) 590 | self.init_weights() 591 | 592 | def get_output_embeddings(self): 593 | return self.cls.predictions.decoder 594 | 595 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 596 | def forward( 597 | self, 598 | input_ids=None, 599 | attention_mask=None, 600 | token_type_ids=None, 601 | head_mask=None, 602 | position_ids=None, 603 | inputs_embeds=None, 604 | labels=None, 605 | sentence_span_labels=None, 606 | ): 607 | 608 | outputs = self.bert( 609 | input_ids, 610 | attention_mask=attention_mask, 611 | token_type_ids=token_type_ids, 612 | head_mask=head_mask, 613 | inputs_embeds=inputs_embeds, 614 | ) 615 | 616 | sequence_output, pooled_output = outputs[:2] 617 | prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) 618 | # add hidden states and attention if they are here 619 | outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] 620 | 621 | if labels is not None and sentence_span_labels is not None: 622 | loss_fct = CrossEntropyLoss() 623 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) 624 | 625 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), sentence_span_labels.view(-1)) 626 | 627 | pseudo_labels = torch.argmax(torch.softmax(seq_relationship_score, -1), 1) 628 | pseudo_loss = loss_fct(seq_relationship_score.view(-1, 2), pseudo_labels.view(-1)) 629 | next_sentence_loss = next_sentence_loss + 0.5 * pseudo_loss 630 | 631 | total_loss = masked_lm_loss + next_sentence_loss 632 | outputs = (total_loss,) + outputs 633 | 634 | return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) 635 | 636 | 637 | @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) 638 | class NeZhaForMaskedLM(NeZhaPreTrainedModel): 639 | def __init__(self, config): 640 | super().__init__(config) 641 | self.bert = NeZhaModel(config) 642 | self.cls = BertOnlyMLMHead(config) 643 | self.init_weights() 644 | 645 | def get_output_embeddings(self): 646 | return self.cls.predictions.decoder 647 | 648 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 649 | def forward( 650 | self, 651 | input_ids=None, 652 | attention_mask=None, 653 | token_type_ids=None, 654 | head_mask=None, 655 | position_ids=None, 656 | inputs_embeds=None, 657 | encoder_hidden_states=None, 658 | encoder_attention_mask=None, 659 | labels=None, 660 | ): 661 | r""" 662 | masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 663 | Labels for computing the masked language modeling loss. 664 | Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) 665 | Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels 666 | in ``[0, ..., config.vocab_size]`` 667 | lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 668 | Labels for computing the left-to-right language modeling loss (next word prediction). 
669 | Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) 670 | Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels 671 | in ``[0, ..., config.vocab_size]`` 672 | 673 | Returns: 674 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 675 | masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 676 | Masked language modeling loss. 677 | ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): 678 | Next token prediction loss. 679 | prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) 680 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 681 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 682 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 683 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 684 | 685 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 686 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 687 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 688 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 689 | 690 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 691 | heads. 692 | 693 | Examples:: 694 | 695 | from transformers import BertTokenizer, BertForMaskedLM 696 | import torch 697 | 698 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 699 | model = BertForMaskedLM.from_pretrained('bert-base-uncased') 700 | 701 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 702 | outputs = model(input_ids, masked_lm_labels=input_ids) 703 | 704 | loss, prediction_scores = outputs[:2] 705 | 706 | """ 707 | outputs = self.bert( 708 | input_ids, 709 | attention_mask=attention_mask, 710 | token_type_ids=token_type_ids, 711 | head_mask=head_mask, 712 | inputs_embeds=inputs_embeds, 713 | encoder_hidden_states=encoder_hidden_states, 714 | encoder_attention_mask=encoder_attention_mask, 715 | ) 716 | 717 | sequence_output = outputs[0] 718 | prediction_scores = self.cls(sequence_output) 719 | outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here 720 | 721 | # Although this may seem awkward, BertForMaskedLM supports two scenarios: 722 | # 1. If a tensor that contains the indices of masked labels is provided, 723 | # the cross-entropy is the MLM cross-entropy that measures the likelihood 724 | # of predictions for masked words. 725 | # 2. If `lm_labels` is provided we are in a causal scenario where we 726 | # try to predict the next token for each input in the decoder. 
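        # (Editor's note, not part of the original file.) A small worked example of scenario 1,
        # assuming `labels` follows the usual MLM convention of -100 at non-masked positions
        # (the default `ignore_index` of `CrossEntropyLoss`): for a 6-token sequence in which only
        # position 3 was masked,
        #     labels = torch.tensor([[-100, -100, -100, 2045, -100, -100]])
        #     loss = CrossEntropyLoss()(prediction_scores.view(-1, vocab_size), labels.view(-1))
        # the loss reduces to the cross-entropy at that single masked position.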
727 | masked_lm_labels = None 728 | if labels is not None: 729 | loss_fct = CrossEntropyLoss() # -100 index = padding token 730 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) 731 | outputs = (masked_lm_loss,) + outputs 732 | return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) 733 | 734 | def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): 735 | input_shape = input_ids.shape 736 | effective_batch_size = input_shape[0] 737 | 738 | # if the model is used as a decoder in an encoder-decoder model, the decoder attention mask is created on the fly 739 | if attention_mask is None: 740 | attention_mask = input_ids.new_ones(input_shape) 741 | 742 | # if the model does not use a causal mask, add a dummy token 743 | if self.config.is_decoder is False: 744 | assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" 745 | attention_mask = torch.cat( 746 | [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 747 | ) 748 | 749 | dummy_token = torch.full( 750 | (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device 751 | ) 752 | input_ids = torch.cat([input_ids, dummy_token], dim=1) 753 | 754 | return {"input_ids": input_ids, "attention_mask": attention_mask} 755 | 756 | 757 | @add_start_docstrings( 758 | """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, 759 | ) 760 | class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): 761 | def __init__(self, config): 762 | super().__init__(config) 763 | self.bert = NeZhaModel(config) 764 | self.cls = BertOnlyNSPHead(config) 765 | self.init_weights() 766 | 767 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 768 | def forward( 769 | self, 770 | input_ids=None, 771 | attention_mask=None, 772 | token_type_ids=None, 773 | head_mask=None, 774 | position_ids=None, 775 | inputs_embeds=None, 776 | next_sentence_label=None, 777 | ): 778 | r""" 779 | next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 780 | Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) 781 | Indices should be in ``[0, 1]``. 782 | ``0`` indicates sequence B is a continuation of sequence A, 783 | ``1`` indicates sequence B is a random sequence. 784 | 785 | Returns: 786 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 787 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): 788 | Next sequence prediction (classification) loss. 789 | seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): 790 | Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). 791 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 792 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 793 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 794 | 795 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
796 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 797 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 798 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 799 | 800 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 801 | heads. 802 | 803 | Examples:: 804 | 805 | from transformers import BertTokenizer, BertForNextSentencePrediction 806 | import torch 807 | 808 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 809 | model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') 810 | 811 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 812 | outputs = model(input_ids) 813 | 814 | seq_relationship_scores = outputs[0] 815 | 816 | """ 817 | 818 | outputs = self.bert( 819 | input_ids, 820 | attention_mask=attention_mask, 821 | token_type_ids=token_type_ids, 822 | head_mask=head_mask, 823 | inputs_embeds=inputs_embeds, 824 | ) 825 | 826 | pooled_output = outputs[1] 827 | seq_relationship_score = self.cls(pooled_output) 828 | outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here 829 | if next_sentence_label is not None: 830 | loss_fct = CrossEntropyLoss() 831 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) 832 | outputs = (next_sentence_loss,) + outputs 833 | 834 | return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) 835 | 836 | 837 | @add_start_docstrings( 838 | """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of 839 | the pooled output) e.g. for GLUE tasks. """, 840 | BERT_START_DOCSTRING, 841 | ) 842 | class NeZhaForSequenceClassification(NeZhaPreTrainedModel): 843 | def __init__(self, config): 844 | super().__init__(config) 845 | self.num_labels = config.num_labels 846 | self.bert = NeZhaModel(config) 847 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 848 | self.classifier = nn.Linear(config.hidden_size, config.num_labels) 849 | self.init_weights() 850 | 851 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 852 | def forward( 853 | self, 854 | input_ids=None, 855 | attention_mask=None, 856 | token_type_ids=None, 857 | position_ids=None, 858 | head_mask=None, 859 | inputs_embeds=None, 860 | labels=None, 861 | ): 862 | r""" 863 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 864 | Labels for computing the sequence classification/regression loss. 865 | Indices should be in :obj:`[0, ..., config.num_labels - 1]`. 866 | If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), 867 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 868 | 869 | Returns: 870 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 871 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): 872 | Classification (or regression if config.num_labels==1) loss. 873 | logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): 874 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 
875 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 876 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 877 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 878 | 879 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 880 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 881 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 882 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 883 | 884 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 885 | heads. 886 | 887 | Examples:: 888 | 889 | from transformers import BertTokenizer, BertForSequenceClassification 890 | import torch 891 | 892 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 893 | model = BertForSequenceClassification.from_pretrained('bert-base-uncased') 894 | 895 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 896 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 897 | outputs = model(input_ids, labels=labels) 898 | 899 | loss, logits = outputs[:2] 900 | 901 | """ 902 | 903 | outputs = self.bert( 904 | input_ids, 905 | attention_mask=attention_mask, 906 | token_type_ids=token_type_ids, 907 | head_mask=head_mask, 908 | inputs_embeds=inputs_embeds, 909 | ) 910 | 911 | pooled_output = outputs[1] 912 | 913 | pooled_output = self.dropout(pooled_output) 914 | logits = self.classifier(pooled_output) 915 | 916 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 917 | 918 | if labels is not None: 919 | if self.num_labels == 1: 920 | # We are doing regression 921 | loss_fct = MSELoss() 922 | loss = loss_fct(logits.view(-1), labels.view(-1)) 923 | else: 924 | loss_fct = CrossEntropyLoss() 925 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 926 | outputs = (loss,) + outputs 927 | 928 | return outputs # (loss), logits, (hidden_states), (attentions) 929 | 930 | 931 | @add_start_docstrings( 932 | """Bert Model with a multiple choice classification head on top (a linear layer on top of 933 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 934 | BERT_START_DOCSTRING, 935 | ) 936 | class NeZhaForMultipleChoice(NeZhaPreTrainedModel): 937 | def __init__(self, config): 938 | super().__init__(config) 939 | self.bert = NeZhaModel(config) 940 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 941 | self.classifier = nn.Linear(config.hidden_size, 1) 942 | self.init_weights() 943 | 944 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 945 | def forward( 946 | self, 947 | input_ids=None, 948 | attention_mask=None, 949 | token_type_ids=None, 950 | head_mask=None, 951 | position_ids=None, 952 | inputs_embeds=None, 953 | labels=None, 954 | ): 955 | r""" 956 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 957 | Labels for computing the multiple choice classification loss. 958 | Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension 959 | of the input tensors. 
(see `input_ids` above) 960 | 961 | Returns: 962 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 963 | loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): 964 | Classification loss. 965 | classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): 966 | `num_choices` is the second dimension of the input tensors. (see `input_ids` above). 967 | 968 | Classification scores (before SoftMax). 969 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 970 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 971 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 972 | 973 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 974 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 975 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 976 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 977 | 978 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 979 | heads. 980 | 981 | Examples:: 982 | 983 | from transformers import BertTokenizer, BertForMultipleChoice 984 | import torch 985 | 986 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 987 | model = BertForMultipleChoice.from_pretrained('bert-base-uncased') 988 | choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] 989 | 990 | input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices 991 | labels = torch.tensor(1).unsqueeze(0) # Batch size 1 992 | outputs = model(input_ids, labels=labels) 993 | 994 | loss, classification_scores = outputs[:2] 995 | 996 | """ 997 | num_choices = input_ids.shape[1] 998 | 999 | input_ids = input_ids.view(-1, input_ids.size(-1)) 1000 | attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 1001 | token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 1002 | 1003 | outputs = self.bert( 1004 | input_ids, 1005 | attention_mask=attention_mask, 1006 | token_type_ids=token_type_ids, 1007 | head_mask=head_mask, 1008 | inputs_embeds=inputs_embeds, 1009 | ) 1010 | 1011 | pooled_output = outputs[1] 1012 | 1013 | pooled_output = self.dropout(pooled_output) 1014 | logits = self.classifier(pooled_output) 1015 | reshaped_logits = logits.view(-1, num_choices) 1016 | 1017 | outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here 1018 | 1019 | if labels is not None: 1020 | loss_fct = CrossEntropyLoss() 1021 | loss = loss_fct(reshaped_logits, labels) 1022 | outputs = (loss,) + outputs 1023 | 1024 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions) 1025 | 1026 | 1027 | @add_start_docstrings( 1028 | """Bert Model with a token classification head on top (a linear layer on top of 1029 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", 1030 | BERT_START_DOCSTRING, 1031 | ) 1032 | class NeZhaForTokenClassification(NeZhaPreTrainedModel): 1033 | def __init__(self, config): 1034 | super().__init__(config) 1035 | self.num_labels = config.num_labels 1036 | self.bert = NeZhaModel(config) 1037 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 1038 | self.classifier = nn.Linear(config.hidden_size, config.num_labels) 1039 | self.init_weights() 1040 | 1041 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 1042 | def forward( 1043 | self, 1044 | input_ids=None, 1045 | attention_mask=None, 1046 | token_type_ids=None, 1047 | head_mask=None, 1048 | position_ids=None, 1049 | inputs_embeds=None, 1050 | labels=None, 1051 | ): 1052 | r""" 1053 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 1054 | Labels for computing the token classification loss. 1055 | Indices should be in ``[0, ..., config.num_labels - 1]``. 1056 | 1057 | Returns: 1058 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 1059 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : 1060 | Classification loss. 1061 | scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) 1062 | Classification scores (before SoftMax). 1063 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 1064 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 1065 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 1066 | 1067 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 1068 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 1069 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 1070 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 1071 | 1072 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 1073 | heads. 
1074 | 1075 | Examples:: 1076 | 1077 | from transformers import BertTokenizer, BertForTokenClassification 1078 | import torch 1079 | 1080 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 1081 | model = BertForTokenClassification.from_pretrained('bert-base-uncased') 1082 | 1083 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 1084 | labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 1085 | outputs = model(input_ids, labels=labels) 1086 | 1087 | loss, scores = outputs[:2] 1088 | 1089 | """ 1090 | 1091 | outputs = self.bert( 1092 | input_ids, 1093 | attention_mask=attention_mask, 1094 | token_type_ids=token_type_ids, 1095 | head_mask=head_mask, 1096 | inputs_embeds=inputs_embeds, 1097 | ) 1098 | 1099 | sequence_output = outputs[0] 1100 | 1101 | sequence_output = self.dropout(sequence_output) 1102 | logits = self.classifier(sequence_output) 1103 | 1104 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 1105 | if labels is not None: 1106 | loss_fct = CrossEntropyLoss() 1107 | # Only keep active parts of the loss 1108 | if attention_mask is not None: 1109 | active_loss = attention_mask.view(-1) == 1 1110 | active_logits = logits.view(-1, self.num_labels) 1111 | active_labels = torch.where( 1112 | active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) 1113 | ) 1114 | loss = loss_fct(active_logits, active_labels) 1115 | else: 1116 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 1117 | outputs = (loss,) + outputs 1118 | 1119 | return outputs # (loss), scores, (hidden_states), (attentions) 1120 | 1121 | 1122 | @add_start_docstrings( 1123 | """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear 1124 | layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, 1125 | BERT_START_DOCSTRING, 1126 | ) 1127 | class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): 1128 | def __init__(self, config): 1129 | super().__init__(config) 1130 | self.num_labels = config.num_labels 1131 | self.bert = NeZhaModel(config) 1132 | self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) 1133 | self.init_weights() 1134 | 1135 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 1136 | def forward( 1137 | self, 1138 | input_ids=None, 1139 | attention_mask=None, 1140 | token_type_ids=None, 1141 | head_mask=None, 1142 | inputs_embeds=None, 1143 | position_ids=None, 1144 | start_positions=None, 1145 | end_positions=None, 1146 | ): 1147 | r""" 1148 | start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 1149 | Labels for position (index) of the start of the labelled span for computing the token classification loss. 1150 | Positions are clamped to the length of the sequence (`sequence_length`). 1151 | Position outside of the sequence are not taken into account for computing the loss. 1152 | end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 1153 | Labels for position (index) of the end of the labelled span for computing the token classification loss. 1154 | Positions are clamped to the length of the sequence (`sequence_length`). 1155 | Position outside of the sequence are not taken into account for computing the loss. 
1156 | 1157 | Returns: 1158 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 1159 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): 1160 | Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. 1161 | start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): 1162 | Span-start scores (before SoftMax). 1163 | end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): 1164 | Span-end scores (before SoftMax). 1165 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 1166 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 1167 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 1168 | 1169 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 1170 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 1171 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 1172 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 1173 | 1174 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 1175 | heads. 1176 | 1177 | Examples:: 1178 | 1179 | from transformers import BertTokenizer, BertForQuestionAnswering 1180 | import torch 1181 | 1182 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 1183 | model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') 1184 | 1185 | question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" 1186 | encoding = tokenizer.encode_plus(question, text) 1187 | input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] 1188 | start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) 1189 | 1190 | all_tokens = tokenizer.convert_ids_to_tokens(input_ids) 1191 | answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) 1192 | 1193 | assert answer == "a nice puppet" 1194 | 1195 | """ 1196 | 1197 | outputs = self.bert( 1198 | input_ids, 1199 | attention_mask=attention_mask, 1200 | token_type_ids=token_type_ids, 1201 | head_mask=head_mask, 1202 | inputs_embeds=inputs_embeds, 1203 | ) 1204 | 1205 | sequence_output = outputs[0] 1206 | 1207 | logits = self.qa_outputs(sequence_output) 1208 | start_logits, end_logits = logits.split(1, dim=-1) 1209 | start_logits = start_logits.squeeze(-1) 1210 | end_logits = end_logits.squeeze(-1) 1211 | 1212 | outputs = (start_logits, end_logits,) + outputs[2:] 1213 | if start_positions is not None and end_positions is not None: 1214 | # If we are on multi-GPU, split add a dimension 1215 | if len(start_positions.size()) > 1: 1216 | start_positions = start_positions.squeeze(-1) 1217 | if len(end_positions.size()) > 1: 1218 | end_positions = end_positions.squeeze(-1) 1219 | # sometimes the start/end positions are outside our model inputs, we ignore these terms 1220 | ignored_index = start_logits.size(1) 1221 | start_positions.clamp_(0, ignored_index) 1222 | end_positions.clamp_(0, ignored_index) 1223 | 1224 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 1225 | start_loss = loss_fct(start_logits, start_positions) 1226 | end_loss = 
loss_fct(end_logits, end_positions) 1227 | total_loss = (start_loss + end_loss) / 2 1228 | outputs = (total_loss,) + outputs 1229 | 1230 | return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) 1231 | --------------------------------------------------------------------------------
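The file above only defines the NeZha model classes. As a quick orientation, here is a minimal usage sketch (an editor's addition, not code from this repository); the checkpoint directory `./nezha_checkpoint`, the `num_labels` value, and the input text are placeholder assumptions, and the import path simply mirrors this repository's package layout.

# Hypothetical usage sketch for NeZhaForSequenceClassification (editor's addition).
import torch
from transformers import BertTokenizer

from data.code.util.modeling.modeling_nezha.configuration import NeZhaConfig
from data.code.util.modeling.modeling_nezha.modeling import NeZhaForSequenceClassification

# NeZhaConfig is assumed to expose the usual `from_pretrained` interface of a PretrainedConfig subclass.
config = NeZhaConfig.from_pretrained('./nezha_checkpoint', num_labels=35)
tokenizer = BertTokenizer.from_pretrained('./nezha_checkpoint')
model = NeZhaForSequenceClassification.from_pretrained('./nezha_checkpoint', config=config)
model.eval()

encoded = tokenizer("placeholder input text", return_tensors="pt")
labels = torch.tensor([0])  # placeholder class id, shape (batch_size,)

with torch.no_grad():
    outputs = model(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        token_type_ids=encoded["token_type_ids"],
        labels=labels,
    )

# With labels provided, the returned tuple is (loss, logits, hidden_states, ...) because the
# encoder hard-codes output_hidden_states=True.
loss, logits = outputs[:2]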