├── data
│   ├── requirements.txt
│   ├── read_me.md
│   └── code
│       ├── models
│       │   ├── __pycache__
│       │   │   └── nezha.cpython-37.pyc
│       │   └── nezha.py
│       ├── util
│       │   ├── others
│       │   │   ├── __pycache__
│       │   │   │   ├── hanzi.cpython-37.pyc
│       │   │   │   └── label2id.cpython-37.pyc
│       │   │   ├── label2id.py
│       │   │   └── hanzi.py
│       │   ├── tools
│       │   │   ├── __pycache__
│       │   │   │   ├── predict_tools.cpython-37.pyc
│       │   │   │   └── finetune_tools.cpython-37.pyc
│       │   │   ├── predict_tools.py
│       │   │   └── finetune_tools.py
│       │   ├── pretrain_utils
│       │   │   ├── __pycache__
│       │   │   │   ├── trainer.cpython-37.pyc
│       │   │   │   └── trainer_args.cpython-37.pyc
│       │   │   └── trainer_args.py
│       │   └── modeling
│       │       └── modeling_nezha
│       │           ├── __pycache__
│       │           │   ├── modeling.cpython-37.pyc
│       │           │   └── configuration.cpython-37.pyc
│       │           ├── configuration.py
│       │           └── modeling.py
│       ├── fusion_code
│       │   └── run_fusion.py
│       ├── predict_code
│       │   └── run_predictor.py
│       ├── build_vocab
│       │   └── build_vocab.py
│       ├── process_data
│       │   └── process_data.py
│       ├── finetune_code
│       │   └── run_classify.py
│       └── pretrain_code
│           └── run_pretrain.py
├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── daguancup_end2end.iml
├── .gitattributes
└── READ_ME.md
/data/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.7.1
2 | transformers==4.3.0.rc1
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/data/read_me.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/read_me.md
--------------------------------------------------------------------------------
/data/code/models/__pycache__/nezha.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/models/__pycache__/nezha.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/others/__pycache__/hanzi.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/others/__pycache__/hanzi.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/others/__pycache__/label2id.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/others/__pycache__/label2id.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/tools/__pycache__/predict_tools.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/tools/__pycache__/predict_tools.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/tools/__pycache__/finetune_tools.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/tools/__pycache__/finetune_tools.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/pretrain_utils/__pycache__/trainer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/pretrain_utils/__pycache__/trainer.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/pretrain_utils/__pycache__/trainer_args.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/pretrain_utils/__pycache__/trainer_args.cpython-37.pyc
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/data/code/util/modeling/modeling_nezha/__pycache__/modeling.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/modeling/modeling_nezha/__pycache__/modeling.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/modeling/modeling_nezha/__pycache__/configuration.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/modeling/modeling_nezha/__pycache__/configuration.cpython-37.pyc
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/daguancup_end2end.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/data/code/fusion_code/run_fusion.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import os
4 | import sys
5 | import csv
6 | import numpy as np
7 | import pandas as pd
8 |
9 | sys.path.append('../../../data')
10 | from argparse import ArgumentParser
11 | from data.code.util.others.label2id import id2label
12 |
13 |
14 | def fusion(args):
15 | k, predictions = 0, 0
16 |
17 | tmp = pd.read_csv(os.path.join(args.result_path, 'output_result', 'full_logit.csv'))
18 | tmp = tmp.values
19 | predictions += tmp
20 | predictions = np.argmax(predictions, axis=-1)
21 | result = []
22 | for i in predictions:
23 | result.append((k, id2label[str(i)]))
24 | k += 1
25 | write2tsv(args.submit_path, result)
26 |
27 |
28 | def write2tsv(output_path, data):
29 | with open(output_path, 'w', newline='') as f:
30 | tsv_w = csv.writer(f, delimiter=',')
31 | tsv_w.writerow(['id', 'label'])
32 | tsv_w.writerows(data)
33 |
34 |
35 | def main():
36 | parser = ArgumentParser()
37 | parser.add_argument('--result_path', type=str, default="../../user_data")
38 | parser.add_argument('--submit_path', type=str, default=f'../../prediction_result/result.csv')
39 |
40 | args = parser.parse_args()
41 |
42 | fusion(args)
43 |
44 |
45 | if __name__ == '__main__':
46 | main()
47 |
--------------------------------------------------------------------------------
/data/code/util/others/label2id.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | label2id = {
4 | '1-1': 0,
5 | '1-10': 1,
6 | '1-4': 2,
7 | '1-9': 3,
8 | '10-26': 4,
9 | '2-11': 5,
10 | '2-14': 6,
11 | '2-17': 7,
12 | '2-2': 8,
13 | '2-25': 9,
14 | '2-3': 10,
15 | '2-33': 11,
16 | '2-6': 12,
17 | '3-5': 13,
18 | '4-7': 14,
19 | '5-12': 15,
20 | '5-22': 16,
21 | '5-24': 17,
22 | '5-30': 18,
23 | '5-35': 19,
24 | '6-13': 20,
25 | '6-15': 21,
26 | '6-19': 22,
27 | '6-20': 23,
28 | '6-21': 24,
29 | '6-28': 25,
30 | '6-29': 26,
31 | '6-31': 27,
32 | '6-32': 28,
33 | '6-34': 29,
34 | '6-8': 30,
35 | '7-16': 31,
36 | '8-18': 32,
37 | '8-27': 33,
38 | '9-23': 34
39 | }
40 |
41 | id2label = {
42 | '0': '1-1',
43 | '1': '1-10',
44 | '2': '1-4',
45 | '3': '1-9',
46 | '4': '10-26',
47 | '5': '2-11',
48 | '6': '2-14',
49 | '7': '2-17',
50 | '8': '2-2',
51 | '9': '2-25',
52 | '10': '2-3',
53 | '11': '2-33',
54 | '12': '2-6',
55 | '13': '3-5',
56 | '14': '4-7',
57 | '15': '5-12',
58 | '16': '5-22',
59 | '17': '5-24',
60 | '18': '5-30',
61 | '19': '5-35',
62 | '20': '6-13',
63 | '21': '6-15',
64 | '22': '6-19',
65 | '23': '6-20',
66 | '24': '6-21',
67 | '25': '6-28',
68 | '26': '6-29',
69 | '27': '6-31',
70 | '28': '6-32',
71 | '29': '6-34',
72 | '30': '6-8',
73 | '31': '7-16',
74 | '32': '8-18',
75 | '33': '8-27',
76 | '34': '9-23'
77 | }
78 |
79 | # print(label2id['9-23'])
80 | # print(id2label['0'])
--------------------------------------------------------------------------------
/data/code/predict_code/run_predictor.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import sys
4 | import warnings
5 | from argparse import ArgumentParser
6 |
7 | sys.path.append('../../../data')
8 | from data.code.util.tools.predict_tools import *
9 |
10 |
11 | def main():
12 | parser = ArgumentParser()
13 |
14 | parser.add_argument('--vocab_path', type=str, default='../../user_data/tokenizer/vocab.txt')
15 | parser.add_argument('--output_result_path', type=str, default='../../user_data/output_result')
16 | parser.add_argument('--data_cache_path', type=str, default='../../user_data/process_data/pkl')
17 | parser.add_argument('--test_path', type=str, default='../../user_data/process_data/test.txt')
18 | parser.add_argument('--load_model_path', type=str, default='../../user_data/output_model')
19 | parser.add_argument('--batch_size', type=int, default=128 * 8)
20 | parser.add_argument('--max_seq_len', type=int, default=128)
21 | parser.add_argument('--device', type=str, default='cuda')
22 |
23 | args = parser.parse_args()
24 | warnings.filterwarnings('ignore')
25 |
26 | os.makedirs(args.output_result_path, exist_ok=True)
27 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
28 |
29 | if not os.path.exists(os.path.join(args.data_cache_path, 'test.pkl')):
30 | read_data(args, tokenizer)
31 |
32 | test_dataloader = load_data(args, tokenizer)
33 |
34 | model = NeZhaSequenceClassification_P.from_pretrained(os.path.join(args.load_model_path, f'last-checkpoint'))
35 | model.to(args.device)
36 | model.eval()
37 |
38 | final_res = predict(test_dataloader, model, args)
39 | final_res.tolist()
40 | save2csv(args, final_res)
41 |
42 |
43 | if __name__ == '__main__':
44 | main()
45 |
--------------------------------------------------------------------------------
/READ_ME.md:
--------------------------------------------------------------------------------
1 | # 0.The 5th Daguan Cup, team name: XiaoChuan Sun, 4th on the A leaderboard, 7th on the B leaderboard, a single model throughout.
2 | ##### Competition link: https://www.datafountain.cn/competitions/512/ranking?isRedance=0&sch=1804
3 |
4 | # 1.Data processing details
5 |
6 | ##### 1.1.The maximum sentence length is limited to 128 tokens; any longer sentence is truncated by keeping the first 32 and the last 96 tokens (a sketch follows below).
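This is exactly what `cut_text` in `data/code/process_data/process_data.py` does on the space-separated, anonymized tokens; as a quick standalone reference, a minimal sketch of the head+tail truncation (the function name is illustrative, not from the repo):

```python
def head_tail_truncate(text: str, max_len: int = 128, head: int = 32, tail: int = 96) -> str:
    """Keep the first `head` and the last `tail` tokens of an over-long sentence."""
    tokens = text.strip().split(' ')          # the corpus is space-separated token ids
    if len(tokens) > max_len:
        tokens = tokens[:head] + tokens[-tail:]
    return ' '.join(tokens)
```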
7 |
8 |
9 | # 2.Pretraining details
10 |
11 | ##### 2.1.The pretraining corpus is the first 180,000 (18W) JSON records of the unlabeled data, taking both title and content, for roughly 360,000 (36W) sentences in total (the training-set and test-set text was not used; I simply forgot to include it).
12 |
13 | ##### 2.2.The pretrained model is nezha-cn-base. The pretraining tasks are ALBERT-style n-gram masking plus a Word Structural Objective borrowed from StructBERT: at masking time a randomly selected trigram is shuffled, and the model restores the original order while also predicting the original tokens, which amounts to an improvement over the StructBERT task (a hedged sketch follows below).
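The n-gram masking itself is implemented in `run_pretrain.py` (`_ngram_mask`), while the word-structural part is deliberately left there as a "complete by yourself" placeholder. Purely to illustrate the idea described above, here is a hedged sketch (the function name and details are assumptions, not the author's code): per sequence, pick one random trigram of ordinary tokens, keep the original ids as MLM labels, and feed the shuffled trigram as input so the model must restore the original order.

```python
import random
import torch

def shuffle_random_trigram(input_ids: torch.Tensor, labels: torch.Tensor, special_ids: set) -> None:
    """Sketch of a StructBERT-style word structural objective (modifies tensors in place)."""
    for row in range(input_ids.size(0)):
        ids = input_ids[row].tolist()
        ok = [i for i, t in enumerate(ids) if t != 0 and t not in special_ids]  # skip [PAD]/[CLS]/[SEP]
        ok_set = set(ok)
        starts = [i for i in ok if i + 1 in ok_set and i + 2 in ok_set]
        if not starts:
            continue
        start = random.choice(starts)
        pos = [start, start + 1, start + 2]
        original = input_ids[row, pos].clone()
        labels[row, pos] = original                         # the model predicts the original tokens
        input_ids[row, pos] = original[torch.randperm(3)]   # the shuffled trigram is the input
```

Such a helper could be called inside `mask_tokens` right after `labels = inputs.clone()`; whether that matches the author's exact recipe is not shown in this dump.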
14 |
15 |
16 | # 3.Fine-tuning details
17 |
18 | ##### 3.1.Standard tricks: PGD adversarial training, Lookahead, EMA, layer-wise (stratified) learning rates, TSA, etc. (EMA and the TSA schedule are sketched below).
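`EMA` and `get_tsa_thresh` are used in `run_classify.py` (imported from `finetune_tools`), but their definitions do not appear in the portion included here. Below is a minimal sketch of what such helpers conventionally look like, matching the call sites (`EMA(model, decay=0.999)`, `ema.update()`, `ema.apply_shadow()`, and `get_tsa_thresh(args, global_steps, total_steps, start, end)` with `--schedule` in {linear, exp, log}). It is an assumption for illustration, not the repository's exact code.

```python
import torch

class EMA:
    """Exponential moving average of model parameters (sketch)."""
    def __init__(self, model, decay=0.999):
        self.model, self.decay = model, decay
        self.shadow = {n: p.data.clone() for n, p in model.named_parameters() if p.requires_grad}
        self.backup = {}

    def update(self):
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = self.decay * self.shadow[n] + (1.0 - self.decay) * p.data

    def apply_shadow(self):
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.data.clone()
                p.data = self.shadow[n].clone()

    def restore(self):
        for n, p in self.model.named_parameters():
            if p.requires_grad:
                p.data = self.backup[n]
        self.backup = {}


def get_tsa_thresh(args, global_step, total_steps, start, end):
    """Training Signal Annealing threshold, UDA-style (sketch)."""
    progress = torch.tensor(global_step / max(1, total_steps))
    if args.schedule == 'linear':
        alpha = progress
    elif args.schedule == 'exp':
        alpha = torch.exp((progress - 1) * 5)
    else:  # 'log'
        alpha = 1 - torch.exp(-progress * 5)
    return (alpha * (end - start) + start).to(args.device)
```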
19 | ##### 3.2.The model architecture is customized as follows.
20 | ###### 3.2.1.Concatenating the [CLS] hidden states of the last five encoder layers works best (many structures were tried, e.g. appending a CNN/LSTM, MSD, mean pooling, etc.).
21 | ###### 3.2.2.Because the data carries two label levels, the labels are split (level 1: 10 classes, level 2: 35 classes) and the losses are computed separately (the output hidden state goes through two linear layers, each with an output dimension matching its number of labels).
22 | ###### 3.2.3.A self-developed method: during fine-tuning, in every batch the model also predicts on the training samples, and the loss between those predictions and the true labels is fed back, pulling the predicted labels closer to the true labels. This gave a slight improvement and also helped on other datasets; it has not been investigated in depth and is one of the innovations (a hedged sketch follows below).
23 |
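Point 3.2.3 above is described only at a high level and its code is not part of this dump. One plausible reading, shown as a loudly hedged sketch (the helper name, the use of MSE, and the weighting are all assumptions), is an auxiliary loss that pulls the predicted distribution toward the one-hot ground-truth labels:

```python
import torch
import torch.nn.functional as F

def prediction_pull_loss(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Distance between the predicted label distribution and the true (one-hot) labels."""
    probs = F.softmax(logits, dim=-1)
    one_hot = F.one_hot(labels, num_classes=logits.size(-1)).float()
    return F.mse_loss(probs, one_hot)

# e.g. total_loss = ce_loss + pull_weight * prediction_pull_loss(logits, labels)
```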
--------------------------------------------------------------------------------
/data/code/build_vocab/build_vocab.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import os
4 | import torch
5 | import random
6 | import logging
7 | import warnings
8 | import numpy as np
9 | from argparse import ArgumentParser
10 |
11 | from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer
12 | from transformers import BertTokenizer
13 |
14 | logging.basicConfig()
15 | logger = logging.getLogger('build vocab')
16 | logger.setLevel(logging.INFO)
17 |
18 |
19 | def seed_everything(seed):
20 | random.seed(seed)
21 | os.environ['PYTHONHASHSEED'] = str(seed)
22 | np.random.seed(seed)
23 | torch.manual_seed(seed)
24 | torch.cuda.manual_seed(seed)
25 | torch.cuda.manual_seed_all(seed)
26 | torch.backends.cudnn.benchmark = False
27 | torch.backends.cudnn.deterministic = True
28 |
29 |
30 | def train_tokenizer(args):
31 | tokenizer = BertWordPieceTokenizer(
32 | clean_text=False,
33 | handle_chinese_chars=True,
34 | strip_accents=False,
35 | lowercase=False
36 | )
37 | special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
38 |
39 | # for i in range(100):
40 | # special_tokens.append(f"[unused{i}]")
41 |
42 | tokenizer.train(
43 | files=[args.file_path, args.unlabeled_file_path],
44 | vocab_size=args.vocab_size,
45 | min_frequency=1,
46 | special_tokens=special_tokens,
47 | limit_alphabet=args.vocab_size,
48 | wordpieces_prefix="##"
49 | )
50 | os.makedirs(args.out_path, exist_ok=True)
51 | tokenizer.save_model(args.out_path)
52 | tokenizer = BertTokenizer.from_pretrained(args.out_path,
53 | do_lower_case=False,
54 | strip_accents=False)
55 | tokenizer.save_pretrained(args.out_path)
56 | logger.info(f'save tokenizer, with vocab_size: {tokenizer.vocab_size}')
57 |
58 |
59 | if __name__ == '__main__':
60 | parser = ArgumentParser()
61 |
62 | parser.add_argument('--seed', type=int, default=2021)
63 | parser.add_argument('--vocab_size', type=int, default=21128)
64 | parser.add_argument('--file_path', type=str, default='../../user_data/process_data/pretrain.txt')
65 | parser.add_argument('--unlabeled_file_path', type=str,
66 | default='../../user_data/process_data/unlabeled_pretrain.txt')
67 | parser.add_argument('--out_path', type=str, default='../../user_data/tokenizer')
68 |
69 | warnings.filterwarnings('ignore')
70 | args = parser.parse_args()
71 |
72 | seed_everything(args.seed)
73 |
74 | train_tokenizer(args)
75 |
76 | logger.info(f'vocab creation completed .')
77 |
--------------------------------------------------------------------------------
/data/code/util/others/hanzi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Constants for working with Chinese characters."""
3 |
4 | from __future__ import unicode_literals
5 | import sys
6 |
7 | #: Character code ranges for pertinent CJK ideograph Unicode blocks.
8 | characters = cjk_ideographs = (
9 | '\u3007' # Ideographic number zero, see issue #17
10 | '\u4E00-\u9FFF' # CJK Unified Ideographs
11 | '\u3400-\u4DBF' # CJK Unified Ideographs Extension A
12 | '\uF900-\uFAFF' # CJK Compatibility Ideographs
13 | )
14 | if sys.maxunicode > 0xFFFF:
15 | characters += (
16 | '\U00020000-\U0002A6DF' # CJK Unified Ideographs Extension B
17 | '\U0002A700-\U0002B73F' # CJK Unified Ideographs Extension C
18 | '\U0002B740-\U0002B81F' # CJK Unified Ideographs Extension D
19 | '\U0002F800-\U0002FA1F' # CJK Compatibility Ideographs Supplement
20 | )
21 |
22 | #: Character code ranges for the Kangxi radicals and CJK Radicals Supplement.
23 | radicals = (
24 | '\u2F00-\u2FD5' # Kangxi Radicals
25 | '\u2E80-\u2EF3' # CJK Radicals Supplement
26 | )
27 |
28 | #: A string containing Chinese punctuation marks (non-stops).
29 | non_stops = (
30 | # Fullwidth ASCII variants
31 | '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D'
32 | '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F'
33 | '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60'
34 |
35 | # Halfwidth CJK punctuation
36 | '\uFF62\uFF63\uFF64'
37 |
38 | # CJK symbols and punctuation
39 | '\u3000\u3001\u3003'
40 |
41 | # CJK angle and corner brackets
42 | '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011'
43 |
44 | # CJK brackets and symbols/punctuation
45 | '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F'
46 |
47 | # Other CJK symbols
48 | '\u3030'
49 |
50 | # Special CJK indicators
51 | '\u303E\u303F'
52 |
53 | # Dashes
54 | '\u2013\u2014'
55 |
56 | # Quotation marks and apostrophe
57 | '\u2018\u2019\u201B\u201C\u201D\u201E\u201F'
58 |
59 | # General punctuation
60 | '\u2026\u2027'
61 |
62 | # Overscores and underscores
63 | '\uFE4F'
64 |
65 | # Small form variants
66 | '\uFE51\uFE54'
67 |
68 | # Latin punctuation
69 | '\u00B7'
70 | )
71 |
72 | #: A string of Chinese stops.
73 | stops = (
74 | '\uFF01' # Fullwidth exclamation mark
75 | '\uFF1F' # Fullwidth question mark
76 | '\uFF61' # Halfwidth ideographic full stop
77 | '\u3002' # Ideographic full stop
78 | )
79 |
80 | #: A string containing all Chinese punctuation.
81 | punctuation = non_stops + stops
82 |
83 | # A sentence end is defined by a stop followed by zero or more
84 | # container-closing marks (e.g. quotation or brackets).
85 | _sentence_end = '[{stops}][」﹂”』’》)]}〕〗〙〛〉】]*'.format(stops=stops)
86 |
87 | #: A regular expression pattern for a Chinese sentence. A sentence is defined
88 | #: as a series of characters and non-stop punctuation marks followed by a stop
89 | #: and zero or more container-closing punctuation marks (e.g. apostrophe or
90 | # brackets).
91 | sent = sentence = '[{characters}{radicals}{non_stops}]*{sentence_end}'.format(
92 | characters=characters, radicals=radicals, non_stops=non_stops,
93 | sentence_end=_sentence_end)
--------------------------------------------------------------------------------
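The constants above mirror the `zhon.hanzi` module; the `sentence` pattern at the end can be used to split Chinese text into sentences. A small usage sketch, assuming the repository root is on `sys.path` so that the `data` package is importable:

```python
import re
from data.code.util.others.hanzi import sentence

text = '我买了一辆车。妈妈做的菜，很好吃！'
print(re.findall(sentence, text))
# -> ['我买了一辆车。', '妈妈做的菜，很好吃！']
```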
/data/code/models/nezha.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from data.code.util.modeling.modeling_nezha.modeling import NeZhaPreTrainedModel, NeZhaModel
4 |
5 |
6 | class NeZhaSequenceClassification_F(NeZhaPreTrainedModel):
7 | def __init__(self, config):
8 | super().__init__(config)
9 | self.level1_num_labels = 10
10 | self.num_labels = 35
11 | self.bert = NeZhaModel(config)
12 | self.level1_classifier = nn.Linear(config.hidden_size * 5, self.level1_num_labels)
13 | self.classifier = nn.Linear(config.hidden_size * 5, self.num_labels)
14 | self.init_weights()
15 |
16 | def forward(
17 | self,
18 | input_ids=None,
19 | attention_mask=None,
20 | token_type_ids=None,
21 | labels=None,
22 | level1_labels=None
23 | ):
24 | attention_mask = torch.ne(input_ids, 0)
25 | encoder_out, pooled_out, all_hidden_outputs = self.bert(
26 | input_ids=input_ids,
27 | attention_mask=attention_mask,
28 | token_type_ids=token_type_ids
29 | )
30 |
31 | last_hidden = torch.cat(
32 | (
33 | all_hidden_outputs[-1][:, 0],
34 | all_hidden_outputs[-2][:, 0],
35 | all_hidden_outputs[-3][:, 0],
36 | all_hidden_outputs[-4][:, 0],
37 | all_hidden_outputs[-5][:, 0]
38 | ),
39 | 1
40 | )
41 |
42 | logits = self.classifier(last_hidden)
43 | outputs = (logits,) + (pooled_out,)
44 |
45 | if labels is not None:
46 | loss_fct = nn.CrossEntropyLoss()
47 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
48 |
49 | if level1_labels is not None:
50 | level1_logits = self.level1_classifier(last_hidden)
51 | level1_loss = loss_fct(level1_logits.view(-1, self.level1_num_labels),
52 | level1_labels.view(-1))
53 | loss = loss + 0.5 * level1_loss
54 | outputs = (loss,) + outputs
55 |
56 | return outputs
57 |
58 |
59 | class NeZhaSequenceClassification_P(NeZhaPreTrainedModel):
60 | def __init__(self, config):
61 | super().__init__(config)
62 | self.level1_num_labels = 10
63 | self.num_labels = 35
64 | self.bert = NeZhaModel(config)
65 | self.level1_classifier = nn.Linear(config.hidden_size * 5, self.level1_num_labels)
66 | self.classifier = nn.Linear(config.hidden_size * 5, self.num_labels)
67 | self.init_weights()
68 |
69 | def forward(
70 | self,
71 | input_ids=None,
72 | attention_mask=None,
73 | token_type_ids=None
74 | ):
75 | attention_mask = torch.ne(input_ids, 0)
76 | encoder_out, pooled_out, all_hidden_outputs = self.bert(
77 | input_ids=input_ids,
78 | attention_mask=attention_mask,
79 | token_type_ids=token_type_ids
80 | )
81 |
82 | last_hidden = torch.cat(
83 | (
84 | all_hidden_outputs[-1][:, 0],
85 | all_hidden_outputs[-2][:, 0],
86 | all_hidden_outputs[-3][:, 0],
87 | all_hidden_outputs[-4][:, 0],
88 | all_hidden_outputs[-5][:, 0]
89 | ),
90 | 1
91 | )
92 |
93 | logits = self.classifier(last_hidden)
94 | outputs = (logits,) + (pooled_out,)
95 |
96 | return outputs
97 |
--------------------------------------------------------------------------------
/data/code/process_data/process_data.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import os
4 | import sys
5 | import json
6 | import logging
7 | import warnings
8 | import pandas as pd
9 | from tqdm import tqdm
10 | from argparse import ArgumentParser
11 | sys.path.append('../../../data')
12 | from data.code.util.others.label2id import label2id
13 |
14 |
15 | logging.basicConfig()
16 | logger = logging.getLogger('第五届达观杯')
17 | logger.setLevel(logging.INFO)
18 |
19 |
20 | def cut_text(text, args):
21 | char = [i for i in text.split(' ')]
22 | length = len(char)
23 | if length > args.max_length:
24 | head = char[:32]
25 | tail = char[-96:]
26 | new_char = head + tail
27 | new_text = ''
28 | for i in new_char:
29 | new_text += i + ' '
30 | new_text = new_text.strip()
31 | return new_text
32 | else:
33 | return text.strip()
34 |
35 |
36 | def process_unlabeled_data(args):
37 | text = []
38 | with open(args.unlabeled_path, 'r') as f, open(args.out_unlabeled_path, 'w', encoding='utf-8') as w:
39 | for i in tqdm(range(args.number_unlabeled), desc='processing unlabeled data'):
40 | line_data = f.readline()
41 | if line_data:
42 | data = json.loads(line_data)
43 | title = data['title']
44 | content = data['content']
45 | if title == '' or content == '':
46 | continue
47 | else:
48 | text.append(title)
49 | text.append(content)
50 | for j in text:
51 | w.writelines(j + '\n')
52 | text = []
53 | else:
54 | break
55 |
56 |
57 | def process_text(args):
58 | train = pd.read_csv(args.train_path)
59 | test = pd.read_csv(args.test_path)
60 |
61 | train_text = train['text'].tolist()
62 | test_text = test['text'].tolist()
63 | pretrain_text = train_text + test_text
64 |
65 | label = train['label'].tolist()
66 |
67 | pretrain_sentence, train_sentence, train_sentence1, test_sentence = [], [], [], []
68 | for i in pretrain_text:
69 | pretrain_sentence.append(i.strip())
70 |
71 | pretrain_sentence = list(set(pretrain_sentence))
72 |
73 | logger.info(f'total pretrain data : {len(pretrain_sentence)}.')
74 |
75 | for i in train_text:
76 | train_sentence.append(cut_text(i, args))
77 |
78 | for i in range(len(train_sentence)):
79 | tgt_level1, tgt_level2 = label[i].split('-')
80 | tgt = label2id[label[i]]
81 | line = train_sentence[i] + '\t' + str(tgt) + '\t' + str(int(tgt_level1) - 1)
82 | train_sentence1.append(line)
83 |
84 | logger.info(f'total train data : {len(train_sentence)}.')
85 |
86 | for i in test_text:
87 | test_sentence.append(cut_text(i, args))
88 |
89 | logger.info(f'total test data : {len(test_sentence)}.')
90 |
91 | return pretrain_sentence, train_sentence1, test_sentence
92 |
93 |
94 | def write(text_list, out_path):
95 | with open(out_path, 'w', encoding='utf-8') as f:
96 | for i in text_list:
97 | f.writelines(i + '\n')
98 |
99 | logger.info(f'process data has been written to {out_path}.')
100 |
101 |
102 | if __name__ == '__main__':
103 | parser = ArgumentParser()
104 |
105 | parser.add_argument('--max_length', type=int, default=128)
106 | parser.add_argument('--number_unlabeled', type=int, default=180000)
107 | parser.add_argument('--unlabeled_path', type=str, default='../../raw_data/datagrand_2021_unlabeled_data.json')
108 | parser.add_argument('--train_path', type=str, default='../../raw_data/datagrand_2021_train.csv')
109 | parser.add_argument('--test_path', type=str, default='../../raw_data/datagrand_2021_test.csv')
110 | parser.add_argument('--out_path', type=str, default='../../user_data/process_data/')
111 | parser.add_argument('--out_unlabeled_path', type=str,
112 | default='../../user_data/process_data/unlabeled_pretrain.txt')
113 |
114 | warnings.filterwarnings('ignore')
115 | args = parser.parse_args()
116 |
117 | os.makedirs(args.out_path, exist_ok=True)
118 |
119 | out_pretrain_path = os.path.join(args.out_path, 'pretrain.txt')
120 | out_train_path = os.path.join(args.out_path, 'train.txt')
121 | out_test_path = os.path.join(args.out_path, 'test.txt')
122 |
123 | process_unlabeled_data(args)
124 | pretrain, train, test = process_text(args)
125 |
126 | write(pretrain, out_pretrain_path)
127 | write(train, out_train_path)
128 | write(test, out_test_path)
129 |
130 | logger.info(f'data processing completed .')
131 |
--------------------------------------------------------------------------------
/data/code/util/tools/predict_tools.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import os
4 | import sys
5 | import pickle
6 | import numpy as np
7 | import pandas as pd
8 | from tqdm import tqdm
9 | from collections import defaultdict
10 | from transformers import BertTokenizer
11 | from torch.utils.data import Dataset, DataLoader
12 |
13 | sys.path.append('../../../../data')
14 | from data.code.models.nezha import *
15 |
16 |
17 | def build_model_and_tokenizer_nezha(args):
18 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
19 | model = NeZhaSequenceClassification_P.from_pretrained(os.path.join(args.load_model_path, f'last-checkpoint'))
20 | model.to(args.device)
21 | model.eval()
22 |
23 | return tokenizer, model
24 |
25 |
26 | def read_data(args, tokenizer):
27 | test_df = pd.read_csv(args.test_path, header=None, sep='\t')
28 |
29 | inputs = defaultdict(list)
30 | for i, row in tqdm(test_df.iterrows(), desc=f'Preprocessing test data', total=len(test_df)):
31 | sentence = row[0]
32 | build_bert_inputs(inputs, sentence, tokenizer)
33 |
34 | data_cache_path = args.data_cache_path
35 | if not os.path.exists(data_cache_path):
36 | os.makedirs(data_cache_path)
37 |
38 | cache_pkl_path = os.path.join(data_cache_path, 'test.pkl')
39 | with open(cache_pkl_path, 'wb') as f:
40 | pickle.dump(inputs, f)
41 |
42 | return cache_pkl_path
43 |
44 |
45 | def build_bert_inputs(inputs, sentence, tokenizer):
46 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True,
47 | return_token_type_ids=True, return_attention_mask=True)
48 | inputs['input_ids'].append(inputs_dict['input_ids'])
49 | inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
50 | inputs['attention_mask'].append(inputs_dict['attention_mask'])
51 |
52 |
53 | class DGDataset(Dataset):
54 | def __init__(self, data_dict: dict):
55 | super(DGDataset, self).__init__()
56 | self.data_dict = data_dict
57 |
58 | def __getitem__(self, index: int) -> tuple:
59 | data = (
60 | self.data_dict['input_ids'][index],
61 | self.data_dict['token_type_ids'][index],
62 | self.data_dict['attention_mask'][index]
63 | )
64 | return data
65 |
66 | def __len__(self) -> int:
67 | return len(self.data_dict['input_ids'])
68 |
69 |
70 | class Collator:
71 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer):
72 | self.max_seq_len = max_seq_len
73 | self.tokenizer = tokenizer
74 |
75 | def pad_and_truncate(self, input_ids_list, token_type_ids_list,
76 | attention_mask_list, max_seq_len):
77 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long)
78 | token_type_ids = torch.zeros_like(input_ids)
79 | attention_mask = torch.zeros_like(input_ids)
80 | for i in range(len(input_ids_list)):
81 | seq_len = len(input_ids_list[i])
82 | if seq_len <= max_seq_len:
83 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long)
84 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long)
85 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long)
86 | else:
87 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id],
88 | dtype=torch.long)
89 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long)
90 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long)
91 |
92 | return input_ids, token_type_ids, attention_mask
93 |
94 | def __call__(self, examples: list) -> dict:
95 | input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples))
96 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list)
97 | max_seq_len = min(cur_max_seq_len, self.max_seq_len)
98 |
99 | input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list, token_type_ids_list,
100 | attention_mask_list, max_seq_len)
101 |
102 | data_dict = {
103 | 'input_ids': input_ids,
104 | 'token_type_ids': token_type_ids,
105 | 'attention_mask': attention_mask
106 | }
107 |
108 | return data_dict
109 |
110 |
111 | def load_data(args, tokenizer):
112 | cache_pkl_path = os.path.join(args.data_cache_path, 'test.pkl')
113 |
114 | with open(cache_pkl_path, 'rb') as f:
115 | test_data = pickle.load(f)
116 |
117 | collate_fn = Collator(args.max_seq_len, tokenizer)
118 | test_dataset = DGDataset(test_data)
119 | test_dataloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False,
120 | num_workers=0, collate_fn=collate_fn)
121 | return test_dataloader
122 |
123 |
124 | def save2csv(args, p_logit):
125 | logit_path = os.path.join(args.output_result_path, 'full_logit.csv')
126 | result = pd.DataFrame(p_logit, columns=["label%d" % i for i in range(p_logit.shape[-1])])
127 | result.to_csv(logit_path, index=False)
128 |
129 |     print(f"result has been saved to: {logit_path}.")
130 |
131 |
132 | def batch2cuda(args, batch):
133 | return {item: value.to(args.device) for item, value in list(batch.items())}
134 |
135 |
136 | def predict(test_dataloader, pre_model, args):
137 | p_logit = []
138 |
139 | val_iterator = tqdm(test_dataloader, desc='Predict', total=len(test_dataloader))
140 |
141 | with torch.no_grad():
142 | for batch in val_iterator:
143 | batch_cuda = batch2cuda(args, batch)
144 | logits = pre_model(**batch_cuda)[0]
145 | p_logit.extend(torch.softmax(logits, -1).cpu().numpy())
146 |
147 | return np.vstack(p_logit)
148 |
149 |
150 | def create_dirs(path_list):
151 | for path in path_list:
152 | os.makedirs(path, exist_ok=True)
153 |
--------------------------------------------------------------------------------
/data/code/util/modeling/modeling_nezha/configuration.py:
--------------------------------------------------------------------------------
1 |
2 | from transformers import PretrainedConfig
3 |
4 | NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
5 |
6 | class NeZhaConfig(PretrainedConfig):
7 | r"""
8 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
9 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model
10 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
11 |     the ALBERT xxlarge architecture.
12 |
13 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
14 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
15 | for more information.
16 |
17 |
18 | Args:
19 | vocab_size (:obj:`int`, optional, defaults to 30000):
20 | Vocabulary size of the ALBERT model. Defines the different tokens that
21 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
22 | embedding_size (:obj:`int`, optional, defaults to 128):
23 | Dimensionality of vocabulary embeddings.
24 | hidden_size (:obj:`int`, optional, defaults to 4096):
25 | Dimensionality of the encoder layers and the pooler layer.
26 | num_hidden_layers (:obj:`int`, optional, defaults to 12):
27 | Number of hidden layers in the Transformer encoder.
28 | num_hidden_groups (:obj:`int`, optional, defaults to 1):
29 | Number of groups for the hidden layers, parameters in the same group are shared.
30 | num_attention_heads (:obj:`int`, optional, defaults to 64):
31 | Number of attention heads for each attention layer in the Transformer encoder.
32 | intermediate_size (:obj:`int`, optional, defaults to 16384):
33 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
34 | inner_group_num (:obj:`int`, optional, defaults to 1):
35 | The number of inner repetition of attention and ffn.
36 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
37 | The non-linear activation function (function or string) in the encoder and pooler.
38 | If string, "gelu", "relu", "swish" and "gelu_new" are supported.
39 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
40 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
41 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
42 | The dropout ratio for the attention probabilities.
43 | max_position_embeddings (:obj:`int`, optional, defaults to 512):
44 | The maximum sequence length that this model might ever be used with. Typically set this to something
45 | large (e.g., 512 or 1024 or 2048).
46 | type_vocab_size (:obj:`int`, optional, defaults to 2):
47 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
48 | initializer_range (:obj:`float`, optional, defaults to 0.02):
49 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
50 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
51 | The epsilon used by the layer normalization layers.
52 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1):
53 | The dropout ratio for attached classifiers.
54 |
55 | Example::
56 |
57 | from transformers import AlbertConfig, AlbertModel
58 | # Initializing an ALBERT-xxlarge style configuration
59 | albert_xxlarge_configuration = AlbertConfig()
60 |
61 | # Initializing an ALBERT-base style configuration
62 | albert_base_configuration = AlbertConfig(
63 | hidden_size=768,
64 | num_attention_heads=12,
65 | intermediate_size=3072,
66 | )
67 |
68 | # Initializing a model from the ALBERT-base style configuration
69 | model = AlbertModel(albert_xxlarge_configuration)
70 |
71 | # Accessing the model configuration
72 | configuration = model.config
73 |
74 | Attributes:
75 | pretrained_config_archive_map (Dict[str, str]):
76 | A dictionary containing all the available pre-trained checkpoints.
77 | """
78 |
79 | pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP
80 | model_type = "nezha"
81 |
82 | def __init__(
83 | self,
84 | vocab_size=30000,
85 | embedding_size=128,
86 | hidden_size=4096,
87 | num_hidden_layers=12,
88 | num_hidden_groups=1,
89 | num_attention_heads=64,
90 | intermediate_size=16384,
91 | inner_group_num=1,
92 | hidden_act="gelu_new",
93 | hidden_dropout_prob=0,
94 | attention_probs_dropout_prob=0,
95 | max_position_embeddings=512,
96 | max_relative_position=64,
97 | type_vocab_size=2,
98 | initializer_range=0.02,
99 | layer_norm_eps=1e-12,
100 | classifier_dropout_prob=0.1,
101 | use_relative_position=True,
102 | pad_token_id=0,
103 | bos_token_id=2,
104 | eos_token_id=3,
105 | **kwargs
106 | ):
107 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
108 |
109 | self.vocab_size = vocab_size
110 | self.embedding_size = embedding_size
111 | self.hidden_size = hidden_size
112 | self.num_hidden_layers = num_hidden_layers
113 | self.num_hidden_groups = num_hidden_groups
114 | self.num_attention_heads = num_attention_heads
115 | self.inner_group_num = inner_group_num
116 | self.hidden_act = hidden_act
117 | self.intermediate_size = intermediate_size
118 | self.hidden_dropout_prob = hidden_dropout_prob
119 | self.attention_probs_dropout_prob = attention_probs_dropout_prob
120 | self.max_position_embeddings = max_position_embeddings
121 | self.max_relative_position = max_relative_position
122 | self.type_vocab_size = type_vocab_size
123 | self.initializer_range = initializer_range
124 | self.layer_norm_eps = layer_norm_eps
125 | self.use_relative_position=use_relative_position
126 | self.classifier_dropout_prob = classifier_dropout_prob
127 |
--------------------------------------------------------------------------------
/data/code/finetune_code/run_classify.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import gc
4 | import sys
5 | import warnings
6 | from torch import multiprocessing
7 | from argparse import ArgumentParser
8 |
9 | sys.path.append('../../../data')
10 | from data.code.util.tools.finetune_tools import *
11 |
12 | multiprocessing.set_sharing_strategy('file_system')
13 |
14 |
15 | class PGD:
16 | def __init__(self, args, model):
17 | self.model = model
18 | self.emb_backup = {}
19 | self.grad_backup = {}
20 | self.epsilon = args.epsilon
21 | self.emb_name = args.emb_name
22 | self.alpha = args.alpha
23 |
24 | def attack(self, is_first_attack=False):
25 | for name, param in self.model.bert.named_parameters():
26 | if param.requires_grad and self.emb_name in name:
27 | if is_first_attack:
28 | self.emb_backup[name] = param.data.clone()
29 | norm = torch.norm(param.grad)
30 | if norm != 0 and not torch.isnan(norm):
31 | r_at = self.alpha * param.grad / norm
32 | param.data.add_(r_at)
33 | param.data = self.project(name, param.data, self.epsilon)
34 |
35 | def restore(self):
36 | for name, param in self.model.bert.named_parameters():
37 | if param.requires_grad and self.emb_name in name:
38 | assert name in self.emb_backup
39 | param.data = self.emb_backup[name]
40 | self.emb_backup = {}
41 |
42 | def project(self, param_name, param_data, epsilon):
43 | r = param_data - self.emb_backup[param_name]
44 | if torch.norm(r) > epsilon:
45 | r = epsilon * r / torch.norm(r)
46 | return self.emb_backup[param_name] + r
47 |
48 | def backup_grad(self):
49 | for name, param in self.model.bert.named_parameters():
50 | if param.requires_grad and param.grad is not None:
51 | self.grad_backup[name] = param.grad.clone()
52 |
53 | def restore_grad(self):
54 | for name, param in self.model.bert.named_parameters():
55 | if param.requires_grad and param.grad is not None:
56 | param.grad = self.grad_backup[name]
57 |
58 |
59 | def train(args):
60 | tokenizer, model = build_model_and_tokenizer(args)
61 |
62 | if not os.path.exists(os.path.join(args.data_cache_path, 'train.pkl')):
63 | read_data(args, tokenizer)
64 |
65 | train_dataloader = load_data(args, tokenizer)
66 |
67 | total_steps = args.num_epochs * len(train_dataloader)
68 |
69 | optimizer, scheduler = build_optimizer(args, model, total_steps)
70 |
71 | total_loss, cur_avg_loss, global_steps = 0., 0., 0
72 |
73 | for epoch in range(1, args.num_epochs + 1):
74 |
75 | train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
76 |
77 | model.train()
78 |
79 | for batch in train_iterator:
80 | batch_cuda = batch2cuda(args, batch)
81 | loss, logits = model(**batch_cuda)[:2]
82 |
83 |             # TSA: only back-propagate the loss for examples whose confidence exp(-loss) is still below the threshold
84 | start, end = 1. / logits.shape[-1], 1
85 | tsa_thresh = get_tsa_thresh(args, global_steps, total_steps, start, end)
86 | larger_than_threshold = torch.exp(-loss) > tsa_thresh
87 | loss_mask = torch.ones_like(batch_cuda['labels'], dtype=torch.float32) * (1 - larger_than_threshold.
88 | type(torch.float32))
89 | loss = torch.sum(loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1),
90 | torch.tensor(1.).to(args.device))
91 |
92 | total_loss += loss.item()
93 | cur_avg_loss += loss.item()
94 |
95 | loss.backward()
96 |
97 | if args.adv == 'pgd':
98 | pgd = PGD(args, model)
99 | K = args.adv_k
100 | pgd.backup_grad()
101 | for t in range(K):
102 | pgd.attack(is_first_attack=(t == 0))
103 | if t != K - 1:
104 | model.zero_grad()
105 | else:
106 | pgd.restore_grad()
107 | adv_loss, adv_logits = model(**batch_cuda)[:2]
108 | adv_loss.backward()
109 | pgd.restore()
110 |
111 | optimizer.step()
112 | scheduler.step()
113 | optimizer.zero_grad()
114 |
115 | if args.ema_start:
116 | ema.update()
117 |
118 | if epoch >= args.ema_start_epoch:
119 | args.ema_start = True
120 | ema = EMA(model.module if hasattr(model, 'module') else model, decay=0.999)
121 |
122 | if (global_steps + 1) % args.logging_step == 0:
123 | epoch_avg_loss = cur_avg_loss / args.logging_step
124 | global_avg_loss = total_loss / (global_steps + 1)
125 |
126 | print(f"\n>> epoch - {epoch}, global steps - {global_steps + 1}, "
127 | f"epoch avg loss - {epoch_avg_loss:.4f}, global avg loss - {global_avg_loss:.4f}.")
128 |
129 | cur_avg_loss = 0.0
130 |
131 | global_steps += 1
132 |
133 | if epoch >= args.ema_start_epoch:
134 | ema.apply_shadow()
135 |
136 | save_model(args, model, tokenizer)
137 |
138 | del model, tokenizer, optimizer, scheduler
139 | torch.cuda.empty_cache()
140 | gc.collect()
141 |
142 |
143 | def main():
144 | parser = ArgumentParser()
145 | parser.add_argument('--output_path', type=str,
146 | default='../../user_data/output_model')
147 | parser.add_argument('--train_path', type=str,
148 | default='../../user_data/process_data/train.txt')
149 | parser.add_argument('--data_cache_path', type=str,
150 | default='../../user_data/process_data/pkl')
151 | parser.add_argument('--vocab_path', type=str,
152 | default='../../user_data/tokenizer/vocab.txt')
153 | parser.add_argument('--model_path', type=str,
154 | default='../../user_data/saved_pretrain_model_record/checkpoint-240000')
155 |
156 | parser.add_argument('--num_epochs', type=int, default=4)
157 | parser.add_argument('--batch_size', type=int, default=32)
158 | parser.add_argument('--max_seq_len', type=int, default=128)
159 |
160 | parser.add_argument('--learning_rate', type=float, default=2e-5)
161 | parser.add_argument('--downstream_learning_rate', type=float, default=1e-4)
162 | parser.add_argument('--eps', type=float, default=1e-8)
163 |
164 | parser.add_argument('--adv_k', type=int, default=10)
165 | parser.add_argument('--alpha', type=float, default=0.3)
166 | parser.add_argument('--epsilon', type=float, default=0.5)
167 | parser.add_argument('--emb_name', type=str, default='word_embeddings.')
168 | parser.add_argument('--adv', type=str, default='pgd', choices=['', 'pgd'])
169 |
170 | parser.add_argument('--lookahead_k', type=int, default=5)
171 | parser.add_argument('--lookahead_alpha', type=int, default=1)
172 |
173 | parser.add_argument('--ema_start', type=bool, default=False)
174 | parser.add_argument('--ema_start_epoch', type=int, default=3)
175 |
176 | parser.add_argument('--schedule', type=str, default='log', choices=['linear', 'exp', 'log'])
177 |
178 | parser.add_argument('--warmup_ratio', type=float, default=0.1)
179 | parser.add_argument('--weight_decay', type=float, default=0.01)
180 |
181 | parser.add_argument('--logging_step', type=int, default=100)
182 |
183 | parser.add_argument('--seed', type=int, default=2021)
184 |
185 | parser.add_argument('--device', type=str, default='cuda')
186 |
187 | warnings.filterwarnings('ignore')
188 | args = parser.parse_args()
189 |
190 | os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
191 |
192 | seed_everything(args.seed)
193 | train(args)
194 |
195 |
196 | if __name__ == '__main__':
197 | main()
198 |
--------------------------------------------------------------------------------
/data/code/pretrain_code/run_pretrain.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import os
4 | import re
5 | import sys
6 | import random
7 | import warnings
8 | import numpy as np
9 | import pandas as pd
10 | from tqdm import tqdm
11 | from typing import List, Tuple
12 | from collections import defaultdict
13 | from argparse import ArgumentParser
14 |
15 | import torch
16 | from torch.utils.data import Dataset
17 | from transformers import BertTokenizer, TrainingArguments
18 |
19 | sys.path.append('../../../data')
20 | from data.code.util.others.hanzi import punctuation
21 | from data.code.util.pretrain_utils.trainer import Trainer
22 | from data.code.util.modeling.modeling_nezha.modeling import NeZhaConfig, NeZhaForMaskedLM
23 |
24 | warnings.filterwarnings('ignore')
25 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
26 | os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'
27 |
28 |
29 | def seed_everything(seed):
30 | random.seed(seed)
31 | np.random.seed(seed)
32 | torch.manual_seed(seed)
33 | torch.cuda.manual_seed_all(seed)
34 | return seed
35 |
36 |
37 | def read_data(pretrain_file_path, tokenizer: BertTokenizer) -> dict:
38 | pretrain_df = pd.read_csv(pretrain_file_path, header=None, sep='\t')
39 | inputs = defaultdict(list)
40 | for i, row in tqdm(pretrain_df.iterrows(), desc='', total=len(pretrain_df)):
41 | sentence = row[0].strip()
42 | sentence = re.sub(r"[%s]+" % punctuation, '[SEP]', sentence)
43 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True,
44 | return_token_type_ids=True, return_attention_mask=True)
45 | inputs['input_ids'].append(inputs_dict['input_ids'])
46 | inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
47 | inputs['attention_mask'].append(inputs_dict['attention_mask'])
48 |
49 | return inputs
50 |
51 |
52 | class DGDataset(Dataset):
53 | def __init__(self, data_dict: dict):
54 |         super(DGDataset, self).__init__()
55 | self.data_dict = data_dict
56 |
57 | def __getitem__(self, index: int) -> tuple:
58 | data = (self.data_dict['input_ids'][index],
59 | self.data_dict['token_type_ids'][index],
60 | self.data_dict['attention_mask'][index])
61 |
62 | return data
63 |
64 | def __len__(self) -> int:
65 | return len(self.data_dict['input_ids'])
66 |
67 |
68 | class DGDataCollator:
69 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer, mlm_probability=0.15):
70 | self.max_seq_len = max_seq_len
71 | self.tokenizer = tokenizer
72 | self.mlm_probability = mlm_probability
73 | self.special_token_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id}
74 |
75 | def pad_and_truncate(self, input_ids_list, token_type_ids_list,
76 | attention_mask_list, max_seq_len):
77 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long)
78 | token_type_ids = torch.zeros_like(input_ids)
79 | attention_mask = torch.zeros_like(input_ids)
80 | for i in range(len(input_ids_list)):
81 | seq_len = len(input_ids_list[i])
82 | if seq_len <= max_seq_len:
83 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long)
84 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long)
85 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long)
86 | else:
87 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id],
88 | dtype=torch.long)
89 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long)
90 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long)
91 | return input_ids, token_type_ids, attention_mask
92 |
93 | def _ngram_mask(self, input_ids, max_seq_len):
94 | cand_indexes = []
95 | for (i, id_) in enumerate(input_ids):
96 | if id_ in self.special_token_ids:
97 | continue
98 | cand_indexes.append([i])
99 | num_to_predict = max(1, int(round(len(input_ids) * self.mlm_probability)))
100 |
101 | max_ngram = 3
102 | ngrams = np.arange(1, max_ngram + 1, dtype=np.int64)
103 | pvals = 1. / np.arange(1, max_ngram + 1)
104 | pvals /= pvals.sum(keepdims=True)
105 |
106 | ngram_indexes = []
107 | for idx in range(len(cand_indexes)):
108 | ngram_index = []
109 | for n in ngrams:
110 | ngram_index.append(cand_indexes[idx:idx + n])
111 | ngram_indexes.append(ngram_index)
112 | np.random.shuffle(ngram_indexes)
113 |
114 | covered_indexes = set()
115 |
116 | for cand_index_set in ngram_indexes:
117 | if len(covered_indexes) >= num_to_predict:
118 | break
119 | if not cand_index_set:
120 | continue
121 | for index_set in cand_index_set[0]:
122 | for index in index_set:
123 | if index in covered_indexes:
124 | continue
125 | n = np.random.choice(ngrams[:len(cand_index_set)],
126 | p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True))
127 | index_set = sum(cand_index_set[n - 1], [])
128 | n -= 1
129 | while len(covered_indexes) + len(index_set) > num_to_predict:
130 | if n == 0:
131 | break
132 | index_set = sum(cand_index_set[n - 1], [])
133 | n -= 1
134 | if len(covered_indexes) + len(index_set) > num_to_predict:
135 | continue
136 | is_any_index_covered = False
137 | for index in index_set:
138 | if index in covered_indexes:
139 | is_any_index_covered = True
140 | break
141 | if is_any_index_covered:
142 | continue
143 | for index in index_set:
144 | covered_indexes.add(index)
145 |
146 | mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_ids))]
147 | mask_labels += [0] * (max_seq_len - len(mask_labels))
148 |
149 | return torch.tensor(mask_labels[:max_seq_len])
150 |
151 | def ngram_mask(self, input_ids_list: List[list], max_seq_len: int):
152 | mask_labels = []
153 | for i, input_ids in enumerate(input_ids_list):
154 | mask_label = self._ngram_mask(input_ids, max_seq_len)
155 | mask_labels.append(mask_label)
156 | return torch.stack(mask_labels, dim=0)
157 |
158 | def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> \
159 | Tuple[torch.Tensor, torch.Tensor]:
160 |
161 | labels = inputs.clone()
162 | probability_matrix = mask_labels
163 |
164 | # word struct prediction
165 |
166 | '''
167 | complete by yourself
168 | '''
169 |
170 | special_tokens_mask = [
171 | self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
172 | ]
173 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
174 | masked_indices = probability_matrix.bool()
175 | labels[~masked_indices] = -100
176 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
177 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
178 | indices_random = torch.bernoulli(
179 | torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
180 | random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
181 | inputs[indices_random] = random_words[indices_random]
182 | return inputs, labels
183 |
184 | def __call__(self, examples: list) -> dict:
185 | input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples))
186 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list)
187 | max_seq_len = min(cur_max_seq_len, self.max_seq_len)
188 |
189 | input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list,
190 | token_type_ids_list,
191 | attention_mask_list,
192 | max_seq_len)
193 | batch_mask = self.ngram_mask(input_ids_list, max_seq_len)
194 | input_ids, mlm_labels = self.mask_tokens(input_ids, batch_mask)
195 | data_dict = {
196 | 'input_ids': input_ids,
197 | 'attention_mask': attention_mask,
198 | 'token_type_ids': token_type_ids,
199 | 'labels': mlm_labels
200 | }
201 |
202 | return data_dict
203 |
204 |
205 | def main():
206 | parser = ArgumentParser()
207 | parser.add_argument('--pretrain_data_path', type=str, default='../../user_data/process_data/unlabeled_pretrain.txt')
208 | parser.add_argument('--pretrain_model_path', type=str, default='../../user_data/pretrain_model/nezha-cn-base')
209 | parser.add_argument('--vocab_path', type=str, default='../../user_data/tokenizer/vocab.txt')
210 | parser.add_argument('--save_path', type=str, default='../../user_data/saved_pretrain_model')
211 | parser.add_argument('--record_save_path', type=str, default='../../user_data/saved_pretrain_model_record')
212 | parser.add_argument('--mlm_probability', type=float, default=0.15)
213 | parser.add_argument('--num_train_epochs', type=int, default=100)
214 | parser.add_argument('--seq_length', type=int, default=128)
215 | parser.add_argument('--batch_size', type=int, default=64)
216 | parser.add_argument('--learning_rate', type=float, default=6e-5)
217 | parser.add_argument('--save_steps', type=int, default=10000)
218 | parser.add_argument('--ckpt_save_limit', type=int, default=6)
219 | parser.add_argument('--logging_steps', type=int, default=2000)
220 | parser.add_argument('--seed', type=int, default=2021)
221 | parser.add_argument('--fp16', type=str, default=True)
222 | parser.add_argument('--fp16_backend', type=str, default='amp')
223 |
224 | warnings.filterwarnings('ignore')
225 | args = parser.parse_args()
226 |
227 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
228 | os.makedirs(os.path.dirname(args.record_save_path), exist_ok=True)
229 |
230 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
231 | model_config = NeZhaConfig.from_pretrained(args.pretrain_model_path)
232 |
233 | data = read_data(args.pretrain_data_path, tokenizer)
234 |
235 | data_collator = DGDataCollator(max_seq_len=args.seq_length,
236 | tokenizer=tokenizer,
237 | mlm_probability=args.mlm_probability)
238 | model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=args.pretrain_model_path,
239 | config=model_config)
240 | model.resize_token_embeddings(tokenizer.vocab_size)
241 | dataset = DGDataset(data)
242 |
243 | training_args = TrainingArguments(
244 | seed=args.seed,
245 | fp16=args.fp16,
246 | fp16_backend=args.fp16_backend,
247 | save_steps=args.save_steps,
248 | prediction_loss_only=True,
249 | logging_steps=args.logging_steps,
250 | output_dir=args.record_save_path,
251 | learning_rate=args.learning_rate,
252 | save_total_limit=args.ckpt_save_limit,
253 | num_train_epochs=args.num_train_epochs,
254 | per_device_train_batch_size=args.batch_size
255 | )
256 |
257 | trainer = Trainer(
258 | model=model,
259 | args=training_args,
260 | train_dataset=dataset,
261 | data_collator=data_collator
262 | )
263 |
264 | trainer.train()
265 | trainer.save_model(args.save_path)
266 | tokenizer.save_pretrained(args.save_path)
267 |
268 |
269 | if __name__ == '__main__':
270 | main()
271 |
--------------------------------------------------------------------------------
/data/code/util/tools/finetune_tools.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pickle
4 | import random
5 | import numpy as np
6 | import pandas as pd
7 | from torch.optim import AdamW
8 | from torch.utils.data import Dataset, DataLoader
9 | from tqdm import tqdm
10 | from transformers import BertTokenizer
11 | from collections import defaultdict
12 |
13 | from torch.optim import Optimizer
14 | from torch.optim.lr_scheduler import LambdaLR
15 |
16 | sys.path.append('../../../../data')
17 | from data.code.models.nezha import *
18 |
19 |
20 | def seed_everything(seed):
21 | torch.manual_seed(seed)
22 | torch.cuda.manual_seed(seed)
23 | torch.cuda.manual_seed_all(seed)
24 | torch.backends.cudnn.benchmark = False
25 | torch.backends.cudnn.deterministic = True
26 | random.seed(seed)
27 | np.random.seed(seed)
28 | os.environ['PYTHONHASHSEED'] = str(seed)
29 |
30 |
31 | def batch2cuda(args, batch):
32 | return {item: value.to(args.device) for item, value in list(batch.items())}
33 |
34 |
35 | def build_model_and_tokenizer(args):
36 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
37 | model = NeZhaSequenceClassification_F.from_pretrained(args.model_path)
38 | model.to(args.device)
39 |
40 | return tokenizer, model
41 |
42 |
43 | class PGD:
44 | def __init__(self, args, model):
45 | self.model = model
46 | self.emb_backup = {}
47 | self.grad_backup = {}
48 | self.epsilon = args.epsilon
49 | self.emb_name = args.emb_name
50 | self.alpha = args.alpha
51 |
52 | def attack(self, is_first_attack=False):
53 | for name, param in self.model.bert.named_parameters():
54 | if param.requires_grad and self.emb_name in name:
55 | if is_first_attack:
56 | self.emb_backup[name] = param.data.clone()
57 | norm = torch.norm(param.grad)
58 | if norm != 0 and not torch.isnan(norm):
59 | r_at = self.alpha * param.grad / norm
60 | param.data.add_(r_at)
61 | param.data = self.project(name, param.data, self.epsilon)
62 |
63 | def restore(self):
64 | for name, param in self.model.bert.named_parameters():
65 | if param.requires_grad and self.emb_name in name:
66 | assert name in self.emb_backup
67 | param.data = self.emb_backup[name]
68 | self.emb_backup = {}
69 |
70 | def project(self, param_name, param_data, epsilon):
71 | r = param_data - self.emb_backup[param_name]
72 | if torch.norm(r) > epsilon:
73 | r = epsilon * r / torch.norm(r)
74 | return self.emb_backup[param_name] + r
75 |
76 | def backup_grad(self):
77 | for name, param in self.model.bert.named_parameters():
78 | if param.requires_grad and param.grad is not None:
79 | self.grad_backup[name] = param.grad.clone()
80 |
81 | def restore_grad(self):
82 | for name, param in self.model.bert.named_parameters():
83 | if param.requires_grad and param.grad is not None:
84 | param.grad = self.grad_backup[name]
85 |
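# Editor's sketch (added, not from the original file): the usual way a PGD helper like the
# one above is driven inside a training step. `adv_steps` and `batch` are placeholders, and
# it is assumed here that the model returns the loss as the first element of its output.
#
# pgd = PGD(args, model)
# loss = model(**batch)[0]
# loss.backward()                          # gradients on the clean batch
# pgd.backup_grad()
# for t in range(adv_steps):
#     pgd.attack(is_first_attack=(t == 0)) # perturb the word-embedding weights
#     if t != adv_steps - 1:
#         model.zero_grad()
#     else:
#         pgd.restore_grad()               # reuse the clean gradients on the last step
#     adv_loss = model(**batch)[0]
#     adv_loss.backward()                  # accumulate adversarial gradients
# pgd.restore()                            # put the original embeddings back
# optimizer.step()
# model.zero_grad()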
86 |
87 | class Lookahead(Optimizer):
88 | def __init__(self, optimizer, k=5, alpha=0.5):
89 | self.optimizer = optimizer
90 | self.k = k
91 | self.alpha = alpha
92 | self.param_groups = self.optimizer.param_groups
93 | self.state = defaultdict(dict)
94 | self.fast_state = self.optimizer.state
95 | for group in self.param_groups:
96 | group["counter"] = 0
97 |
98 | def update(self, group):
99 | for fast in group["params"]:
100 | param_state = self.state[fast]
101 | if "slow_param" not in param_state:
102 | param_state["slow_param"] = torch.zeros_like(fast.data)
103 | param_state["slow_param"].copy_(fast.data)
104 | slow = param_state["slow_param"]
105 | slow += (fast.data - slow) * self.alpha
106 | fast.data.copy_(slow)
107 |
108 | def update_lookahead(self):
109 | for group in self.param_groups:
110 | self.update(group)
111 |
112 | def step(self, closure=None):
113 | loss = self.optimizer.step(closure)
114 | for group in self.param_groups:
115 | if group["counter"] == 0:
116 | self.update(group)
117 | group["counter"] += 1
118 | if group["counter"] >= self.k:
119 | group["counter"] = 0
120 | return loss
121 |
122 | def state_dict(self):
123 | fast_state_dict = self.optimizer.state_dict()
124 | slow_state = {
125 | (id(k) if isinstance(k, torch.Tensor) else k): v
126 | for k, v in self.state.items()
127 | }
128 | fast_state = fast_state_dict["state"]
129 | param_groups = fast_state_dict["param_groups"]
130 | return {
131 | "fast_state": fast_state,
132 | "slow_state": slow_state,
133 | "param_groups": param_groups,
134 | }
135 |
136 | def load_state_dict(self, state_dict):
137 | slow_state_dict = {
138 | "state": state_dict["slow_state"],
139 | "param_groups": state_dict["param_groups"],
140 | }
141 | fast_state_dict = {
142 | "state": state_dict["fast_state"],
143 | "param_groups": state_dict["param_groups"],
144 | }
145 | super(Lookahead, self).load_state_dict(slow_state_dict)
146 | self.optimizer.load_state_dict(fast_state_dict)
147 | self.fast_state = self.optimizer.state
148 |
149 | def add_param_group(self, param_group):
150 | param_group["counter"] = 0
151 | self.optimizer.add_param_group(param_group)
152 |
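# Editor's note (added): each time a group's counter wraps back to 0 (i.e. every k calls to
# step(), including the first), update() pulls the slow weights toward the fast weights,
# slow <- slow + alpha * (fast - slow), and copies the result back into the fast weights.
# build_optimizer() below wraps AdamW with this class.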
153 |
154 | class EMA:
155 | def __init__(self, model, decay):
156 | self.model = model
157 | self.decay = decay
158 | self.shadow = {}
159 | self.backup = {}
160 | self.register()
161 |
162 | def register(self):
163 | for name, param in self.model.named_parameters():
164 | if param.requires_grad:
165 | self.shadow[name] = param.data.clone()
166 |
167 | def update(self):
168 | for name, param in self.model.named_parameters():
169 | if param.requires_grad:
170 | assert name in self.shadow
171 | new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
172 | self.shadow[name] = new_average.clone()
173 |
174 | def apply_shadow(self):
175 | for name, param in self.model.named_parameters():
176 | if param.requires_grad:
177 | assert name in self.shadow
178 | self.backup[name] = param.data
179 | param.data = self.shadow[name]
180 |
181 | def restore(self):
182 | for name, param in self.model.named_parameters():
183 | if param.requires_grad:
184 | assert name in self.backup
185 | param.data = self.backup[name]
186 | self.backup = {}
187 |
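# Editor's sketch (added, not from the original file): typical driving pattern for the EMA
# helper above; the decay value and the evaluate() call are placeholders.
#
# ema = EMA(model, decay=0.999)
# for batch in train_dataloader:
#     ...                        # forward / backward / optimizer.step()
#     ema.update()               # track an exponential moving average of the weights
# ema.apply_shadow()             # swap in the averaged weights for evaluation / saving
# evaluate(model)                # hypothetical evaluation call
# ema.restore()                  # swap the raw training weights back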
188 |
189 | class WarmupLinearSchedule(LambdaLR):
190 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
191 | self.warmup_steps = warmup_steps
192 | self.t_total = t_total
193 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
194 |
195 | def lr_lambda(self, step):
196 | if step < self.warmup_steps:
197 | return float(step) / float(max(1, self.warmup_steps))
198 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
199 |
200 |
201 | def build_optimizer(args, model, train_steps):
202 | no_decay = ['bias', 'LayerNorm.weight']
203 |
204 | bert_model_param, bert_downstream_param = [], []
205 |
206 | for name, param in model.named_parameters():
207 | if "bert" in name:
208 | bert_model_param.append((name, param))
209 | else:
210 | bert_downstream_param.append((name, param))
211 |
212 | optimizer_grouped_parameters = [
213 | {"params": [p for n, p in bert_model_param if
214 | not any(nd in n for nd in no_decay)],
215 | 'weight_decay': args.weight_decay, "lr": args.learning_rate},
216 | {'params': [p for n, p in bert_model_param if
217 | any(nd in n for nd in no_decay)],
218 | 'weight_decay': 0.0, 'lr': args.learning_rate},
219 |
220 | {"params": [p for n, p in bert_downstream_param if
221 | not any(nd in n for nd in no_decay)],
222 | 'weight_decay': args.weight_decay, "lr": args.downstream_learning_rate},
223 | {'params': [p for n, p in bert_downstream_param if
224 | any(nd in n for nd in no_decay)],
225 | 'weight_decay': 0.0, 'lr': args.downstream_learning_rate}
226 | ]
227 |
228 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.eps)
229 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup_ratio,
230 | t_total=train_steps)
231 | optimizer = Lookahead(optimizer, args.lookahead_k, args.lookahead_alpha)
232 |
233 | return optimizer, scheduler
234 |
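# Editor's sketch (added, not from the original file): how the pair returned by
# build_optimizer() is usually consumed. `total_steps` and the clipping value are
# placeholders, and the model is assumed to return the loss as the first output.
#
# optimizer, scheduler = build_optimizer(args, model, train_steps=total_steps)
# for batch in train_dataloader:
#     loss = model(**batch2cuda(args, batch))[0]
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
#     optimizer.step()           # Lookahead delegates to AdamW, then syncs the slow weights
#     scheduler.step()           # linear warmup + decay defined above
#     optimizer.zero_grad()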
235 |
236 | def save_model(args, model, tokenizer):
237 | model_to_save = model.module if hasattr(model, 'module') else model
238 | model_save_path = os.path.join(args.output_path, f'last-checkpoint')
239 | model_to_save.save_pretrained(model_save_path)
240 | tokenizer.save_vocabulary(model_save_path)
241 |
242 | print(f'model saved in: {model_save_path}')
243 |
244 |
245 | def get_tsa_thresh(args, global_step, num_train_steps, start, end):
246 | training_progress = torch.tensor(float(global_step) / float(num_train_steps))
247 |
248 | if args.schedule == 'linear':
249 | threshold = training_progress
250 | elif args.schedule == 'exp':
251 | scale = 5
252 | threshold = torch.exp((training_progress - 1) * scale)
253 | elif args.schedule == 'log':
254 | scale = 5
255 | threshold = 1 - torch.exp((-training_progress) * scale)
256 |
257 | output = threshold * (end - start) + start
258 |
259 | return output.to(args.device)
260 |
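# Editor's sketch (added, not from the original file): how a TSA threshold like the one above
# is typically applied (training signal annealing, as in UDA). `probs`, `labels`, the
# per-example `losses` and `num_labels` are placeholders from the caller.
#
# thresh = get_tsa_thresh(args, global_step, num_train_steps, start=1.0 / num_labels, end=1.0)
# correct_prob = probs.gather(1, labels.unsqueeze(1)).squeeze(1)  # model's prob. of the gold label
# keep = (correct_prob < thresh).float()                          # drop examples that are already "easy"
# loss = (losses * keep).sum() / keep.sum().clamp(min=1.0)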
261 |
262 | def read_data(args, tokenizer):
263 | train_df = pd.read_csv(args.train_path, header=None, sep='\t')
264 |
265 | inputs = defaultdict(list)
266 | for i, row in tqdm(train_df.iterrows(), desc=f'Preprocessing train data', total=len(train_df)):
267 | sentence, label, level1_label = row
268 | build_bert_inputs(inputs, label, level1_label, sentence, tokenizer)
269 |
270 | data_cache_path = args.data_cache_path
271 | if not os.path.exists(data_cache_path):
272 | os.makedirs(data_cache_path)
273 |
274 | cache_pkl_path = os.path.join(data_cache_path, 'train.pkl')
275 | with open(cache_pkl_path, 'wb') as f:
276 | pickle.dump(inputs, f)
277 |
278 | return cache_pkl_path
279 |
280 |
281 | def build_bert_inputs(inputs, label, level1_label, sentence, tokenizer):
282 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True,
283 | return_token_type_ids=True, return_attention_mask=True)
284 | inputs['input_ids'].append(inputs_dict['input_ids'])
285 | inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
286 | inputs['attention_mask'].append(inputs_dict['attention_mask'])
287 | inputs['labels'].append(label)
288 | inputs['level1_labels'].append(level1_label)
289 |
290 |
291 | class DGDataset(Dataset):
292 | def __init__(self, data_dict: dict, tokenizer: BertTokenizer):
293 | super(DGDataset, self).__init__()
294 | self.data_dict = data_dict
295 | self.tokenizer = tokenizer
296 |
297 | def __getitem__(self, index: int) -> tuple:
298 | data = (
299 | self.data_dict['input_ids'][index],
300 | self.data_dict['token_type_ids'][index],
301 | self.data_dict['attention_mask'][index],
302 | self.data_dict['labels'][index],
303 | self.data_dict['level1_labels'][index]
304 | )
305 |
306 | return data
307 |
308 | def __len__(self) -> int:
309 | return len(self.data_dict['input_ids'])
310 |
311 |
312 | class Collator:
313 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer):
314 | self.max_seq_len = max_seq_len
315 | self.tokenizer = tokenizer
316 |
317 | def pad_and_truncate(self, input_ids_list, token_type_ids_list,
318 | attention_mask_list, labels_list, level1_labels_list, max_seq_len):
319 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long)
320 | token_type_ids = torch.zeros_like(input_ids)
321 | attention_mask = torch.zeros_like(input_ids)
322 | for i in range(len(input_ids_list)):
323 | seq_len = len(input_ids_list[i])
324 | if seq_len <= max_seq_len:
325 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long)
326 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long)
327 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long)
328 | else:
329 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id],
330 | dtype=torch.long)
331 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long)
332 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long)
333 |
334 | labels = torch.tensor(labels_list, dtype=torch.long)
335 | level1_labels = torch.tensor(level1_labels_list, dtype=torch.long)
336 | return input_ids, token_type_ids, attention_mask, labels, level1_labels
337 |
338 | def __call__(self, examples: list) -> dict:
339 | input_ids_list, token_type_ids_list, attention_mask_list, labels_list, level1_labels_list = list(zip(*examples))
340 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list)
341 | max_seq_len = min(cur_max_seq_len, self.max_seq_len)
342 |
343 | input_ids, token_type_ids, attention_mask, labels, level1_labels = \
344 | self.pad_and_truncate(input_ids_list, token_type_ids_list, attention_mask_list,
345 | labels_list, level1_labels_list, max_seq_len)
346 |
347 | data_dict = {
348 | 'input_ids': input_ids,
349 | 'token_type_ids': token_type_ids,
350 | 'attention_mask': attention_mask,
351 | 'labels': labels,
352 | 'level1_labels': level1_labels
353 | }
354 |
355 | return data_dict
356 |
357 |
358 | def load_data(args, tokenizer):
359 | cache_pkl_path = os.path.join(args.data_cache_path, 'train.pkl')
360 |
361 | with open(cache_pkl_path, 'rb') as f:
362 | train_data = pickle.load(f)
363 |
364 | collate_fn = Collator(args.max_seq_len, tokenizer)
365 | train_dataset = DGDataset(train_data, tokenizer)
366 | train_dataloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
367 | num_workers=0, collate_fn=collate_fn)
368 | return train_dataloader
369 |
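# Editor's sketch (added, not from the original file): a quick shape check on one batch
# produced by load_data() above; args.batch_size / args.max_seq_len are illustrative.
#
# train_dataloader = load_data(args, tokenizer)
# batch = next(iter(train_dataloader))
# print({k: tuple(v.shape) for k, v in batch.items()})
# # expected: input_ids / token_type_ids / attention_mask -> (batch_size, seq_len),
# #           labels / level1_labels                      -> (batch_size,)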
--------------------------------------------------------------------------------
/data/code/util/pretrain_utils/trainer_args.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import json
4 | import os
5 | from dataclasses import asdict, dataclass, field
6 | from enum import Enum
7 | from typing import Any, Dict, List, Optional
8 |
9 | from transformers.file_utils import (
10 | cached_property,
11 | is_torch_available,
12 | is_torch_tpu_available,
13 | torch_required,
14 | )
15 | from transformers.trainer_utils import EvaluationStrategy, SchedulerType
16 | from transformers.utils import logging
17 |
18 |
19 | if is_torch_available():
20 | import torch
21 |
22 | if is_torch_tpu_available():
23 | import torch_xla.core.xla_model as xm
24 |
25 |
26 | logger = logging.get_logger(__name__)
27 |
28 |
29 | def default_logdir() -> str:
30 | """
31 | Same default as PyTorch
32 | """
33 | import socket
34 | from datetime import datetime
35 |
36 | current_time = datetime.now().strftime("%b%d_%H-%M-%S")
37 | return os.path.join("runs", current_time + "_" + socket.gethostname())
38 |
39 |
40 | @dataclass
41 | class TrainingArguments:
42 | """
43 | TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
44 | itself**.
45 |
46 | Using :class:`~transformers.HfArgumentParser` we can turn this class into argparse
47 | arguments that can be specified on the command
48 | line.
49 |
50 |
51 |
52 |
53 | Parameters:
54 | output_dir (:obj:`str`):
55 | The output directory where the model predictions and checkpoints will be written.
56 | overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
57 | If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
58 | :obj:`output_dir` points to a checkpoint directory.
59 | do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
60 | Whether to run training or not. This argument is not directly used by :class:`~transformers.Trainer`, it's
61 | intended to be used by your training/evaluation scripts instead. See the example scripts
62 | for more details.
63 | do_eval (:obj:`bool`, `optional`):
64 | Whether to run evaluation on the validation set or not. Will be set to :obj:`True` if
65 | :obj:`evaluation_strategy` is different from :obj:`"no"`. This argument is not directly used by
66 | :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
67 | the example scripts for more
68 | details.
69 | do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
70 | Whether to run predictions on the test set or not. This argument is not directly used by
71 | :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
72 | the example scripts for more
73 | details.
74 | evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`):
75 | The evaluation strategy to adopt during training. Possible values are:
76 |
77 | * :obj:`"no"`: No evaluation is done during training.
78 | * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`.
79 | * :obj:`"epoch"`: Evaluation is done at the end of each epoch.
80 |
81 | prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
82 | When performing evaluation and generating predictions, only returns the loss.
83 | per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
84 | The batch size per GPU/TPU core/CPU for training.
85 | per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
86 | The batch size per GPU/TPU core/CPU for evaluation.
87 | gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
88 | Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
89 |
90 | .. warning::
91 |
92 | When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
93 | logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training
94 | examples.
95 | eval_accumulation_steps (:obj:`int`, `optional`):
96 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If
97 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but
98 | requires more memory).
99 | learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
100 | The initial learning rate for :class:`~transformers.AdamW` optimizer.
101 | weight_decay (:obj:`float`, `optional`, defaults to 0):
102 | The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in
103 | :class:`~transformers.AdamW` optimizer.
104 | adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
105 | The beta1 hyperparameter for the :class:`~transformers.AdamW` optimizer.
106 | adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
107 | The beta2 hyperparameter for the :class:`~transformers.AdamW` optimizer.
108 | adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
109 | The epsilon hyperparameter for the :class:`~transformers.AdamW` optimizer.
110 | max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
111 | Maximum gradient norm (for gradient clipping).
112 | num_train_epochs(:obj:`float`, `optional`, defaults to 3.0):
113 | Total number of training epochs to perform (if not an integer, will perform the decimal part percents of
114 | the last epoch before stopping training).
115 | max_steps (:obj:`int`, `optional`, defaults to -1):
116 | If set to a positive number, the total number of training steps to perform. Overrides
117 | :obj:`num_train_epochs`.
118 | lr_scheduler_type (:obj:`str` or :class:`~transformers.SchedulerType`, `optional`, defaults to :obj:`"linear"`):
119 | The scheduler type to use. See the documentation of :class:`~transformers.SchedulerType` for all possible
120 | values.
121 | warmup_steps (:obj:`int`, `optional`, defaults to 0):
122 | Number of steps used for a linear warmup from 0 to :obj:`learning_rate`.
123 | logging_dir (:obj:`str`, `optional`):
124 | TensorBoard log directory. Will default to
125 | `runs/**CURRENT_DATETIME_HOSTNAME**`.
126 | logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
127 | Whether to log and evaluate the first :obj:`global_step` or not.
128 | logging_steps (:obj:`int`, `optional`, defaults to 500):
129 | Number of update steps between two logs.
130 | save_steps (:obj:`int`, `optional`, defaults to 500):
131 | Number of updates steps before two checkpoint saves.
132 | save_total_limit (:obj:`int`, `optional`):
133 | If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
134 | :obj:`output_dir`.
135 | no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
136 | Whether to avoid using CUDA even when it is available.
137 | seed (:obj:`int`, `optional`, defaults to 42):
138 | Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the
139 | :func:`~transformers.Trainer.model_init` function to instantiate the model if it has some randomly
140 | initialized parameters.
141 | fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
142 | Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training.
143 | fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
144 | For :obj:`fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
145 | on the `Apex documentation <https://nvidia.github.io/apex/amp.html>`__.
146 | fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`):
147 | The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or
148 | :obj:`"apex"`. :obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
149 | other choices will force the requested backend.
150 | local_rank (:obj:`int`, `optional`, defaults to -1):
151 | Rank of the process during distributed training.
152 | tpu_num_cores (:obj:`int`, `optional`):
153 | When training on TPU, the number of TPU cores (automatically passed by launcher script).
154 | debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
155 | When training on TPU, whether to print debug metrics or not.
156 | dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
157 | Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
158 | or not.
159 | eval_steps (:obj:`int`, `optional`):
160 | Number of update steps between two evaluations if :obj:`evaluation_strategy="steps"`. Will default to the
161 | same value as :obj:`logging_steps` if not set.
162 | dataloader_num_workers (:obj:`int`, `optional`, defaults to 0):
163 | Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
164 | main process.
165 | past_index (:obj:`int`, `optional`, defaults to -1):
166 | Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc:`XLNet <../model_doc/xlnet>` can
167 | make use of the past hidden states for their predictions. If this argument is set to a positive int, the
168 | ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
169 | at the next training step under the keyword argument ``mems``.
170 | run_name (:obj:`str`, `optional`):
171 | A descriptor for the run. Typically used for wandb logging.
172 | disable_tqdm (:obj:`bool`, `optional`):
173 | Whether or not to disable the tqdm progress bars and table of metrics produced by
174 | :class:`~transformers.notebook.NotebookTrainingTracker` in Jupyter Notebooks. Will default to :obj:`True`
175 | if the logging level is set to warn or lower (default), :obj:`False` otherwise.
176 | remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`):
177 | If using :obj:`datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the
178 | model forward method.
179 |
180 | (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.)
181 | label_names (:obj:`List[str]`, `optional`):
182 | The list of keys in your dictionary of inputs that correspond to the labels.
183 |
184 | Will eventually default to :obj:`["labels"]` except if the model used is one of the
185 | :obj:`XxxForQuestionAnswering` in which case it will default to :obj:`["start_positions",
186 | "end_positions"]`.
187 | load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`):
188 | Whether or not to load the best model found during training at the end of training.
189 |
190 | .. note::
191 |
192 | When set to :obj:`True`, the parameters :obj:`save_steps` will be ignored and the model will be saved
193 | after each evaluation.
194 | metric_for_best_model (:obj:`str`, `optional`):
195 | Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different
196 | models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`.
197 | Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation
198 | loss).
199 |
200 | If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
201 | :obj:`False` if your metric is better when lower.
202 | greater_is_better (:obj:`bool`, `optional`):
203 | Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better
204 | models should have a greater metric or not. Will default to:
205 |
206 | - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
207 | :obj:`"eval_loss"`.
208 | - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
209 | ignore_data_skip (:obj:`bool`, `optional`, defaults to :obj:`False`):
210 | When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
211 | stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
212 | step can take a long time) but will not yield the same results as the interrupted training would have.
213 | sharded_ddp (:obj:`bool`, `optional`, defaults to :obj:`False`):
214 | Use Sharded DDP training from FairScale (in distributed
215 | training only). This is an experimental feature.
216 | deepspeed (:obj:`str`, `optional`):
217 | Use Deepspeed. This is an experimental feature and its API may
218 | evolve in the future. The value is the location of its json config file (usually ``ds_config.json``).
219 | label_smoothing_factor (:obj:`float`, `optional`, defaults to 0.0):
220 | The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
221 | labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 -
222 | label_smoothing_factor + label_smoothing_factor/num_labels` respectively.
223 | adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`):
224 | Whether or not to use the :class:`~transformers.Adafactor` optimizer instead of
225 | :class:`~transformers.AdamW`.
226 | group_by_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
227 | Whether or not to group together samples of roughly the same length in the training dataset (to minimize
228 | padding applied and be more efficient). Only useful if applying dynamic padding.
229 | report_to (:obj:`List[str]`, `optional`, defaults to the list of integrations platforms installed):
230 | The list of integrations to report the results and logs to. Supported platforms are :obj:`"azure_ml"`,
231 | :obj:`"comet_ml"`, :obj:`"mlflow"`, :obj:`"tensorboard"` and :obj:`"wandb"`.
232 | ddp_find_unused_parameters (:obj:`bool`, `optional`):
233 | When using distributed training, the value of the flag :obj:`find_unused_parameters` passed to
234 | :obj:`DistributedDataParallel`. Will default to :obj:`False` if gradient checkpointing is used, :obj:`True`
235 | otherwise.
236 | dataloader_pin_memory (:obj:`bool`, `optional`, defaults to :obj:`True`)):
237 | Whether you want to pin memory in data loaders or not. Will default to :obj:`True`.
238 | """
239 |
240 | output_dir: Optional[str] = field(
241 | default=None,
242 | metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
243 | )
244 | overwrite_output_dir: bool = field(
245 | default=False,
246 | metadata={
247 | "help": (
248 | "Overwrite the content of the output directory."
249 | "Use this to continue training if output_dir points to a checkpoint directory."
250 | )
251 | },
252 | )
253 |
254 | do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
255 | do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
256 | do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
257 | evaluation_strategy: EvaluationStrategy = field(
258 | default="no",
259 | metadata={"help": "The evaluation strategy to use."},
260 | )
261 | prediction_loss_only: bool = field(
262 | default=False,
263 | metadata={"help": "When performing evaluation and predictions, only returns the loss."},
264 | )
265 |
266 | per_device_train_batch_size: int = field(
267 | default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
268 | )
269 | per_device_eval_batch_size: int = field(
270 | default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
271 | )
272 |
273 | per_gpu_train_batch_size: Optional[int] = field(
274 | default=None,
275 | metadata={
276 | "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
277 | "Batch size per GPU/TPU core/CPU for training."
278 | },
279 | )
280 | per_gpu_eval_batch_size: Optional[int] = field(
281 | default=None,
282 | metadata={
283 | "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred."
284 | "Batch size per GPU/TPU core/CPU for evaluation."
285 | },
286 | )
287 |
288 | gradient_accumulation_steps: int = field(
289 | default=1,
290 | metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
291 | )
292 | eval_accumulation_steps: Optional[int] = field(
293 | default=None,
294 | metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."},
295 | )
296 |
297 | learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
298 | weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
299 | adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
300 | adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
301 | adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
302 | max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
303 |
304 | num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
305 | max_steps: int = field(
306 | default=-1,
307 | metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
308 | )
309 | lr_scheduler_type: SchedulerType = field(
310 | default="linear",
311 | metadata={"help": "The scheduler type to use."},
312 | )
313 | warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
314 |
315 | logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
316 | logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
317 | logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
318 | save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
319 | save_total_limit: Optional[int] = field(
320 | default=None,
321 | metadata={
322 | "help": (
323 | "Limit the total amount of checkpoints."
324 | "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints"
325 | )
326 | },
327 | )
328 | no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"})
329 | seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
330 |
331 | fp16: bool = field(
332 | default=False,
333 | metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA Apex) instead of 32-bit"},
334 | )
335 | fp16_opt_level: str = field(
336 | default="O1",
337 | metadata={
338 | "help": (
339 | "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
340 | "See details at https://nvidia.github.io/apex/amp.html"
341 | )
342 | },
343 | )
344 | fp16_backend: str = field(
345 | default="auto",
346 | metadata={"help": "The backend to be used for mixed precision.", "choices": ["auto", "amp", "apex"]},
347 | )
348 | local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
349 |
350 | tpu_num_cores: Optional[int] = field(
351 | default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
352 | )
353 | tpu_metrics_debug: bool = field(
354 | default=False,
355 | metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"},
356 | )
357 | debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"})
358 |
359 | dataloader_drop_last: bool = field(
360 | default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
361 | )
362 | eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
363 | dataloader_num_workers: int = field(
364 | default=0,
365 | metadata={
366 | "help": "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process."
367 | },
368 | )
369 |
370 | past_index: int = field(
371 | default=-1,
372 | metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."},
373 | )
374 |
375 | run_name: Optional[str] = field(
376 | default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."}
377 | )
378 | disable_tqdm: Optional[bool] = field(
379 | default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."}
380 | )
381 |
382 | remove_unused_columns: Optional[bool] = field(
383 | default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."}
384 | )
385 | label_names: Optional[List[str]] = field(
386 | default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."}
387 | )
388 |
389 | load_best_model_at_end: Optional[bool] = field(
390 | default=False,
391 | metadata={"help": "Whether or not to load the best model found during training at the end of training."},
392 | )
393 | metric_for_best_model: Optional[str] = field(
394 | default=None, metadata={"help": "The metric to use to compare two different models."}
395 | )
396 | greater_is_better: Optional[bool] = field(
397 | default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
398 | )
399 | ignore_data_skip: bool = field(
400 | default=False,
401 | metadata={
402 | "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data."
403 | },
404 | )
405 | sharded_ddp: bool = field(
406 | default=False,
407 | metadata={"help": "Whether or not to use sharded DDP training (in distributed training only)."},
408 | )
409 | deepspeed: Optional[str] = field(
410 | default=None,
411 | metadata={"help": "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json)"},
412 | )
413 | label_smoothing_factor: float = field(
414 | default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
415 | )
416 | adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
417 | group_by_length: bool = field(
418 | default=False,
419 | metadata={"help": "Whether or not to group samples of roughly the same length together when batching."},
420 | )
421 | report_to: Optional[List[str]] = field(
422 | default=None, metadata={"help": "The list of integrations to report the results and logs to."}
423 | )
424 | ddp_find_unused_parameters: Optional[bool] = field(
425 | default=None,
426 | metadata={
427 | "help": "When using distributed training, the value of the flag `find_unused_parameters` passed to "
428 | "`DistributedDataParallel`."
429 | },
430 | )
431 | dataloader_pin_memory: bool = field(
432 | default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
433 | )
434 | _n_gpu: int = field(init=False, repr=False, default=-1)
435 |
436 | def __post_init__(self):
437 | if self.output_dir is None and os.getenv("SM_OUTPUT_DATA_DIR") is None:
438 | raise ValueError(
439 | "`output_dir` is only optional if it can get inferred from the environment. Please set a value for "
440 | "`output_dir`."
441 | )
442 | elif os.getenv("SM_OUTPUT_DATA_DIR") is not None:
443 | if self.output_dir is not None:
444 | logger.warn(
445 | "`output_dir` is overwritten by the env variable 'SM_OUTPUT_DATA_DIR' "
446 | f"({os.getenv('SM_OUTPUT_DATA_DIR')})."
447 | )
448 | self.output_dir = os.getenv("SM_OUTPUT_DATA_DIR")
449 | if self.disable_tqdm is None:
450 | self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
451 | self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy)
452 | self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type)
453 | if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO:
454 | self.do_eval = True
455 | if self.eval_steps is None:
456 | self.eval_steps = self.logging_steps
457 |
458 | if self.load_best_model_at_end and self.metric_for_best_model is None:
459 | self.metric_for_best_model = "loss"
460 | if self.greater_is_better is None and self.metric_for_best_model is not None:
461 | self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
462 | if self.run_name is None:
463 | self.run_name = self.output_dir
464 |
465 | if is_torch_available() and self.device.type != "cuda" and self.fp16:
466 | raise ValueError("Mixed precision training with AMP or APEX (`--fp16`) can only be used on CUDA devices.")
467 | if self.report_to is None:
468 | # Import at runtime to avoid a circular import.
469 | from transformers.integrations import get_available_reporting_integrations
470 |
471 | self.report_to = get_available_reporting_integrations()
472 |
473 | def __repr__(self):
474 | # We override the default repr to remove deprecated arguments from the repr. This method should be removed once
475 | # those deprecated arguments are removed from TrainingArguments. (TODO: v5)
476 | self_as_dict = asdict(self)
477 | del self_as_dict["per_gpu_train_batch_size"]
478 | del self_as_dict["per_gpu_eval_batch_size"]
479 | attrs_as_str = [f"{k}={v}" for k, v in self_as_dict.items()]
480 | return f"{self.__class__.__name__}({', '.join(attrs_as_str)})"
481 |
482 | @property
483 | def train_batch_size(self) -> int:
484 | """
485 | The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
486 | """
487 | if self.per_gpu_train_batch_size:
488 | logger.warning(
489 | "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
490 | "version. Using `--per_device_train_batch_size` is preferred."
491 | )
492 | per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
493 | train_batch_size = per_device_batch_size * max(1, self.n_gpu)
494 | return train_batch_size
495 |
496 | @property
497 | def eval_batch_size(self) -> int:
498 | """
499 | The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
500 | """
501 | if self.per_gpu_eval_batch_size:
502 | logger.warning(
503 | "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
504 | "version. Using `--per_device_eval_batch_size` is preferred."
505 | )
506 | per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
507 | eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
508 | return eval_batch_size
509 |
510 | @cached_property
511 | @torch_required
512 | def _setup_devices(self) -> "torch.device":
513 | logger.info("PyTorch: setting up devices")
514 | if self.no_cuda:
515 | device = torch.device("cpu")
516 | self._n_gpu = 0
517 | elif is_torch_tpu_available():
518 | device = xm.xla_device()
519 | self._n_gpu = 0
520 | elif self.deepspeed:
521 | # deepspeed performs its own DDP internally, and requires the program to be started with:
522 | # deepspeed ./program.py
523 | # rather than:
524 | # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
525 | from transformers.integrations import is_deepspeed_available
526 |
527 | if not is_deepspeed_available():
528 | raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
529 | import deepspeed
530 |
531 | deepspeed.init_distributed()
532 | device = torch.device("cuda", self.local_rank)
533 | self._n_gpu = 1
534 | elif self.local_rank == -1:
535 | # if n_gpu is > 1 we'll use nn.DataParallel.
536 | # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
537 | # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
538 | # trigger an error that a device index is missing. Index 0 takes into account the
539 | # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
540 | # will use the first GPU in that env, i.e. GPU#1
541 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
542 | # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
543 | # the default value.
544 | self._n_gpu = torch.cuda.device_count()
545 | else:
546 | # Here, we'll use torch.distributed.
547 | # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
548 | torch.distributed.init_process_group(backend="nccl")
549 | device = torch.device("cuda", self.local_rank)
550 | self._n_gpu = 1
551 |
552 | if device.type == "cuda":
553 | torch.cuda.set_device(device)
554 |
555 | return device
556 |
557 | @property
558 | @torch_required
559 | def device(self) -> "torch.device":
560 | """
561 | The device used by this process.
562 | """
563 | return self._setup_devices
564 |
565 | @property
566 | @torch_required
567 | def n_gpu(self):
568 | """
569 | The number of GPUs used by this process.
570 |
571 | Note:
572 | This will only be greater than one when you have multiple GPUs available but are not using distributed
573 | training. For distributed training, it will always be 1.
574 | """
575 | # Make sure `self._n_gpu` is properly setup.
576 | _ = self._setup_devices
577 | return self._n_gpu
578 |
579 | @property
580 | @torch_required
581 | def parallel_mode(self):
582 | """
583 | The current mode used for parallelism if multiple GPUs/TPU cores are available. One of:
584 |
585 | - :obj:`ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU).
586 | - :obj:`ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses :obj:`torch.nn.DataParallel`).
587 | - :obj:`ParallelMode.DISTRIBUTED`: several GPUs, each having its own process (uses
588 | :obj:`torch.nn.DistributedDataParallel`).
589 | - :obj:`ParallelMode.TPU`: several TPU cores.
590 | """
591 | if is_torch_tpu_available():
592 | return ParallelMode.TPU
593 | elif self.local_rank != -1:
594 | return ParallelMode.DISTRIBUTED
595 | elif self.n_gpu > 1:
596 | return ParallelMode.NOT_DISTRIBUTED
597 | else:
598 | return ParallelMode.NOT_PARALLEL
599 |
600 | def to_dict(self):
601 | """
602 | Serializes this instance while replacing `Enum` members by their values (for JSON serialization support).
603 | """
604 | d = asdict(self)
605 | for k, v in d.items():
606 | if isinstance(v, Enum):
607 | d[k] = v.value
608 | return d
609 |
610 | def to_json_string(self):
611 | """
612 | Serializes this instance to a JSON string.
613 | """
614 | return json.dumps(self.to_dict(), indent=2)
615 |
616 | def to_sanitized_dict(self) -> Dict[str, Any]:
617 | """
618 | Sanitized serialization to use with TensorBoard’s hparams
619 | """
620 | d = self.to_dict()
621 | d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}}
622 |
623 | valid_types = [bool, int, float, str]
624 | if is_torch_available():
625 | valid_types.append(torch.Tensor)
626 |
627 | return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
628 |
629 |
630 | class ParallelMode(Enum):
631 | NOT_PARALLEL = "not_parallel"
632 | NOT_DISTRIBUTED = "not_distributed"
633 | DISTRIBUTED = "distributed"
634 | SAGEMAKER_DISTRIBUTED = "sm_distributed"
635 | TPU = "tpu"
636 |
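# Editor's sketch (added, not from the original file): minimal construction of this local
# TrainingArguments copy, mirroring how run_pretrain.py above fills it in; the values are
# illustrative, and fp16=True requires a CUDA device (see __post_init__ above).
#
# args = TrainingArguments(
#     output_dir='../../user_data/saved_pretrain_model_record',
#     num_train_epochs=100,
#     per_device_train_batch_size=64,
#     learning_rate=6e-5,
#     fp16=True,
#     fp16_backend='amp',
# )
# print(args.train_batch_size)   # per-device batch size scaled by the number of visible GPUs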
--------------------------------------------------------------------------------
/data/code/util/modeling/modeling_nezha/modeling.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import logging
4 | import torch
5 |
6 | from torch import nn
7 | from torch.nn import CrossEntropyLoss, MSELoss
8 |
9 | from .configuration import NeZhaConfig
10 | from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
11 | from transformers.modeling_utils import PreTrainedModel, prune_linear_layer
12 | from transformers.models.bert.modeling_bert import (
13 | BertOutput,
14 | BertPooler,
15 | BertSelfOutput,
16 | BertIntermediate,
17 | BertOnlyMLMHead,
18 | BertOnlyNSPHead,
19 | BertLMPredictionHead,
20 | BERT_START_DOCSTRING,
21 | BERT_INPUTS_DOCSTRING,
22 | )
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 | _CONFIG_FOR_DOC = "NeZhaConfig"
27 | _TOKENIZER_FOR_DOC = "NeZhaTokenizer"
28 |
29 | NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = []
30 | NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {}
31 |
32 |
33 | def load_tf_weights_in_nezha(model, config, tf_checkpoint_path):
34 | """Load tf checkpoints in a pytorch model."""
35 | try:
36 | import re
37 | import numpy as np
38 | import tensorflow as tf
39 | except ImportError:
40 | logger.error(
41 | "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
42 | "https://www.tensorflow.org/install/ for installation instructions."
43 | )
44 | raise
45 |
46 | tf_path = os.path.abspath(tf_checkpoint_path)
47 | logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
48 | # Load weights from TF model
49 | init_vars = tf.train.list_variables(tf_path)
50 | names = []
51 | arrays = []
52 | for name, shape in init_vars:
53 | # logger.info("Loading TF weight {} with shape {}".format(name, shape))
54 | array = tf.train.load_variable(tf_path, name)
55 | names.append(name)
56 | arrays.append(array)
57 |
58 | for name, array in zip(names, arrays):
59 | name = name.split("/")
60 | # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
61 | # which are not required for using pretrained model
62 | if any(
63 | n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1",
64 | "global_step", "good_steps", "loss_scale", 'bad_steps']
65 | for n in name
66 | ):
67 | logger.info("Skipping {}".format("/".join(name)))
68 | continue
69 | pointer = model
70 | for m_name in name:
71 | if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
72 | scope_names = re.split(r"_(\d+)", m_name)
73 | else:
74 | scope_names = [m_name]
75 | if scope_names[0] == "kernel" or scope_names[0] == "gamma":
76 | pointer = getattr(pointer, "weight")
77 | elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
78 | pointer = getattr(pointer, "bias")
79 | elif scope_names[0] == "output_weights":
80 | pointer = getattr(pointer, "weight")
81 | elif scope_names[0] == "squad":
82 | pointer = getattr(pointer, "classifier")
83 | else:
84 | try:
85 | pointer = getattr(pointer, scope_names[0])
86 | except AttributeError:
87 | logger.info("Skipping {}".format("/".join(name)))
88 | continue
89 | if len(scope_names) >= 2:
90 | num = int(scope_names[1])
91 | pointer = pointer[num]
92 | if m_name[-11:] == "_embeddings":
93 | pointer = getattr(pointer, "weight")
94 | elif m_name == "kernel":
95 | array = np.transpose(array)
96 | try:
97 | assert (
98 | pointer.shape == array.shape
99 | ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
100 | except AssertionError as e:
101 | e.args += (pointer.shape, array.shape)
102 | raise
103 | logger.info("Initialize PyTorch weight {}".format(name))
104 | pointer.data = torch.from_numpy(array)
105 | return model
106 |
107 |
108 | class NeZhaEmbeddings(nn.Module):
109 | """
110 | Construct the embeddings from word, position and token_type embeddings.
111 | """
112 |
113 | def __init__(self, config):
114 | super().__init__()
115 | self.use_relative_position = config.use_relative_position
116 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
117 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
118 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
119 | # any TensorFlow checkpoint file
120 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
121 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
122 |
123 | def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None):
124 | if input_ids is not None:
125 | input_shape = input_ids.size()
126 | else:
127 | input_shape = inputs_embeds.size()[:-1]
128 | device = input_ids.device if input_ids is not None else inputs_embeds.device
129 | if token_type_ids is None:
130 | token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
131 | if inputs_embeds is None:
132 | inputs_embeds = self.word_embeddings(input_ids)
133 | token_type_embeddings = self.token_type_embeddings(token_type_ids)
134 |
135 | embeddings = inputs_embeds + token_type_embeddings
136 |
137 | # embeddings = inputs_embeds + token_type_embeddings
138 | embeddings = self.LayerNorm(embeddings)
139 | embeddings = self.dropout(embeddings)
140 | return embeddings
141 |
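# Editor's note (added): unlike BERT, this embedding layer deliberately has no absolute
# position embeddings; positional information is injected later through the relative-position
# terms computed in NeZhaSelfAttention below.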
142 |
143 | def relative_position_encoding(depth, max_length=512, max_relative_position=127):
144 | vocab_size = max_relative_position * 2 + 1
145 | range_vec = torch.arange(max_length)
146 | range_mat = range_vec.repeat(max_length).view(max_length, max_length)
147 | distance_mat = range_mat - torch.t(range_mat)
148 | distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position)
149 | final_mat = distance_mat_clipped + max_relative_position
150 |
151 | embeddings_table = torch.zeros(vocab_size, depth)
152 | position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
153 | div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth))
154 | embeddings_table[:, 0::2] = torch.sin(position * div_term)
155 | embeddings_table[:, 1::2] = torch.cos(position * div_term)
156 | embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1)
157 |
158 | flat_relative_positions_matrix = final_mat.view(-1)
159 | one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix,
160 | num_classes=vocab_size).float()
161 | positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table)
162 | my_shape = list(final_mat.size())
163 | my_shape.append(depth)
164 | positions_encoding = positions_encoding.view(my_shape)
165 | return positions_encoding
166 |
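# Editor's sketch (added, not from the original file): the table built above is indexed by
# (query position, key position) and holds a sinusoidal embedding of the clipped relative
# distance. A quick sanity check:
#
# enc = relative_position_encoding(depth=64, max_length=128, max_relative_position=64)
# print(enc.shape)                                # torch.Size([128, 128, 64])
# print(torch.allclose(enc[0, 10], enc[5, 15]))   # True: equal relative distance, equal vector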
167 |
168 | class NeZhaSelfAttention(nn.Module):
169 | def __init__(self, config):
170 | super().__init__()
171 | if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
172 | raise ValueError(
173 | "The hidden size (%d) is not a multiple of the number of attention "
174 | "heads (%d)" % (config.hidden_size, config.num_attention_heads)
175 | )
176 | self.output_attentions = config.output_attentions
177 |
178 | self.num_attention_heads = config.num_attention_heads
179 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
180 | self.all_head_size = self.num_attention_heads * self.attention_head_size
181 |
182 | self.query = nn.Linear(config.hidden_size, self.all_head_size)
183 | self.key = nn.Linear(config.hidden_size, self.all_head_size)
184 | self.value = nn.Linear(config.hidden_size, self.all_head_size)
185 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
186 |
187 | self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings,
188 | depth=self.attention_head_size,
189 | max_relative_position=config.max_relative_position)
190 |
191 | def transpose_for_scores(self, x):
192 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
193 | x = x.view(*new_x_shape)
194 | return x.permute(0, 2, 1, 3)
195 |
196 | def forward(
197 | self,
198 | hidden_states,
199 | attention_mask=None,
200 | head_mask=None,
201 | encoder_hidden_states=None,
202 | encoder_attention_mask=None,
203 | ):
204 |
205 | mixed_query_layer = self.query(hidden_states)
206 |
207 | # If this is instantiated as a cross-attention module, the keys
208 | # and values come from an encoder; the attention mask needs to be
209 | # such that the encoder's padding tokens are not attended to.
210 | if encoder_hidden_states is not None:
211 | mixed_key_layer = self.key(encoder_hidden_states)
212 | mixed_value_layer = self.value(encoder_hidden_states)
213 | attention_mask = encoder_attention_mask
214 | else:
215 | mixed_key_layer = self.key(hidden_states)
216 | mixed_value_layer = self.value(hidden_states)
217 |
218 | query_layer = self.transpose_for_scores(mixed_query_layer)
219 | key_layer = self.transpose_for_scores(mixed_key_layer)
220 | value_layer = self.transpose_for_scores(mixed_value_layer)
221 |
222 | # Take the dot product between "query" and "key" to get the raw attention scores.
223 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
224 |
225 | batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size()
226 |
227 | relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device)
228 | query_layer_t = query_layer.permute(2, 0, 1, 3)
229 |
230 | query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads,
231 | self.attention_head_size)
232 | key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1))
233 | key_position_scores_r = key_position_scores.view(from_seq_length, batch_size,
234 | num_attention_heads, from_seq_length)
235 | key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3)
236 | attention_scores = attention_scores + key_position_scores_r_t
237 |
238 | attention_scores = attention_scores / math.sqrt(self.attention_head_size)
239 | if attention_mask is not None:
240 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
241 | attention_scores = attention_scores + attention_mask
242 |
243 | # Normalize the attention scores to probabilities.
244 | attention_probs = nn.Softmax(dim=-1)(attention_scores)
245 |
246 | # This is actually dropping out entire tokens to attend to, which might
247 | # seem a bit unusual, but is taken from the original Transformer paper.
248 | attention_probs = self.dropout(attention_probs)
249 |
250 | # Mask heads if we want to
251 | if head_mask is not None:
252 | attention_probs = attention_probs * head_mask
253 |
254 | context_layer = torch.matmul(attention_probs, value_layer)
255 |
256 | relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device)
257 | attention_probs_t = attention_probs.permute(2, 0, 1, 3)
258 | attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads,
259 | to_seq_length)
260 | value_position_scores = torch.matmul(attentions_probs_r, relations_values)
261 | value_position_scores_r = value_position_scores.view(from_seq_length, batch_size,
262 | num_attention_heads, self.attention_head_size)
263 | value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3)
264 | context_layer = context_layer + value_position_scores_r_t
265 |
266 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
267 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
268 | context_layer = context_layer.view(*new_context_layer_shape)
269 |
270 | outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
271 | return outputs
272 |
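# Editor's note (added): relative to vanilla BERT self-attention, the forward pass above adds
# two terms: the query vectors are matched against the relative-position table when computing
# the attention scores, and the attention probabilities are matched against the same table
# again when building the context vectors (NEZHA's functional relative positional encoding).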
273 |
274 | class NeZhaAttention(nn.Module):
275 | def __init__(self, config):
276 | super().__init__()
277 | self.self = NeZhaSelfAttention(config)
278 | self.output = BertSelfOutput(config)
279 | self.pruned_heads = set()
280 |
281 | def prune_heads(self, heads):
282 | if len(heads) == 0:
283 | return
284 | mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
285 | heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
286 | for head in heads:
287 | # Compute how many pruned heads are before the head and move the index accordingly
288 | head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
289 | mask[head] = 0
290 | mask = mask.view(-1).contiguous().eq(1)
291 | index = torch.arange(len(mask))[mask].long()
292 | # Prune linear layers
293 | self.self.query = prune_linear_layer(self.self.query, index)
294 | self.self.key = prune_linear_layer(self.self.key, index)
295 | self.self.value = prune_linear_layer(self.self.value, index)
296 | self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
297 | # Update hyper params and store pruned heads
298 | self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
299 | self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
300 | self.pruned_heads = self.pruned_heads.union(heads)
301 |
302 | def forward(
303 | self,
304 | hidden_states,
305 | attention_mask=None,
306 | head_mask=None,
307 | encoder_hidden_states=None,
308 | encoder_attention_mask=None,
309 | ):
310 | self_outputs = self.self(
311 | hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
312 | )
313 | attention_output = self.output(self_outputs[0], hidden_states)
314 | outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
315 | return outputs
316 |
317 |
318 | class NeZhaLayer(nn.Module):
319 | def __init__(self, config):
320 | super().__init__()
321 | self.attention = NeZhaAttention(config)
322 | self.is_decoder = config.is_decoder
323 | if self.is_decoder:
324 | self.crossattention = NeZhaAttention(config)
325 | self.intermediate = BertIntermediate(config)
326 | self.output = BertOutput(config)
327 |
328 | def forward(
329 | self,
330 | hidden_states,
331 | attention_mask=None,
332 | head_mask=None,
333 | encoder_hidden_states=None,
334 | encoder_attention_mask=None,
335 | ):
336 | self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
337 | attention_output = self_attention_outputs[0]
338 | outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
339 |
340 | if self.is_decoder and encoder_hidden_states is not None:
341 | cross_attention_outputs = self.crossattention(
342 | attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
343 | )
344 | attention_output = cross_attention_outputs[0]
345 | outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
346 |
347 | intermediate_output = self.intermediate(attention_output)
348 | layer_output = self.output(intermediate_output, attention_output)
349 | outputs = (layer_output,) + outputs
350 | return outputs
351 |
352 |
353 | class NeZhaEncoder(nn.Module):
354 | def __init__(self, config):
355 | super().__init__()
356 | self.output_attentions = config.output_attentions
357 | # self.output_hidden_states = config.output_hidden_states
358 | self.output_hidden_states = True
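    |         # NOTE: hidden states are always collected here regardless of
    |         # config.output_hidden_states (the config-driven line above is left as a
    |         # comment), presumably so downstream code can always read per-layer outputs.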
359 | self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)])
360 |
361 | def forward(
362 | self,
363 | hidden_states,
364 | attention_mask=None,
365 | head_mask=None,
366 | encoder_hidden_states=None,
367 | encoder_attention_mask=None,
368 | ):
369 | all_hidden_states = ()
370 | all_attentions = ()
371 | for i, layer_module in enumerate(self.layer):
372 | if self.output_hidden_states:
373 | all_hidden_states = all_hidden_states + (hidden_states,)
374 | layer_outputs = layer_module(
375 | hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
376 | )
377 | hidden_states = layer_outputs[0]
378 | if self.output_attentions:
379 | all_attentions = all_attentions + (layer_outputs[1],)
380 | # Add last layer
381 | if self.output_hidden_states:
382 | all_hidden_states = all_hidden_states + (hidden_states,)
383 |
384 | outputs = (hidden_states,)
385 | if self.output_hidden_states:
386 | outputs = outputs + (all_hidden_states,)
387 | if self.output_attentions:
388 | outputs = outputs + (all_attentions,)
389 | return outputs # last-layer hidden state, (all hidden states), (all attentions)
390 |
391 |
392 | class NeZhaPreTrainedModel(PreTrainedModel):
393 | """ An abstract class to handle weights initialization and
394 | a simple interface for downloading and loading pretrained models.
395 | """
396 | config_class = NeZhaConfig
397 | pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP
398 | load_tf_weights = load_tf_weights_in_nezha
399 | base_model_prefix = "bert"
400 |
401 | def _init_weights(self, module):
402 | """ Initialize the weights """
403 | if isinstance(module, (nn.Linear, nn.Embedding)):
404 | # Slightly different from the TF version which uses truncated_normal for initialization
405 | # cf https://github.com/pytorch/pytorch/pull/5617
406 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
407 | elif isinstance(module, nn.LayerNorm):
408 | module.bias.data.zero_()
409 | module.weight.data.fill_(1.0)
410 | if isinstance(module, nn.Linear) and module.bias is not None:
411 | module.bias.data.zero_()
412 |
413 |
414 | @add_start_docstrings(
415 | "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
416 | BERT_START_DOCSTRING,
417 | )
418 | class NeZhaModel(NeZhaPreTrainedModel):
419 | """
420 | The model can behave as an encoder (with only self-attention) as well
421 | as a decoder, in which case a layer of cross-attention is added between
422 | the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
423 | Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
424 |
425 |     To behave as a decoder, the model needs to be initialized with the
426 |     :obj:`is_decoder` argument of the configuration set to :obj:`True`;
427 |     :obj:`encoder_hidden_states` is then expected as an input to the forward pass.
428 |
429 | .. _`Attention is all you need`:
430 | https://arxiv.org/abs/1706.03762
431 |
432 | """
433 |
434 | def __init__(self, config):
435 | super().__init__(config)
436 | self.config = config
437 | self.embeddings = NeZhaEmbeddings(config)
438 | self.encoder = NeZhaEncoder(config)
439 | self.pooler = BertPooler(config)
440 | self.init_weights()
441 |
442 | def get_input_embeddings(self):
443 | return self.embeddings.word_embeddings
444 |
445 | def set_input_embeddings(self, value):
446 | self.embeddings.word_embeddings = value
447 |
448 | def _prune_heads(self, heads_to_prune):
449 | """ Prunes heads of the model.
450 | heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
451 | See base class PreTrainedModel
452 | """
453 | for layer, heads in heads_to_prune.items():
454 | self.encoder.layer[layer].attention.prune_heads(heads)
455 |
456 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
457 | def forward(
458 | self,
459 | input_ids=None,
460 | attention_mask=None,
461 | token_type_ids=None,
462 | head_mask=None,
463 | position_ids=None,
464 | inputs_embeds=None,
465 | encoder_hidden_states=None,
466 | encoder_attention_mask=None,
467 | ):
468 | r"""
469 |         Returns:
470 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
471 | last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
472 | Sequence of hidden-states at the output of the last layer of the model.
473 |             pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
474 | Last layer hidden-state of the first token of the sequence (classification token)
475 | further processed by a Linear layer and a Tanh activation function. The Linear
476 | layer weights are trained from the next sentence prediction (classification)
477 | objective during pre-training.
478 |
479 |             This output is usually *not* a good summary
480 |             of the semantic content of the input; you're often better off averaging or pooling
481 |             the sequence of hidden-states for the whole input sequence.
482 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
483 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
484 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
485 |
486 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
487 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
488 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
489 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
490 |
491 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
492 | heads.
493 |
494 | Examples::
495 |
496 | from transformers import BertModel, BertTokenizer
497 | import torch
498 |
499 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
500 | model = BertModel.from_pretrained('bert-base-uncased')
501 |
502 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
503 | outputs = model(input_ids)
504 |
505 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
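    | 
    |             # A minimal sketch with this file's NeZha classes instead of BERT
    |             # (the checkpoint directory below is a placeholder, not a path from
    |             # this repo):
    |             # model = NeZhaModel.from_pretrained('path/to/nezha-checkpoint')
    |             # sequence_output, pooled_output = model(input_ids)[:2]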
506 |
507 | """
508 |
509 | if input_ids is not None and inputs_embeds is not None:
510 | raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
511 | elif input_ids is not None:
512 | input_shape = input_ids.size()
513 | elif inputs_embeds is not None:
514 | input_shape = inputs_embeds.size()[:-1]
515 | else:
516 | raise ValueError("You have to specify either input_ids or inputs_embeds")
517 |
518 | device = input_ids.device if input_ids is not None else inputs_embeds.device
519 |
520 | if attention_mask is None:
521 | attention_mask = torch.ones(input_shape, device=device)
522 | if token_type_ids is None:
523 | token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
524 |
525 | # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
526 | # ourselves in which case we just need to make it broadcastable to all heads.
527 | extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
528 | attention_mask, input_shape, self.device
529 | )
530 |
531 |         # If a 2D or 3D attention mask is provided for the cross-attention,
532 |         # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
533 | if self.config.is_decoder and encoder_hidden_states is not None:
534 | encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
535 | encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
536 | if encoder_attention_mask is None:
537 | encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
538 | encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
539 | else:
540 | encoder_extended_attention_mask = None
541 |
542 | # Prepare head mask if needed
543 |         # 1.0 in head_mask indicates we keep the head
544 | # attention_probs has shape bsz x n_heads x N x N
545 | # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
546 | # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
547 | head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
548 |
549 | embedding_output = self.embeddings(
550 | input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
551 | )
552 | encoder_outputs = self.encoder(
553 | embedding_output,
554 | attention_mask=extended_attention_mask,
555 | head_mask=head_mask,
556 | encoder_hidden_states=encoder_hidden_states,
557 | encoder_attention_mask=encoder_extended_attention_mask,
558 | )
559 | sequence_output = encoder_outputs[0]
560 | pooled_output = self.pooler(sequence_output)
561 |
562 | outputs = (sequence_output, pooled_output,) + encoder_outputs[
563 | 1:
564 | ] # add hidden_states and attentions if they are here
565 | return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
566 |
567 |
568 | class BertPreTrainingHeads(nn.Module):
569 | def __init__(self, config):
570 | super().__init__()
571 | self.predictions = BertLMPredictionHead(config)
572 | self.seq_relationship = nn.Linear(config.hidden_size, 2)
573 |
574 | def forward(self, sequence_output, pooled_output):
575 | prediction_scores = self.predictions(sequence_output)
576 | seq_relationship_score = self.seq_relationship(pooled_output)
577 | return prediction_scores, seq_relationship_score
578 |
579 |
580 | @add_start_docstrings(
581 | """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
582 | a `next sentence prediction (classification)` head. """,
583 | BERT_START_DOCSTRING,
584 | )
585 | class NeZhaForPreTraining(NeZhaPreTrainedModel):
586 | def __init__(self, config):
587 | super().__init__(config)
588 | self.bert = NeZhaModel(config)
589 | self.cls = BertPreTrainingHeads(config)
590 | self.init_weights()
591 |
592 | def get_output_embeddings(self):
593 | return self.cls.predictions.decoder
594 |
595 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
596 | def forward(
597 | self,
598 | input_ids=None,
599 | attention_mask=None,
600 | token_type_ids=None,
601 | head_mask=None,
602 | position_ids=None,
603 | inputs_embeds=None,
604 | labels=None,
605 | sentence_span_labels=None,
606 | ):
607 |
608 | outputs = self.bert(
609 | input_ids,
610 | attention_mask=attention_mask,
611 | token_type_ids=token_type_ids,
612 | head_mask=head_mask,
613 | inputs_embeds=inputs_embeds,
614 | )
615 |
616 | sequence_output, pooled_output = outputs[:2]
617 | prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
618 | # add hidden states and attention if they are here
619 | outputs = (prediction_scores, seq_relationship_score,) + outputs[2:]
620 |
621 | if labels is not None and sentence_span_labels is not None:
622 | loss_fct = CrossEntropyLoss()
623 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
624 |
625 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), sentence_span_labels.view(-1))
626 |
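    |             # The block below adds an extra confidence-sharpening term on top of
    |             # the sentence-level loss: the head's own argmax predictions are reused
    |             # as pseudo labels and the resulting loss is mixed in with a fixed 0.5
    |             # weight. This is a repo-specific deviation from vanilla BERT pre-training.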
627 | pseudo_labels = torch.argmax(torch.softmax(seq_relationship_score, -1), 1)
628 | pseudo_loss = loss_fct(seq_relationship_score.view(-1, 2), pseudo_labels.view(-1))
629 | next_sentence_loss = next_sentence_loss + 0.5 * pseudo_loss
630 |
631 | total_loss = masked_lm_loss + next_sentence_loss
632 | outputs = (total_loss,) + outputs
633 |
634 | return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
635 |
636 |
637 | @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
638 | class NeZhaForMaskedLM(NeZhaPreTrainedModel):
639 | def __init__(self, config):
640 | super().__init__(config)
641 | self.bert = NeZhaModel(config)
642 | self.cls = BertOnlyMLMHead(config)
643 | self.init_weights()
644 |
645 | def get_output_embeddings(self):
646 | return self.cls.predictions.decoder
647 |
648 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
649 | def forward(
650 | self,
651 | input_ids=None,
652 | attention_mask=None,
653 | token_type_ids=None,
654 | head_mask=None,
655 | position_ids=None,
656 | inputs_embeds=None,
657 | encoder_hidden_states=None,
658 | encoder_attention_mask=None,
659 | labels=None,
660 | ):
661 | r"""
662 |         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
663 | Labels for computing the masked language modeling loss.
664 | Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
665 | Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
666 | in ``[0, ..., config.vocab_size]``
667 | lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
668 | Labels for computing the left-to-right language modeling loss (next word prediction).
669 | Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
670 | Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
671 | in ``[0, ..., config.vocab_size]``
672 |
673 | Returns:
674 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
675 |             masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
676 | Masked language modeling loss.
677 | ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided):
678 | Next token prediction loss.
679 | prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
680 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
681 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
682 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
683 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
684 |
685 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
686 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
687 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
688 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
689 |
690 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
691 | heads.
692 |
693 | Examples::
694 |
695 | from transformers import BertTokenizer, BertForMaskedLM
696 | import torch
697 |
698 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
699 | model = BertForMaskedLM.from_pretrained('bert-base-uncased')
700 |
701 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
702 |             outputs = model(input_ids, labels=input_ids)
703 |
704 | loss, prediction_scores = outputs[:2]
705 |
706 | """
707 | outputs = self.bert(
708 | input_ids,
709 | attention_mask=attention_mask,
710 | token_type_ids=token_type_ids,
711 | head_mask=head_mask,
712 | inputs_embeds=inputs_embeds,
713 | encoder_hidden_states=encoder_hidden_states,
714 | encoder_attention_mask=encoder_attention_mask,
715 | )
716 |
717 | sequence_output = outputs[0]
718 | prediction_scores = self.cls(sequence_output)
719 | outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
720 |
721 | # Although this may seem awkward, BertForMaskedLM supports two scenarios:
722 | # 1. If a tensor that contains the indices of masked labels is provided,
723 | # the cross-entropy is the MLM cross-entropy that measures the likelihood
724 | # of predictions for masked words.
725 | # 2. If `lm_labels` is provided we are in a causal scenario where we
726 | # try to predict the next token for each input in the decoder.
727 | masked_lm_labels = None
728 | if labels is not None:
729 | loss_fct = CrossEntropyLoss() # -100 index = padding token
730 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
731 | outputs = (masked_lm_loss,) + outputs
732 | return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
733 |
734 | def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
735 | input_shape = input_ids.shape
736 | effective_batch_size = input_shape[0]
737 |
738 |         # if the model is used as a decoder in an encoder-decoder model, the decoder attention mask is created on the fly
739 | if attention_mask is None:
740 | attention_mask = input_ids.new_ones(input_shape)
741 |
742 |         # if the model does not use a causal mask, then add a dummy token
743 | if self.config.is_decoder is False:
744 | assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
745 | attention_mask = torch.cat(
746 | [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1
747 | )
748 |
749 | dummy_token = torch.full(
750 | (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
751 | )
752 | input_ids = torch.cat([input_ids, dummy_token], dim=1)
753 |
754 | return {"input_ids": input_ids, "attention_mask": attention_mask}
755 |
756 |
757 | @add_start_docstrings(
758 | """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
759 | )
760 | class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel):
761 | def __init__(self, config):
762 | super().__init__(config)
763 | self.bert = NeZhaModel(config)
764 | self.cls = BertOnlyNSPHead(config)
765 | self.init_weights()
766 |
767 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
768 | def forward(
769 | self,
770 | input_ids=None,
771 | attention_mask=None,
772 | token_type_ids=None,
773 | head_mask=None,
774 | position_ids=None,
775 | inputs_embeds=None,
776 | next_sentence_label=None,
777 | ):
778 | r"""
779 | next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
780 | Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
781 | Indices should be in ``[0, 1]``.
782 | ``0`` indicates sequence B is a continuation of sequence A,
783 | ``1`` indicates sequence B is a random sequence.
784 |
785 | Returns:
786 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
787 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided):
788 | Next sequence prediction (classification) loss.
789 | seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
790 | Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
791 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
792 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
793 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
794 |
795 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
796 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
797 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
798 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
799 |
800 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
801 | heads.
802 |
803 | Examples::
804 |
805 | from transformers import BertTokenizer, BertForNextSentencePrediction
806 | import torch
807 |
808 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
809 | model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
810 |
811 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
812 | outputs = model(input_ids)
813 |
814 | seq_relationship_scores = outputs[0]
815 |
816 | """
817 |
818 | outputs = self.bert(
819 | input_ids,
820 | attention_mask=attention_mask,
821 | token_type_ids=token_type_ids,
822 | head_mask=head_mask,
823 | inputs_embeds=inputs_embeds,
824 | )
825 |
826 | pooled_output = outputs[1]
827 | seq_relationship_score = self.cls(pooled_output)
828 | outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
829 | if next_sentence_label is not None:
830 | loss_fct = CrossEntropyLoss()
831 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
832 | outputs = (next_sentence_loss,) + outputs
833 |
834 | return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
835 |
836 |
837 | @add_start_docstrings(
838 | """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
839 | the pooled output) e.g. for GLUE tasks. """,
840 | BERT_START_DOCSTRING,
841 | )
842 | class NeZhaForSequenceClassification(NeZhaPreTrainedModel):
843 | def __init__(self, config):
844 | super().__init__(config)
845 | self.num_labels = config.num_labels
846 | self.bert = NeZhaModel(config)
847 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
848 | self.classifier = nn.Linear(config.hidden_size, config.num_labels)
849 | self.init_weights()
850 |
851 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
852 | def forward(
853 | self,
854 | input_ids=None,
855 | attention_mask=None,
856 | token_type_ids=None,
857 | position_ids=None,
858 | head_mask=None,
859 | inputs_embeds=None,
860 | labels=None,
861 | ):
862 | r"""
863 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
864 | Labels for computing the sequence classification/regression loss.
865 | Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
866 |             If :obj:`config.num_labels == 1`, a regression loss is computed (Mean-Square loss);
867 |             if :obj:`config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
868 |
869 | Returns:
870 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
871 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
872 | Classification (or regression if config.num_labels==1) loss.
873 | logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
874 | Classification (or regression if config.num_labels==1) scores (before SoftMax).
875 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
876 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
877 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
878 |
879 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
880 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
881 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
882 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
883 |
884 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
885 | heads.
886 |
887 | Examples::
888 |
889 | from transformers import BertTokenizer, BertForSequenceClassification
890 | import torch
891 |
892 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
893 | model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
894 |
895 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
896 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
897 | outputs = model(input_ids, labels=labels)
898 |
899 | loss, logits = outputs[:2]
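    | 
    |             # A minimal sketch with this file's classification head (the checkpoint
    |             # directory and label count below are placeholders, not values from this repo):
    |             # model = NeZhaForSequenceClassification.from_pretrained(
    |             #     'path/to/nezha-checkpoint', num_labels=10)
    |             # loss, logits = model(input_ids, labels=labels)[:2]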
900 |
901 | """
902 |
903 | outputs = self.bert(
904 | input_ids,
905 | attention_mask=attention_mask,
906 | token_type_ids=token_type_ids,
907 | head_mask=head_mask,
908 | inputs_embeds=inputs_embeds,
909 | )
910 |
911 | pooled_output = outputs[1]
912 |
913 | pooled_output = self.dropout(pooled_output)
914 | logits = self.classifier(pooled_output)
915 |
916 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
917 |
918 | if labels is not None:
919 | if self.num_labels == 1:
920 | # We are doing regression
921 | loss_fct = MSELoss()
922 | loss = loss_fct(logits.view(-1), labels.view(-1))
923 | else:
924 | loss_fct = CrossEntropyLoss()
925 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
926 | outputs = (loss,) + outputs
927 |
928 | return outputs # (loss), logits, (hidden_states), (attentions)
929 |
930 |
931 | @add_start_docstrings(
932 | """Bert Model with a multiple choice classification head on top (a linear layer on top of
933 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
934 | BERT_START_DOCSTRING,
935 | )
936 | class NeZhaForMultipleChoice(NeZhaPreTrainedModel):
937 | def __init__(self, config):
938 | super().__init__(config)
939 | self.bert = NeZhaModel(config)
940 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
941 | self.classifier = nn.Linear(config.hidden_size, 1)
942 | self.init_weights()
943 |
944 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
945 | def forward(
946 | self,
947 | input_ids=None,
948 | attention_mask=None,
949 | token_type_ids=None,
950 | head_mask=None,
951 | position_ids=None,
952 | inputs_embeds=None,
953 | labels=None,
954 | ):
955 | r"""
956 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
957 | Labels for computing the multiple choice classification loss.
958 |             Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
959 | of the input tensors. (see `input_ids` above)
960 |
961 | Returns:
962 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
963 | loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
964 | Classification loss.
965 | classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
966 | `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
967 |
968 | Classification scores (before SoftMax).
969 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
970 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
971 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
972 |
973 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
974 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
975 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
976 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
977 |
978 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
979 | heads.
980 |
981 | Examples::
982 |
983 | from transformers import BertTokenizer, BertForMultipleChoice
984 | import torch
985 |
986 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
987 | model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
988 | choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
989 |
990 | input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
991 | labels = torch.tensor(1).unsqueeze(0) # Batch size 1
992 | outputs = model(input_ids, labels=labels)
993 |
994 | loss, classification_scores = outputs[:2]
995 |
996 | """
997 | num_choices = input_ids.shape[1]
998 |
999 | input_ids = input_ids.view(-1, input_ids.size(-1))
1000 | attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1001 | token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1002 |
1003 | outputs = self.bert(
1004 | input_ids,
1005 | attention_mask=attention_mask,
1006 | token_type_ids=token_type_ids,
1007 | head_mask=head_mask,
1008 | inputs_embeds=inputs_embeds,
1009 | )
1010 |
1011 | pooled_output = outputs[1]
1012 |
1013 | pooled_output = self.dropout(pooled_output)
1014 | logits = self.classifier(pooled_output)
1015 | reshaped_logits = logits.view(-1, num_choices)
1016 |
1017 | outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
1018 |
1019 | if labels is not None:
1020 | loss_fct = CrossEntropyLoss()
1021 | loss = loss_fct(reshaped_logits, labels)
1022 | outputs = (loss,) + outputs
1023 |
1024 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
1025 |
1026 |
1027 | @add_start_docstrings(
1028 | """Bert Model with a token classification head on top (a linear layer on top of
1029 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
1030 | BERT_START_DOCSTRING,
1031 | )
1032 | class NeZhaForTokenClassification(NeZhaPreTrainedModel):
1033 | def __init__(self, config):
1034 | super().__init__(config)
1035 | self.num_labels = config.num_labels
1036 | self.bert = NeZhaModel(config)
1037 | self.dropout = nn.Dropout(config.hidden_dropout_prob)
1038 | self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1039 | self.init_weights()
1040 |
1041 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1042 | def forward(
1043 | self,
1044 | input_ids=None,
1045 | attention_mask=None,
1046 | token_type_ids=None,
1047 | head_mask=None,
1048 | position_ids=None,
1049 | inputs_embeds=None,
1050 | labels=None,
1051 | ):
1052 | r"""
1053 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
1054 | Labels for computing the token classification loss.
1055 | Indices should be in ``[0, ..., config.num_labels - 1]``.
1056 |
1057 | Returns:
1058 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1059 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
1060 | Classification loss.
1061 | scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
1062 | Classification scores (before SoftMax).
1063 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
1064 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1065 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1066 |
1067 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1068 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
1069 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1070 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1071 |
1072 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1073 | heads.
1074 |
1075 | Examples::
1076 |
1077 | from transformers import BertTokenizer, BertForTokenClassification
1078 | import torch
1079 |
1080 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1081 | model = BertForTokenClassification.from_pretrained('bert-base-uncased')
1082 |
1083 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
1084 | labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
1085 | outputs = model(input_ids, labels=labels)
1086 |
1087 | loss, scores = outputs[:2]
1088 |
1089 | """
1090 |
1091 | outputs = self.bert(
1092 | input_ids,
1093 | attention_mask=attention_mask,
1094 | token_type_ids=token_type_ids,
1095 | head_mask=head_mask,
1096 | inputs_embeds=inputs_embeds,
1097 | )
1098 |
1099 | sequence_output = outputs[0]
1100 |
1101 | sequence_output = self.dropout(sequence_output)
1102 | logits = self.classifier(sequence_output)
1103 |
1104 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
1105 | if labels is not None:
1106 | loss_fct = CrossEntropyLoss()
1107 | # Only keep active parts of the loss
1108 | if attention_mask is not None:
1109 | active_loss = attention_mask.view(-1) == 1
1110 | active_logits = logits.view(-1, self.num_labels)
1111 | active_labels = torch.where(
1112 | active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
1113 | )
1114 | loss = loss_fct(active_logits, active_labels)
1115 | else:
1116 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1117 | outputs = (loss,) + outputs
1118 |
1119 | return outputs # (loss), scores, (hidden_states), (attentions)
1120 |
1121 |
1122 | @add_start_docstrings(
1123 | """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1124 | layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
1125 | BERT_START_DOCSTRING,
1126 | )
1127 | class NeZhaForQuestionAnswering(NeZhaPreTrainedModel):
1128 | def __init__(self, config):
1129 | super().__init__(config)
1130 | self.num_labels = config.num_labels
1131 | self.bert = NeZhaModel(config)
1132 | self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1133 | self.init_weights()
1134 |
1135 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1136 | def forward(
1137 | self,
1138 | input_ids=None,
1139 | attention_mask=None,
1140 | token_type_ids=None,
1141 | head_mask=None,
1142 | inputs_embeds=None,
1143 | position_ids=None,
1144 | start_positions=None,
1145 | end_positions=None,
1146 | ):
1147 | r"""
1148 | start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1149 | Labels for position (index) of the start of the labelled span for computing the token classification loss.
1150 | Positions are clamped to the length of the sequence (`sequence_length`).
1151 | Position outside of the sequence are not taken into account for computing the loss.
1152 | end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1153 | Labels for position (index) of the end of the labelled span for computing the token classification loss.
1154 | Positions are clamped to the length of the sequence (`sequence_length`).
1155 | Position outside of the sequence are not taken into account for computing the loss.
1156 |
1157 | Returns:
1158 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1159 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
1160 | Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
1161 | start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
1162 | Span-start scores (before SoftMax).
1163 | end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
1164 | Span-end scores (before SoftMax).
1165 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
1166 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1167 | of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1168 |
1169 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1170 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
1171 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1172 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1173 |
1174 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1175 | heads.
1176 |
1177 | Examples::
1178 |
1179 | from transformers import BertTokenizer, BertForQuestionAnswering
1180 | import torch
1181 |
1182 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1183 | model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
1184 |
1185 | question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
1186 | encoding = tokenizer.encode_plus(question, text)
1187 | input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
1188 | start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
1189 |
1190 | all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
1191 | answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
1192 |
1193 | assert answer == "a nice puppet"
1194 |
1195 | """
1196 |
1197 | outputs = self.bert(
1198 | input_ids,
1199 | attention_mask=attention_mask,
1200 | token_type_ids=token_type_ids,
1201 | head_mask=head_mask,
1202 | inputs_embeds=inputs_embeds,
1203 | )
1204 |
1205 | sequence_output = outputs[0]
1206 |
1207 | logits = self.qa_outputs(sequence_output)
1208 | start_logits, end_logits = logits.split(1, dim=-1)
1209 | start_logits = start_logits.squeeze(-1)
1210 | end_logits = end_logits.squeeze(-1)
1211 |
1212 | outputs = (start_logits, end_logits,) + outputs[2:]
1213 | if start_positions is not None and end_positions is not None:
1214 |             # If we are on multi-GPU, the position tensors may carry an extra (size-1) dimension; remove it
1215 | if len(start_positions.size()) > 1:
1216 | start_positions = start_positions.squeeze(-1)
1217 | if len(end_positions.size()) > 1:
1218 | end_positions = end_positions.squeeze(-1)
1219 |             # sometimes the start/end positions are outside our model inputs; we ignore these terms
1220 | ignored_index = start_logits.size(1)
1221 | start_positions.clamp_(0, ignored_index)
1222 | end_positions.clamp_(0, ignored_index)
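     |             # Positions clamped to `ignored_index` (the sequence length, i.e. one past
     |             # the last valid index) match the `ignore_index` of the loss below, so
     |             # out-of-range start/end labels contribute nothing to the loss.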
1223 |
1224 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1225 | start_loss = loss_fct(start_logits, start_positions)
1226 | end_loss = loss_fct(end_logits, end_positions)
1227 | total_loss = (start_loss + end_loss) / 2
1228 | outputs = (total_loss,) + outputs
1229 |
1230 | return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
1231 |
--------------------------------------------------------------------------------