├── data
│   ├── requirements.txt
│   ├── read_me.md
│   └── code
│       ├── models
│       │   ├── __pycache__
│       │   │   └── nezha.cpython-37.pyc
│       │   └── nezha.py
│       ├── util
│       │   ├── others
│       │   │   ├── __pycache__
│       │   │   │   ├── hanzi.cpython-37.pyc
│       │   │   │   └── label2id.cpython-37.pyc
│       │   │   ├── label2id.py
│       │   │   └── hanzi.py
│       │   ├── tools
│       │   │   ├── __pycache__
│       │   │   │   ├── predict_tools.cpython-37.pyc
│       │   │   │   └── finetune_tools.cpython-37.pyc
│       │   │   ├── predict_tools.py
│       │   │   └── finetune_tools.py
│       │   ├── pretrain_utils
│       │   │   ├── __pycache__
│       │   │   │   ├── trainer.cpython-37.pyc
│       │   │   │   └── trainer_args.cpython-37.pyc
│       │   │   └── trainer_args.py
│       │   └── modeling
│       │       └── modeling_nezha
│       │           ├── __pycache__
│       │           │   ├── modeling.cpython-37.pyc
│       │           │   └── configuration.cpython-37.pyc
│       │           ├── configuration.py
│       │           └── modeling.py
│       ├── fusion_code
│       │   └── run_fusion.py
│       ├── predict_code
│       │   └── run_predictor.py
│       ├── build_vocab
│       │   └── build_vocab.py
│       ├── process_data
│       │   └── process_data.py
│       ├── finetune_code
│       │   └── run_classify.py
│       └── pretrain_code
│           └── run_pretrain.py
├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── daguancup_end2end.iml
├── .gitattributes
└── READ_ME.md

/data/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.7.1
2 | transformers==4.3.0.rc1
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
--------------------------------------------------------------------------------
/data/read_me.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/read_me.md
--------------------------------------------------------------------------------
/data/code/models/__pycache__/nezha.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/models/__pycache__/nezha.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/others/__pycache__/hanzi.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/others/__pycache__/hanzi.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/others/__pycache__/label2id.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/others/__pycache__/label2id.cpython-37.pyc
--------------------------------------------------------------------------------
/data/code/util/tools/__pycache__/predict_tools.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/tools/__pycache__/predict_tools.cpython-37.pyc
-------------------------------------------------------------------------------- /data/code/util/tools/__pycache__/finetune_tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/tools/__pycache__/finetune_tools.cpython-37.pyc -------------------------------------------------------------------------------- /data/code/util/pretrain_utils/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/pretrain_utils/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /data/code/util/pretrain_utils/__pycache__/trainer_args.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/pretrain_utils/__pycache__/trainer_args.cpython-37.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/modeling/modeling_nezha/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/__pycache__/configuration.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuxuelinwudi/daguancup_end2end/HEAD/data/code/util/modeling/modeling_nezha/__pycache__/configuration.cpython-37.pyc -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/daguancup_end2end.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /data/code/fusion_code/run_fusion.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import sys 5 | import csv 6 | import numpy as np 7 | import pandas as pd 8 | 9 | sys.path.append('../../../data') 10 | from argparse import ArgumentParser 11 | from data.code.util.others.label2id import id2label 12 | 13 | 14 | def fusion(args): 15 | k, predictions = 0, 0 16 | 17 | tmp = pd.read_csv(os.path.join(args.result_path, 'output_result', 
'full_logit.csv')) 18 | tmp = tmp.values 19 | predictions += tmp 20 | predictions = np.argmax(predictions, axis=-1) 21 | result = [] 22 | for i in predictions: 23 | result.append((k, id2label[str(i)])) 24 | k += 1 25 | write2tsv(args.submit_path, result) 26 | 27 | 28 | def write2tsv(output_path, data): 29 | with open(output_path, 'w', newline='') as f: 30 | tsv_w = csv.writer(f, delimiter=',') 31 | tsv_w.writerow(['id', 'label']) 32 | tsv_w.writerows(data) 33 | 34 | 35 | def main(): 36 | parser = ArgumentParser() 37 | parser.add_argument('--result_path', type=str, default="../../user_data") 38 | parser.add_argument('--submit_path', type=str, default=f'../../prediction_result/result.csv') 39 | 40 | args = parser.parse_args() 41 | 42 | fusion(args) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /data/code/util/others/label2id.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | label2id = { 4 | '1-1': 0, 5 | '1-10': 1, 6 | '1-4': 2, 7 | '1-9': 3, 8 | '10-26': 4, 9 | '2-11': 5, 10 | '2-14': 6, 11 | '2-17': 7, 12 | '2-2': 8, 13 | '2-25': 9, 14 | '2-3': 10, 15 | '2-33': 11, 16 | '2-6': 12, 17 | '3-5': 13, 18 | '4-7': 14, 19 | '5-12': 15, 20 | '5-22': 16, 21 | '5-24': 17, 22 | '5-30': 18, 23 | '5-35': 19, 24 | '6-13': 20, 25 | '6-15': 21, 26 | '6-19': 22, 27 | '6-20': 23, 28 | '6-21': 24, 29 | '6-28': 25, 30 | '6-29': 26, 31 | '6-31': 27, 32 | '6-32': 28, 33 | '6-34': 29, 34 | '6-8': 30, 35 | '7-16': 31, 36 | '8-18': 32, 37 | '8-27': 33, 38 | '9-23': 34 39 | } 40 | 41 | id2label = { 42 | '0': '1-1', 43 | '1': '1-10', 44 | '2': '1-4', 45 | '3': '1-9', 46 | '4': '10-26', 47 | '5': '2-11', 48 | '6': '2-14', 49 | '7': '2-17', 50 | '8': '2-2', 51 | '9': '2-25', 52 | '10': '2-3', 53 | '11': '2-33', 54 | '12': '2-6', 55 | '13': '3-5', 56 | '14': '4-7', 57 | '15': '5-12', 58 | '16': '5-22', 59 | '17': '5-24', 60 | '18': '5-30', 61 | '19': '5-35', 62 | '20': '6-13', 63 | '21': '6-15', 64 | '22': '6-19', 65 | '23': '6-20', 66 | '24': '6-21', 67 | '25': '6-28', 68 | '26': '6-29', 69 | '27': '6-31', 70 | '28': '6-32', 71 | '29': '6-34', 72 | '30': '6-8', 73 | '31': '7-16', 74 | '32': '8-18', 75 | '33': '8-27', 76 | '34': '9-23' 77 | } 78 | 79 | # print(label2id['9-23']) 80 | # print(id2label['0']) -------------------------------------------------------------------------------- /data/code/predict_code/run_predictor.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import sys 4 | import warnings 5 | from argparse import ArgumentParser 6 | 7 | sys.path.append('../../../data') 8 | from data.code.util.tools.predict_tools import * 9 | 10 | 11 | def main(): 12 | parser = ArgumentParser() 13 | 14 | parser.add_argument('--vocab_path', type=str, default='../../user_data/tokenizer/vocab.txt') 15 | parser.add_argument('--output_result_path', type=str, default='../../user_data/output_result') 16 | parser.add_argument('--data_cache_path', type=str, default='../../user_data/process_data/pkl') 17 | parser.add_argument('--test_path', type=str, default='../../user_data/process_data/test.txt') 18 | parser.add_argument('--load_model_path', type=str, default='../../user_data/output_model') 19 | parser.add_argument('--batch_size', type=int, default=128 * 8) 20 | parser.add_argument('--max_seq_len', type=int, default=128) 21 | parser.add_argument('--device', type=str, default='cuda') 22 | 23 | args = parser.parse_args() 
24 |     warnings.filterwarnings('ignore')
25 | 
26 |     os.makedirs(args.output_result_path, exist_ok=True)
27 |     tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
28 | 
29 |     if not os.path.exists(os.path.join(args.data_cache_path, 'test.pkl')):
30 |         read_data(args, tokenizer)
31 | 
32 |     test_dataloader = load_data(args, tokenizer)
33 | 
34 |     model = NeZhaSequenceClassification_P.from_pretrained(os.path.join(args.load_model_path, f'last-checkpoint'))
35 |     model.to(args.device)
36 |     model.eval()
37 | 
38 |     final_res = predict(test_dataloader, model, args)
39 |     final_res.tolist()
40 |     save2csv(args, final_res)
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     main()
45 | 
--------------------------------------------------------------------------------
/READ_ME.md:
--------------------------------------------------------------------------------
1 | # 0. The 5th Daguan Cup. Team name: XiaoChuan Sun. 4th on the A leaderboard, 7th on the B leaderboard, with a single model throughout.
2 | ##### Competition address: https://www.datafountain.cn/competitions/512/ranking?isRedance=0&sch=1804
3 | 
4 | # 1. Data processing details
5 | 
6 | ##### 1.1. The maximum sentence length is limited to 128 tokens; any sentence longer than 128 is truncated by keeping the first 32 and the last 96 tokens.
7 | 
8 | 
9 | # 2. Pretraining details
10 | 
11 | ##### 2.1. The pretraining corpus is the first 180K JSON records (title + content) of the unlabeled data, about 360K texts in total. The training-set and test-set texts are not used (I simply forgot to include them).
12 | 
13 | ##### 2.2. The base model is nezha-cn-base. The pretraining tasks are ALBERT-style n-gram masking plus a Word Structural Objective borrowed from StructBERT: at masking time a randomly chosen trigram is shuffled, and the model must both predict the original tokens and restore their original order, which amounts to an improved version of the StructBERT task.
14 | 
15 | 
16 | # 3. Fine-tuning details
17 | 
18 | ##### 3.1. The usual tricks: PGD, Lookahead, EMA, stratified (layer-wise) learning rates, TSA, etc.
19 | ##### 3.2. The model architecture is customized as follows.
20 | ###### 3.2.1. Concatenating the [CLS] vectors of the last five hidden layers works best (many other structures were tried, e.g. an appended CNN/LSTM, MSD, mean pooling).
21 | ###### 3.2.2. Because each sample carries two levels of labels, the labels are split (primary: 10 classes, secondary: 35 classes) and the two losses are computed separately: the output hidden state goes through two linear layers, each with an output dimension matching its number of labels.
22 | ###### 3.2.3. A self-devised trick: during fine-tuning, in every batch the model also predicts on the training samples, and the loss between those predictions and the true labels is fed back, pulling the predicted labels toward the true labels. It brought a small improvement here and on other datasets I tested; I have not investigated it deeply, but it is a novel point.
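
The Word Structural Objective described in 2.2 above is left as a `complete by yourself` placeholder inside `DGDataCollator.mask_tokens` in `run_pretrain.py`. The sketch below is only an illustration of how such a trigram shuffle could be added on top of the existing n-gram MLM labels; the function name, its signature, and the assumption that `[PAD]` has id 0 are mine, not part of this repository.

```python
import random

import torch


def word_structural_shuffle(input_ids: torch.Tensor,
                            labels: torch.Tensor,
                            special_token_ids: set,
                            pad_id: int = 0) -> None:
    """Illustrative sketch (not the repository's code): for each sequence pick one
    trigram that is not already part of the MLM objective, shuffle it in place,
    and set its labels to the original tokens so the model must restore the order."""
    for row in range(input_ids.size(0)):
        ids = input_ids[row].tolist()
        # candidates: ordinary tokens that are not special tokens, not padding, and
        # not already supervised by the n-gram mask (their label is still -100)
        cand = {i for i, t in enumerate(ids)
                if t not in special_token_ids and t != pad_id and labels[row, i].item() == -100}
        starts = [i for i in cand if i + 1 in cand and i + 2 in cand]
        if not starts:
            continue
        start = random.choice(starts)
        pos = torch.tensor([start, start + 1, start + 2])
        original = input_ids[row, pos].clone()
        input_ids[row, pos] = original[torch.randperm(3)]  # disrupt the trigram
        labels[row, pos] = original                        # supervise restoration
```

One way to wire it in would be to call `word_structural_shuffle(inputs, labels, self.special_token_ids)` right before `mask_tokens` returns `(inputs, labels)`, i.e. after the regular `[MASK]`/random-token replacement, so that nothing later overwrites the restoration labels.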
23 | -------------------------------------------------------------------------------- /data/code/build_vocab/build_vocab.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import torch 5 | import random 6 | import logging 7 | import warnings 8 | import numpy as np 9 | from argparse import ArgumentParser 10 | 11 | from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer 12 | from transformers import BertTokenizer 13 | 14 | logging.basicConfig() 15 | logger = logging.getLogger('build vocab') 16 | logger.setLevel(logging.INFO) 17 | 18 | 19 | def seed_everything(seed): 20 | random.seed(seed) 21 | os.environ['PYTHONHASHSEED'] = str(seed) 22 | np.random.seed(seed) 23 | torch.manual_seed(seed) 24 | torch.cuda.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | torch.backends.cudnn.benchmark = False 27 | torch.backends.cudnn.deterministic = True 28 | 29 | 30 | def train_tokenizer(args): 31 | tokenizer = BertWordPieceTokenizer( 32 | clean_text=False, 33 | handle_chinese_chars=True, 34 | strip_accents=False, 35 | lowercase=False 36 | ) 37 | special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] 38 | 39 | # for i in range(100): 40 | # special_tokens.append(f"[unused{i}]") 41 | 42 | tokenizer.train( 43 | files=[args.file_path, args.unlabeled_file_path], 44 | vocab_size=args.vocab_size, 45 | min_frequency=1, 46 | special_tokens=special_tokens, 47 | limit_alphabet=args.vocab_size, 48 | wordpieces_prefix="##" 49 | ) 50 | os.makedirs(args.out_path, exist_ok=True) 51 | tokenizer.save_model(args.out_path) 52 | tokenizer = BertTokenizer.from_pretrained(args.out_path, 53 | do_lower_case=False, 54 | strip_accents=False) 55 | tokenizer.save_pretrained(args.out_path) 56 | logger.info(f'save tokenizer, with vocab_size: {tokenizer.vocab_size}') 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = ArgumentParser() 61 | 62 | parser.add_argument('--seed', type=int, default=2021) 63 | parser.add_argument('--vocab_size', type=int, default=21128) 64 | parser.add_argument('--file_path', type=str, default='../../user_data/process_data/pretrain.txt') 65 | parser.add_argument('--unlabeled_file_path', type=str, 66 | default='../../user_data/process_data/unlabeled_pretrain.txt') 67 | parser.add_argument('--out_path', type=str, default='../../user_data/tokenizer') 68 | 69 | warnings.filterwarnings('ignore') 70 | args = parser.parse_args() 71 | 72 | seed_everything(args.seed) 73 | 74 | train_tokenizer(args) 75 | 76 | logger.info(f'vocab creation completed .') 77 | -------------------------------------------------------------------------------- /data/code/util/others/hanzi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Constants for working with Chinese characters.""" 3 | 4 | from __future__ import unicode_literals 5 | import sys 6 | 7 | #: Character code ranges for pertinent CJK ideograph Unicode blocks. 
8 | characters = cjk_ideographs = ( 9 | '\u3007' # Ideographic number zero, see issue #17 10 | '\u4E00-\u9FFF' # CJK Unified Ideographs 11 | '\u3400-\u4DBF' # CJK Unified Ideographs Extension A 12 | '\uF900-\uFAFF' # CJK Compatibility Ideographs 13 | ) 14 | if sys.maxunicode > 0xFFFF: 15 | characters += ( 16 | '\U00020000-\U0002A6DF' # CJK Unified Ideographs Extension B 17 | '\U0002A700-\U0002B73F' # CJK Unified Ideographs Extension C 18 | '\U0002B740-\U0002B81F' # CJK Unified Ideographs Extension D 19 | '\U0002F800-\U0002FA1F' # CJK Compatibility Ideographs Supplement 20 | ) 21 | 22 | #: Character code ranges for the Kangxi radicals and CJK Radicals Supplement. 23 | radicals = ( 24 | '\u2F00-\u2FD5' # Kangxi Radicals 25 | '\u2E80-\u2EF3' # CJK Radicals Supplement 26 | ) 27 | 28 | #: A string containing Chinese punctuation marks (non-stops). 29 | non_stops = ( 30 | # Fullwidth ASCII variants 31 | '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D' 32 | '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F' 33 | '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60' 34 | 35 | # Halfwidth CJK punctuation 36 | '\uFF62\uFF63\uFF64' 37 | 38 | # CJK symbols and punctuation 39 | '\u3000\u3001\u3003' 40 | 41 | # CJK angle and corner brackets 42 | '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011' 43 | 44 | # CJK brackets and symbols/punctuation 45 | '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F' 46 | 47 | # Other CJK symbols 48 | '\u3030' 49 | 50 | # Special CJK indicators 51 | '\u303E\u303F' 52 | 53 | # Dashes 54 | '\u2013\u2014' 55 | 56 | # Quotation marks and apostrophe 57 | '\u2018\u2019\u201B\u201C\u201D\u201E\u201F' 58 | 59 | # General punctuation 60 | '\u2026\u2027' 61 | 62 | # Overscores and underscores 63 | '\uFE4F' 64 | 65 | # Small form variants 66 | '\uFE51\uFE54' 67 | 68 | # Latin punctuation 69 | '\u00B7' 70 | ) 71 | 72 | #: A string of Chinese stops. 73 | stops = ( 74 | '\uFF01' # Fullwidth exclamation mark 75 | '\uFF1F' # Fullwidth question mark 76 | '\uFF61' # Halfwidth ideographic full stop 77 | '\u3002' # Ideographic full stop 78 | ) 79 | 80 | #: A string containing all Chinese punctuation. 81 | punctuation = non_stops + stops 82 | 83 | # A sentence end is defined by a stop followed by zero or more 84 | # container-closing marks (e.g. quotation or brackets). 85 | _sentence_end = '[{stops}][」﹂”』’》)]}〕〗〙〛〉】]*'.format(stops=stops) 86 | 87 | #: A regular expression pattern for a Chinese sentence. A sentence is defined 88 | #: as a series of characters and non-stop punctuation marks followed by a stop 89 | #: and zero or more container-closing punctuation marks (e.g. apostrophe or 90 | # brackets). 
91 | sent = sentence = '[{characters}{radicals}{non_stops}]*{sentence_end}'.format( 92 | characters=characters, radicals=radicals, non_stops=non_stops, 93 | sentence_end=_sentence_end) -------------------------------------------------------------------------------- /data/code/models/nezha.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from data.code.util.modeling.modeling_nezha.modeling import NeZhaPreTrainedModel, NeZhaModel 4 | 5 | 6 | class NeZhaSequenceClassification_F(NeZhaPreTrainedModel): 7 | def __init__(self, config): 8 | super().__init__(config) 9 | self.level1_num_labels = 10 10 | self.num_labels = 35 11 | self.bert = NeZhaModel(config) 12 | self.level1_classifier = nn.Linear(config.hidden_size * 5, self.level1_num_labels) 13 | self.classifier = nn.Linear(config.hidden_size * 5, self.num_labels) 14 | self.init_weights() 15 | 16 | def forward( 17 | self, 18 | input_ids=None, 19 | attention_mask=None, 20 | token_type_ids=None, 21 | labels=None, 22 | level1_labels=None 23 | ): 24 | attention_mask = torch.ne(input_ids, 0) 25 | encoder_out, pooled_out, all_hidden_outputs = self.bert( 26 | input_ids=input_ids, 27 | attention_mask=attention_mask, 28 | token_type_ids=token_type_ids 29 | ) 30 | 31 | last_hidden = torch.cat( 32 | ( 33 | all_hidden_outputs[-1][:, 0], 34 | all_hidden_outputs[-2][:, 0], 35 | all_hidden_outputs[-3][:, 0], 36 | all_hidden_outputs[-4][:, 0], 37 | all_hidden_outputs[-5][:, 0] 38 | ), 39 | 1 40 | ) 41 | 42 | logits = self.classifier(last_hidden) 43 | outputs = (logits,) + (pooled_out,) 44 | 45 | if labels is not None: 46 | loss_fct = nn.CrossEntropyLoss() 47 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 48 | 49 | if level1_labels is not None: 50 | level1_logits = self.level1_classifier(last_hidden) 51 | level1_loss = loss_fct(level1_logits.view(-1, self.level1_num_labels), 52 | level1_labels.view(-1)) 53 | loss = loss + 0.5 * level1_loss 54 | outputs = (loss,) + outputs 55 | 56 | return outputs 57 | 58 | 59 | class NeZhaSequenceClassification_P(NeZhaPreTrainedModel): 60 | def __init__(self, config): 61 | super().__init__(config) 62 | self.level1_num_labels = 10 63 | self.num_labels = 35 64 | self.bert = NeZhaModel(config) 65 | self.level1_classifier = nn.Linear(config.hidden_size * 5, self.level1_num_labels) 66 | self.classifier = nn.Linear(config.hidden_size * 5, self.num_labels) 67 | self.init_weights() 68 | 69 | def forward( 70 | self, 71 | input_ids=None, 72 | attention_mask=None, 73 | token_type_ids=None 74 | ): 75 | attention_mask = torch.ne(input_ids, 0) 76 | encoder_out, pooled_out, all_hidden_outputs = self.bert( 77 | input_ids=input_ids, 78 | attention_mask=attention_mask, 79 | token_type_ids=token_type_ids 80 | ) 81 | 82 | last_hidden = torch.cat( 83 | ( 84 | all_hidden_outputs[-1][:, 0], 85 | all_hidden_outputs[-2][:, 0], 86 | all_hidden_outputs[-3][:, 0], 87 | all_hidden_outputs[-4][:, 0], 88 | all_hidden_outputs[-5][:, 0] 89 | ), 90 | 1 91 | ) 92 | 93 | logits = self.classifier(last_hidden) 94 | outputs = (logits,) + (pooled_out,) 95 | 96 | return outputs 97 | -------------------------------------------------------------------------------- /data/code/process_data/process_data.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import sys 5 | import json 6 | import logging 7 | import warnings 8 | import pandas as pd 9 | from tqdm import tqdm 10 | from argparse import 
ArgumentParser 11 | from data.code.util.others.label2id import label2id 12 | 13 | sys.path.append('../../../data') 14 | 15 | logging.basicConfig() 16 | logger = logging.getLogger('第五届达观杯') 17 | logger.setLevel(logging.INFO) 18 | 19 | 20 | def cut_text(text, args): 21 | char = [i for i in text.split(' ')] 22 | length = len(char) 23 | if length > args.max_length: 24 | head = char[:32] 25 | tail = char[-96:] 26 | new_char = head + tail 27 | new_text = '' 28 | for i in new_char: 29 | new_text += i + ' ' 30 | new_text = new_text.strip() 31 | return new_text 32 | else: 33 | return text.strip() 34 | 35 | 36 | def process_unlabeled_data(args): 37 | text = [] 38 | with open(args.unlabeled_path, 'r') as f, open(args.out_unlabeled_path, 'w', encoding='utf-8') as w: 39 | for i in tqdm(range(args.number_unlabeled), desc='processing unlabeled data'): 40 | line_data = f.readline() 41 | if line_data: 42 | data = json.loads(line_data) 43 | title = data['title'] 44 | content = data['content'] 45 | if title == '' or content == '': 46 | continue 47 | else: 48 | text.append(title) 49 | text.append(content) 50 | for j in text: 51 | w.writelines(j + '\n') 52 | text = [] 53 | else: 54 | break 55 | 56 | 57 | def process_text(args): 58 | train = pd.read_csv(args.train_path) 59 | test = pd.read_csv(args.test_path) 60 | 61 | train_text = train['text'].tolist() 62 | test_text = test['text'].tolist() 63 | pretrain_text = train_text + test_text 64 | 65 | label = train['label'].tolist() 66 | 67 | pretrain_sentence, train_sentence, train_sentence1, test_sentence = [], [], [], [] 68 | for i in pretrain_text: 69 | pretrain_sentence.append(i.strip()) 70 | 71 | pretrain_sentence = list(set(pretrain_sentence)) 72 | 73 | logger.info(f'total pretrain data : {len(pretrain_sentence)}.') 74 | 75 | for i in train_text: 76 | train_sentence.append(cut_text(i, args)) 77 | 78 | for i in range(len(train_sentence)): 79 | tgt_level1, tgt_level2 = label[i].split('-') 80 | tgt = label2id[label[i]] 81 | line = train_sentence[i] + '\t' + str(tgt) + '\t' + str(int(tgt_level1) - 1) 82 | train_sentence1.append(line) 83 | 84 | logger.info(f'total train data : {len(train_sentence)}.') 85 | 86 | for i in test_text: 87 | test_sentence.append(cut_text(i, args)) 88 | 89 | logger.info(f'total test data : {len(test_sentence)}.') 90 | 91 | return pretrain_sentence, train_sentence1, test_sentence 92 | 93 | 94 | def write(text_list, out_path): 95 | with open(out_path, 'w', encoding='utf-8') as f: 96 | for i in text_list: 97 | f.writelines(i + '\n') 98 | 99 | logger.info(f'process data has been written to {out_path}.') 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = ArgumentParser() 104 | 105 | parser.add_argument('--max_length', type=int, default=128) 106 | parser.add_argument('--number_unlabeled', type=int, default=180000) 107 | parser.add_argument('--unlabeled_path', type=str, default='../../raw_data/datagrand_2021_unlabeled_data.json') 108 | parser.add_argument('--train_path', type=str, default='../../raw_data/datagrand_2021_train.csv') 109 | parser.add_argument('--test_path', type=str, default='../../raw_data/datagrand_2021_test.csv') 110 | parser.add_argument('--out_path', type=str, default='../../user_data/process_data/') 111 | parser.add_argument('--out_unlabeled_path', type=str, 112 | default='../../user_data/process_data/unlabeled_pretrain.txt') 113 | 114 | warnings.filterwarnings('ignore') 115 | args = parser.parse_args() 116 | 117 | os.makedirs(args.out_path, exist_ok=True) 118 | 119 | out_pretrain_path = os.path.join(args.out_path, 
'pretrain.txt') 120 | out_train_path = os.path.join(args.out_path, 'train.txt') 121 | out_test_path = os.path.join(args.out_path, 'test.txt') 122 | 123 | process_unlabeled_data(args) 124 | pretrain, train, test = process_text(args) 125 | 126 | write(pretrain, out_pretrain_path) 127 | write(train, out_train_path) 128 | write(test, out_test_path) 129 | 130 | logger.info(f'data processing completed .') 131 | -------------------------------------------------------------------------------- /data/code/util/tools/predict_tools.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import sys 5 | import pickle 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from collections import defaultdict 10 | from transformers import BertTokenizer 11 | from torch.utils.data import Dataset, DataLoader 12 | 13 | sys.path.append('../../../../data') 14 | from data.code.models.nezha import * 15 | 16 | 17 | def build_model_and_tokenizer_nezha(args): 18 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path) 19 | model = NeZhaSequenceClassification_P.from_pretrained(os.path.join(args.load_model_path, f'last-checkpoint')) 20 | model.to(args.device) 21 | model.eval() 22 | 23 | return tokenizer, model 24 | 25 | 26 | def read_data(args, tokenizer): 27 | test_df = pd.read_csv(args.test_path, header=None, sep='\t') 28 | 29 | inputs = defaultdict(list) 30 | for i, row in tqdm(test_df.iterrows(), desc=f'Preprocessing test data', total=len(test_df)): 31 | sentence = row[0] 32 | build_bert_inputs(inputs, sentence, tokenizer) 33 | 34 | data_cache_path = args.data_cache_path 35 | if not os.path.exists(data_cache_path): 36 | os.makedirs(data_cache_path) 37 | 38 | cache_pkl_path = os.path.join(data_cache_path, 'test.pkl') 39 | with open(cache_pkl_path, 'wb') as f: 40 | pickle.dump(inputs, f) 41 | 42 | return cache_pkl_path 43 | 44 | 45 | def build_bert_inputs(inputs, sentence, tokenizer): 46 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, 47 | return_token_type_ids=True, return_attention_mask=True) 48 | inputs['input_ids'].append(inputs_dict['input_ids']) 49 | inputs['token_type_ids'].append(inputs_dict['token_type_ids']) 50 | inputs['attention_mask'].append(inputs_dict['attention_mask']) 51 | 52 | 53 | class DGDataset(Dataset): 54 | def __init__(self, data_dict: dict): 55 | super(DGDataset, self).__init__() 56 | self.data_dict = data_dict 57 | 58 | def __getitem__(self, index: int) -> tuple: 59 | data = ( 60 | self.data_dict['input_ids'][index], 61 | self.data_dict['token_type_ids'][index], 62 | self.data_dict['attention_mask'][index] 63 | ) 64 | return data 65 | 66 | def __len__(self) -> int: 67 | return len(self.data_dict['input_ids']) 68 | 69 | 70 | class Collator: 71 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer): 72 | self.max_seq_len = max_seq_len 73 | self.tokenizer = tokenizer 74 | 75 | def pad_and_truncate(self, input_ids_list, token_type_ids_list, 76 | attention_mask_list, max_seq_len): 77 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) 78 | token_type_ids = torch.zeros_like(input_ids) 79 | attention_mask = torch.zeros_like(input_ids) 80 | for i in range(len(input_ids_list)): 81 | seq_len = len(input_ids_list[i]) 82 | if seq_len <= max_seq_len: 83 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) 84 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) 85 | attention_mask[i, 
:seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) 86 | else: 87 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], 88 | dtype=torch.long) 89 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) 90 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) 91 | 92 | return input_ids, token_type_ids, attention_mask 93 | 94 | def __call__(self, examples: list) -> dict: 95 | input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples)) 96 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) 97 | max_seq_len = min(cur_max_seq_len, self.max_seq_len) 98 | 99 | input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list, token_type_ids_list, 100 | attention_mask_list, max_seq_len) 101 | 102 | data_dict = { 103 | 'input_ids': input_ids, 104 | 'token_type_ids': token_type_ids, 105 | 'attention_mask': attention_mask 106 | } 107 | 108 | return data_dict 109 | 110 | 111 | def load_data(args, tokenizer): 112 | cache_pkl_path = os.path.join(args.data_cache_path, 'test.pkl') 113 | 114 | with open(cache_pkl_path, 'rb') as f: 115 | test_data = pickle.load(f) 116 | 117 | collate_fn = Collator(args.max_seq_len, tokenizer) 118 | test_dataset = DGDataset(test_data) 119 | test_dataloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False, 120 | num_workers=0, collate_fn=collate_fn) 121 | return test_dataloader 122 | 123 | 124 | def save2csv(args, p_logit): 125 | logit_path = os.path.join(args.output_result_path, 'full_logit.csv') 126 | result = pd.DataFrame(p_logit, columns=["label%d" % i for i in range(p_logit.shape[-1])]) 127 | result.to_csv(logit_path, index=False) 128 | 129 | print(f"result hace save in :{logit_path} .") 130 | 131 | 132 | def batch2cuda(args, batch): 133 | return {item: value.to(args.device) for item, value in list(batch.items())} 134 | 135 | 136 | def predict(test_dataloader, pre_model, args): 137 | p_logit = [] 138 | 139 | val_iterator = tqdm(test_dataloader, desc='Predict', total=len(test_dataloader)) 140 | 141 | with torch.no_grad(): 142 | for batch in val_iterator: 143 | batch_cuda = batch2cuda(args, batch) 144 | logits = pre_model(**batch_cuda)[0] 145 | p_logit.extend(torch.softmax(logits, -1).cpu().numpy()) 146 | 147 | return np.vstack(p_logit) 148 | 149 | 150 | def create_dirs(path_list): 151 | for path in path_list: 152 | os.makedirs(path, exist_ok=True) 153 | -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/configuration.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import PretrainedConfig 3 | 4 | NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 5 | 6 | class NeZhaConfig(PretrainedConfig): 7 | r""" 8 | This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`. 9 | It is used to instantiate an ALBERT model according to the specified arguments, defining the model 10 | architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 11 | the ALBERT `xxlarge `__ architecture. 12 | 13 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used 14 | to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` 15 | for more information. 
16 | 17 | 18 | Args: 19 | vocab_size (:obj:`int`, optional, defaults to 30000): 20 | Vocabulary size of the ALBERT model. Defines the different tokens that 21 | can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`. 22 | embedding_size (:obj:`int`, optional, defaults to 128): 23 | Dimensionality of vocabulary embeddings. 24 | hidden_size (:obj:`int`, optional, defaults to 4096): 25 | Dimensionality of the encoder layers and the pooler layer. 26 | num_hidden_layers (:obj:`int`, optional, defaults to 12): 27 | Number of hidden layers in the Transformer encoder. 28 | num_hidden_groups (:obj:`int`, optional, defaults to 1): 29 | Number of groups for the hidden layers, parameters in the same group are shared. 30 | num_attention_heads (:obj:`int`, optional, defaults to 64): 31 | Number of attention heads for each attention layer in the Transformer encoder. 32 | intermediate_size (:obj:`int`, optional, defaults to 16384): 33 | The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 34 | inner_group_num (:obj:`int`, optional, defaults to 1): 35 | The number of inner repetition of attention and ffn. 36 | hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"): 37 | The non-linear activation function (function or string) in the encoder and pooler. 38 | If string, "gelu", "relu", "swish" and "gelu_new" are supported. 39 | hidden_dropout_prob (:obj:`float`, optional, defaults to 0): 40 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 41 | attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0): 42 | The dropout ratio for the attention probabilities. 43 | max_position_embeddings (:obj:`int`, optional, defaults to 512): 44 | The maximum sequence length that this model might ever be used with. Typically set this to something 45 | large (e.g., 512 or 1024 or 2048). 46 | type_vocab_size (:obj:`int`, optional, defaults to 2): 47 | The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`. 48 | initializer_range (:obj:`float`, optional, defaults to 0.02): 49 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 50 | layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): 51 | The epsilon used by the layer normalization layers. 52 | classifier_dropout_prob (:obj:`float`, optional, defaults to 0.1): 53 | The dropout ratio for attached classifiers. 54 | 55 | Example:: 56 | 57 | from transformers import AlbertConfig, AlbertModel 58 | # Initializing an ALBERT-xxlarge style configuration 59 | albert_xxlarge_configuration = AlbertConfig() 60 | 61 | # Initializing an ALBERT-base style configuration 62 | albert_base_configuration = AlbertConfig( 63 | hidden_size=768, 64 | num_attention_heads=12, 65 | intermediate_size=3072, 66 | ) 67 | 68 | # Initializing a model from the ALBERT-base style configuration 69 | model = AlbertModel(albert_xxlarge_configuration) 70 | 71 | # Accessing the model configuration 72 | configuration = model.config 73 | 74 | Attributes: 75 | pretrained_config_archive_map (Dict[str, str]): 76 | A dictionary containing all the available pre-trained checkpoints. 
77 | """ 78 | 79 | pretrained_config_archive_map = NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP 80 | model_type = "nezha" 81 | 82 | def __init__( 83 | self, 84 | vocab_size=30000, 85 | embedding_size=128, 86 | hidden_size=4096, 87 | num_hidden_layers=12, 88 | num_hidden_groups=1, 89 | num_attention_heads=64, 90 | intermediate_size=16384, 91 | inner_group_num=1, 92 | hidden_act="gelu_new", 93 | hidden_dropout_prob=0, 94 | attention_probs_dropout_prob=0, 95 | max_position_embeddings=512, 96 | max_relative_position=64, 97 | type_vocab_size=2, 98 | initializer_range=0.02, 99 | layer_norm_eps=1e-12, 100 | classifier_dropout_prob=0.1, 101 | use_relative_position=True, 102 | pad_token_id=0, 103 | bos_token_id=2, 104 | eos_token_id=3, 105 | **kwargs 106 | ): 107 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 108 | 109 | self.vocab_size = vocab_size 110 | self.embedding_size = embedding_size 111 | self.hidden_size = hidden_size 112 | self.num_hidden_layers = num_hidden_layers 113 | self.num_hidden_groups = num_hidden_groups 114 | self.num_attention_heads = num_attention_heads 115 | self.inner_group_num = inner_group_num 116 | self.hidden_act = hidden_act 117 | self.intermediate_size = intermediate_size 118 | self.hidden_dropout_prob = hidden_dropout_prob 119 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 120 | self.max_position_embeddings = max_position_embeddings 121 | self.max_relative_position = max_relative_position 122 | self.type_vocab_size = type_vocab_size 123 | self.initializer_range = initializer_range 124 | self.layer_norm_eps = layer_norm_eps 125 | self.use_relative_position=use_relative_position 126 | self.classifier_dropout_prob = classifier_dropout_prob 127 | -------------------------------------------------------------------------------- /data/code/finetune_code/run_classify.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import gc 4 | import sys 5 | import warnings 6 | from torch import multiprocessing 7 | from argparse import ArgumentParser 8 | 9 | sys.path.append('../../../data') 10 | from data.code.util.tools.finetune_tools import * 11 | 12 | multiprocessing.set_sharing_strategy('file_system') 13 | 14 | 15 | class PGD: 16 | def __init__(self, args, model): 17 | self.model = model 18 | self.emb_backup = {} 19 | self.grad_backup = {} 20 | self.epsilon = args.epsilon 21 | self.emb_name = args.emb_name 22 | self.alpha = args.alpha 23 | 24 | def attack(self, is_first_attack=False): 25 | for name, param in self.model.bert.named_parameters(): 26 | if param.requires_grad and self.emb_name in name: 27 | if is_first_attack: 28 | self.emb_backup[name] = param.data.clone() 29 | norm = torch.norm(param.grad) 30 | if norm != 0 and not torch.isnan(norm): 31 | r_at = self.alpha * param.grad / norm 32 | param.data.add_(r_at) 33 | param.data = self.project(name, param.data, self.epsilon) 34 | 35 | def restore(self): 36 | for name, param in self.model.bert.named_parameters(): 37 | if param.requires_grad and self.emb_name in name: 38 | assert name in self.emb_backup 39 | param.data = self.emb_backup[name] 40 | self.emb_backup = {} 41 | 42 | def project(self, param_name, param_data, epsilon): 43 | r = param_data - self.emb_backup[param_name] 44 | if torch.norm(r) > epsilon: 45 | r = epsilon * r / torch.norm(r) 46 | return self.emb_backup[param_name] + r 47 | 48 | def backup_grad(self): 49 | for name, param in self.model.bert.named_parameters(): 50 | if 
param.requires_grad and param.grad is not None: 51 | self.grad_backup[name] = param.grad.clone() 52 | 53 | def restore_grad(self): 54 | for name, param in self.model.bert.named_parameters(): 55 | if param.requires_grad and param.grad is not None: 56 | param.grad = self.grad_backup[name] 57 | 58 | 59 | def train(args): 60 | tokenizer, model = build_model_and_tokenizer(args) 61 | 62 | if not os.path.exists(os.path.join(args.data_cache_path, 'train.pkl')): 63 | read_data(args, tokenizer) 64 | 65 | train_dataloader = load_data(args, tokenizer) 66 | 67 | total_steps = args.num_epochs * len(train_dataloader) 68 | 69 | optimizer, scheduler = build_optimizer(args, model, total_steps) 70 | 71 | total_loss, cur_avg_loss, global_steps = 0., 0., 0 72 | 73 | for epoch in range(1, args.num_epochs + 1): 74 | 75 | train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader)) 76 | 77 | model.train() 78 | 79 | for batch in train_iterator: 80 | batch_cuda = batch2cuda(args, batch) 81 | loss, logits = model(**batch_cuda)[:2] 82 | 83 | # TSA, 仅 backward loss 小于 阈值的 loss 84 | start, end = 1. / logits.shape[-1], 1 85 | tsa_thresh = get_tsa_thresh(args, global_steps, total_steps, start, end) 86 | larger_than_threshold = torch.exp(-loss) > tsa_thresh 87 | loss_mask = torch.ones_like(batch_cuda['labels'], dtype=torch.float32) * (1 - larger_than_threshold. 88 | type(torch.float32)) 89 | loss = torch.sum(loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), 90 | torch.tensor(1.).to(args.device)) 91 | 92 | total_loss += loss.item() 93 | cur_avg_loss += loss.item() 94 | 95 | loss.backward() 96 | 97 | if args.adv == 'pgd': 98 | pgd = PGD(args, model) 99 | K = args.adv_k 100 | pgd.backup_grad() 101 | for t in range(K): 102 | pgd.attack(is_first_attack=(t == 0)) 103 | if t != K - 1: 104 | model.zero_grad() 105 | else: 106 | pgd.restore_grad() 107 | adv_loss, adv_logits = model(**batch_cuda)[:2] 108 | adv_loss.backward() 109 | pgd.restore() 110 | 111 | optimizer.step() 112 | scheduler.step() 113 | optimizer.zero_grad() 114 | 115 | if args.ema_start: 116 | ema.update() 117 | 118 | if epoch >= args.ema_start_epoch: 119 | args.ema_start = True 120 | ema = EMA(model.module if hasattr(model, 'module') else model, decay=0.999) 121 | 122 | if (global_steps + 1) % args.logging_step == 0: 123 | epoch_avg_loss = cur_avg_loss / args.logging_step 124 | global_avg_loss = total_loss / (global_steps + 1) 125 | 126 | print(f"\n>> epoch - {epoch}, global steps - {global_steps + 1}, " 127 | f"epoch avg loss - {epoch_avg_loss:.4f}, global avg loss - {global_avg_loss:.4f}.") 128 | 129 | cur_avg_loss = 0.0 130 | 131 | global_steps += 1 132 | 133 | if epoch >= args.ema_start_epoch: 134 | ema.apply_shadow() 135 | 136 | save_model(args, model, tokenizer) 137 | 138 | del model, tokenizer, optimizer, scheduler 139 | torch.cuda.empty_cache() 140 | gc.collect() 141 | 142 | 143 | def main(): 144 | parser = ArgumentParser() 145 | parser.add_argument('--output_path', type=str, 146 | default='../../user_data/output_model') 147 | parser.add_argument('--train_path', type=str, 148 | default='../../user_data/process_data/train.txt') 149 | parser.add_argument('--data_cache_path', type=str, 150 | default='../../user_data/process_data/pkl') 151 | parser.add_argument('--vocab_path', type=str, 152 | default='../../user_data/tokenizer/vocab.txt') 153 | parser.add_argument('--model_path', type=str, 154 | default='../../user_data/saved_pretrain_model_record/checkpoint-240000') 155 | 156 | parser.add_argument('--num_epochs', 
type=int, default=4) 157 | parser.add_argument('--batch_size', type=int, default=32) 158 | parser.add_argument('--max_seq_len', type=int, default=128) 159 | 160 | parser.add_argument('--learning_rate', type=float, default=2e-5) 161 | parser.add_argument('--downstream_learning_rate', type=float, default=1e-4) 162 | parser.add_argument('--eps', type=float, default=1e-8) 163 | 164 | parser.add_argument('--adv_k', type=int, default=10) 165 | parser.add_argument('--alpha', type=float, default=0.3) 166 | parser.add_argument('--epsilon', type=float, default=0.5) 167 | parser.add_argument('--emb_name', type=str, default='word_embeddings.') 168 | parser.add_argument('--adv', type=str, default='pgd', choices=['', 'pgd']) 169 | 170 | parser.add_argument('--lookahead_k', type=int, default=5) 171 | parser.add_argument('--lookahead_alpha', type=int, default=1) 172 | 173 | parser.add_argument('--ema_start', type=bool, default=False) 174 | parser.add_argument('--ema_start_epoch', type=int, default=3) 175 | 176 | parser.add_argument('--schedule', type=str, default='log', choices=['linear', 'exp', 'log']) 177 | 178 | parser.add_argument('--warmup_ratio', type=float, default=0.1) 179 | parser.add_argument('--weight_decay', type=float, default=0.01) 180 | 181 | parser.add_argument('--logging_step', type=int, default=100) 182 | 183 | parser.add_argument('--seed', type=int, default=2021) 184 | 185 | parser.add_argument('--device', type=str, default='cuda') 186 | 187 | warnings.filterwarnings('ignore') 188 | args = parser.parse_args() 189 | 190 | os.makedirs(os.path.dirname(args.output_path), exist_ok=True) 191 | 192 | seed_everything(args.seed) 193 | train(args) 194 | 195 | 196 | if __name__ == '__main__': 197 | main() 198 | -------------------------------------------------------------------------------- /data/code/pretrain_code/run_pretrain.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import os 4 | import re 5 | import sys 6 | import random 7 | import warnings 8 | import numpy as np 9 | import pandas as pd 10 | from tqdm import tqdm 11 | from typing import List, Tuple 12 | from collections import defaultdict 13 | from argparse import ArgumentParser 14 | 15 | import torch 16 | from torch.utils.data import Dataset 17 | from transformers import BertTokenizer, TrainingArguments 18 | 19 | sys.path.append('../../../data') 20 | from data.code.util.others.hanzi import punctuation 21 | from data.code.util.pretrain_utils.trainer import Trainer 22 | from data.code.util.modeling.modeling_nezha.modeling import NeZhaConfig, NeZhaForMaskedLM 23 | 24 | warnings.filterwarnings('ignore') 25 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 26 | os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1' 27 | 28 | 29 | def seed_everything(seed): 30 | random.seed(seed) 31 | np.random.seed(seed) 32 | torch.manual_seed(seed) 33 | torch.cuda.manual_seed_all(seed) 34 | return seed 35 | 36 | 37 | def read_data(pretrain_file_path, tokenizer: BertTokenizer) -> dict: 38 | pretrain_df = pd.read_csv(pretrain_file_path, header=None, sep='\t') 39 | inputs = defaultdict(list) 40 | for i, row in tqdm(pretrain_df.iterrows(), desc='', total=len(pretrain_df)): 41 | sentence = row[0].strip() 42 | sentence = re.sub(r"[%s]+" % punctuation, '[SEP]', sentence) 43 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, 44 | return_token_type_ids=True, return_attention_mask=True) 45 | inputs['input_ids'].append(inputs_dict['input_ids']) 46 | 
inputs['token_type_ids'].append(inputs_dict['token_type_ids']) 47 | inputs['attention_mask'].append(inputs_dict['attention_mask']) 48 | 49 | return inputs 50 | 51 | 52 | class DGDataset(Dataset): 53 | def __init__(self, data_dict: dict): 54 | super(Dataset, self).__init__() 55 | self.data_dict = data_dict 56 | 57 | def __getitem__(self, index: int) -> tuple: 58 | data = (self.data_dict['input_ids'][index], 59 | self.data_dict['token_type_ids'][index], 60 | self.data_dict['attention_mask'][index]) 61 | 62 | return data 63 | 64 | def __len__(self) -> int: 65 | return len(self.data_dict['input_ids']) 66 | 67 | 68 | class DGDataCollator: 69 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer, mlm_probability=0.15): 70 | self.max_seq_len = max_seq_len 71 | self.tokenizer = tokenizer 72 | self.mlm_probability = mlm_probability 73 | self.special_token_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id} 74 | 75 | def pad_and_truncate(self, input_ids_list, token_type_ids_list, 76 | attention_mask_list, max_seq_len): 77 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) 78 | token_type_ids = torch.zeros_like(input_ids) 79 | attention_mask = torch.zeros_like(input_ids) 80 | for i in range(len(input_ids_list)): 81 | seq_len = len(input_ids_list[i]) 82 | if seq_len <= max_seq_len: 83 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) 84 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) 85 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) 86 | else: 87 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], 88 | dtype=torch.long) 89 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) 90 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) 91 | return input_ids, token_type_ids, attention_mask 92 | 93 | def _ngram_mask(self, input_ids, max_seq_len): 94 | cand_indexes = [] 95 | for (i, id_) in enumerate(input_ids): 96 | if id_ in self.special_token_ids: 97 | continue 98 | cand_indexes.append([i]) 99 | num_to_predict = max(1, int(round(len(input_ids) * self.mlm_probability))) 100 | 101 | max_ngram = 3 102 | ngrams = np.arange(1, max_ngram + 1, dtype=np.int64) 103 | pvals = 1. 
/ np.arange(1, max_ngram + 1) 104 | pvals /= pvals.sum(keepdims=True) 105 | 106 | ngram_indexes = [] 107 | for idx in range(len(cand_indexes)): 108 | ngram_index = [] 109 | for n in ngrams: 110 | ngram_index.append(cand_indexes[idx:idx + n]) 111 | ngram_indexes.append(ngram_index) 112 | np.random.shuffle(ngram_indexes) 113 | 114 | covered_indexes = set() 115 | 116 | for cand_index_set in ngram_indexes: 117 | if len(covered_indexes) >= num_to_predict: 118 | break 119 | if not cand_index_set: 120 | continue 121 | for index_set in cand_index_set[0]: 122 | for index in index_set: 123 | if index in covered_indexes: 124 | continue 125 | n = np.random.choice(ngrams[:len(cand_index_set)], 126 | p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) 127 | index_set = sum(cand_index_set[n - 1], []) 128 | n -= 1 129 | while len(covered_indexes) + len(index_set) > num_to_predict: 130 | if n == 0: 131 | break 132 | index_set = sum(cand_index_set[n - 1], []) 133 | n -= 1 134 | if len(covered_indexes) + len(index_set) > num_to_predict: 135 | continue 136 | is_any_index_covered = False 137 | for index in index_set: 138 | if index in covered_indexes: 139 | is_any_index_covered = True 140 | break 141 | if is_any_index_covered: 142 | continue 143 | for index in index_set: 144 | covered_indexes.add(index) 145 | 146 | mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_ids))] 147 | mask_labels += [0] * (max_seq_len - len(mask_labels)) 148 | 149 | return torch.tensor(mask_labels[:max_seq_len]) 150 | 151 | def ngram_mask(self, input_ids_list: List[list], max_seq_len: int): 152 | mask_labels = [] 153 | for i, input_ids in enumerate(input_ids_list): 154 | mask_label = self._ngram_mask(input_ids, max_seq_len) 155 | mask_labels.append(mask_label) 156 | return torch.stack(mask_labels, dim=0) 157 | 158 | def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> \ 159 | Tuple[torch.Tensor, torch.Tensor]: 160 | 161 | labels = inputs.clone() 162 | probability_matrix = mask_labels 163 | 164 | # word struct prediction 165 | 166 | ''' 167 | complete by yourself 168 | ''' 169 | 170 | special_tokens_mask = [ 171 | self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() 172 | ] 173 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 174 | masked_indices = probability_matrix.bool() 175 | labels[~masked_indices] = -100 176 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 177 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) 178 | indices_random = torch.bernoulli( 179 | torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 180 | random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) 181 | inputs[indices_random] = random_words[indices_random] 182 | return inputs, labels 183 | 184 | def __call__(self, examples: list) -> dict: 185 | input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples)) 186 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) 187 | max_seq_len = min(cur_max_seq_len, self.max_seq_len) 188 | 189 | input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list, 190 | token_type_ids_list, 191 | attention_mask_list, 192 | max_seq_len) 193 | batch_mask = self.ngram_mask(input_ids_list, max_seq_len) 194 | input_ids, mlm_labels = self.mask_tokens(input_ids, batch_mask) 195 | 
data_dict = { 196 | 'input_ids': input_ids, 197 | 'attention_mask': attention_mask, 198 | 'token_type_ids': token_type_ids, 199 | 'labels': mlm_labels 200 | } 201 | 202 | return data_dict 203 | 204 | 205 | def main(): 206 | parser = ArgumentParser() 207 | parser.add_argument('--pretrain_data_path', type=str, default='../../user_data/process_data/unlabeled_pretrain.txt') 208 | parser.add_argument('--pretrain_model_path', type=str, default='../../user_data/pretrain_model/nezha-cn-base') 209 | parser.add_argument('--vocab_path', type=str, default='../../user_data/tokenizer/vocab.txt') 210 | parser.add_argument('--save_path', type=str, default='../../user_data/saved_pretrain_model') 211 | parser.add_argument('--record_save_path', type=str, default='../../user_data/saved_pretrain_model_record') 212 | parser.add_argument('--mlm_probability', type=float, default=0.15) 213 | parser.add_argument('--num_train_epochs', type=int, default=100) 214 | parser.add_argument('--seq_length', type=int, default=128) 215 | parser.add_argument('--batch_size', type=int, default=64) 216 | parser.add_argument('--learning_rate', type=float, default=6e-5) 217 | parser.add_argument('--save_steps', type=int, default=10000) 218 | parser.add_argument('--ckpt_save_limit', type=int, default=6) 219 | parser.add_argument('--logging_steps', type=int, default=2000) 220 | parser.add_argument('--seed', type=int, default=2021) 221 | parser.add_argument('--fp16', type=str, default=True) 222 | parser.add_argument('--fp16_backend', type=str, default='amp') 223 | 224 | warnings.filterwarnings('ignore') 225 | args = parser.parse_args() 226 | 227 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True) 228 | os.makedirs(os.path.dirname(args.record_save_path), exist_ok=True) 229 | 230 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path) 231 | model_config = NeZhaConfig.from_pretrained(args.pretrain_model_path) 232 | 233 | data = read_data(args.pretrain_data_path, tokenizer) 234 | 235 | data_collator = DGDataCollator(max_seq_len=args.seq_length, 236 | tokenizer=tokenizer, 237 | mlm_probability=args.mlm_probability) 238 | model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=args.pretrain_model_path, 239 | config=model_config) 240 | model.resize_token_embeddings(tokenizer.vocab_size) 241 | dataset = DGDataset(data) 242 | 243 | training_args = TrainingArguments( 244 | seed=args.seed, 245 | fp16=args.fp16, 246 | fp16_backend=args.fp16_backend, 247 | save_steps=args.save_steps, 248 | prediction_loss_only=True, 249 | logging_steps=args.logging_steps, 250 | output_dir=args.record_save_path, 251 | learning_rate=args.learning_rate, 252 | save_total_limit=args.ckpt_save_limit, 253 | num_train_epochs=args.num_train_epochs, 254 | per_device_train_batch_size=args.batch_size 255 | ) 256 | 257 | trainer = Trainer( 258 | model=model, 259 | args=training_args, 260 | train_dataset=dataset, 261 | data_collator=data_collator 262 | ) 263 | 264 | trainer.train() 265 | trainer.save_model(args.save_path) 266 | tokenizer.save_pretrained(args.save_path) 267 | 268 | 269 | if __name__ == '__main__': 270 | main() 271 | -------------------------------------------------------------------------------- /data/code/util/tools/finetune_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import random 5 | import numpy as np 6 | import pandas as pd 7 | from torch.optim import AdamW 8 | from torch.utils.data import Dataset, DataLoader 9 | from tqdm import 
tqdm 10 | from transformers import BertTokenizer 11 | from collections import defaultdict 12 | 13 | from torch.optim import Optimizer 14 | from torch.optim.lr_scheduler import LambdaLR 15 | 16 | sys.path.append('../../../../data') 17 | from data.code.models.nezha import * 18 | 19 | 20 | def seed_everything(seed): 21 | torch.manual_seed(seed) 22 | torch.cuda.manual_seed(seed) 23 | torch.cuda.manual_seed_all(seed) 24 | torch.backends.cudnn.benchmark = False 25 | torch.backends.cudnn.deterministic = True 26 | random.seed(seed) 27 | np.random.seed(seed) 28 | os.environ['PYTHONHASHSEED'] = str(seed) 29 | 30 | 31 | def batch2cuda(args, batch): 32 | return {item: value.to(args.device) for item, value in list(batch.items())} 33 | 34 | 35 | def build_model_and_tokenizer(args): 36 | tokenizer = BertTokenizer.from_pretrained(args.vocab_path) 37 | model = NeZhaSequenceClassification_F.from_pretrained(args.model_path) 38 | model.to(args.device) 39 | 40 | return tokenizer, model 41 | 42 | 43 | class PGD: 44 | def __init__(self, args, model): 45 | self.model = model 46 | self.emb_backup = {} 47 | self.grad_backup = {} 48 | self.epsilon = args.epsilon 49 | self.emb_name = args.emb_name 50 | self.alpha = args.alpha 51 | 52 | def attack(self, is_first_attack=False): 53 | for name, param in self.model.bert.named_parameters(): 54 | if param.requires_grad and self.emb_name in name: 55 | if is_first_attack: 56 | self.emb_backup[name] = param.data.clone() 57 | norm = torch.norm(param.grad) 58 | if norm != 0 and not torch.isnan(norm): 59 | r_at = self.alpha * param.grad / norm 60 | param.data.add_(r_at) 61 | param.data = self.project(name, param.data, self.epsilon) 62 | 63 | def restore(self): 64 | for name, param in self.model.bert.named_parameters(): 65 | if param.requires_grad and self.emb_name in name: 66 | assert name in self.emb_backup 67 | param.data = self.emb_backup[name] 68 | self.emb_backup = {} 69 | 70 | def project(self, param_name, param_data, epsilon): 71 | r = param_data - self.emb_backup[param_name] 72 | if torch.norm(r) > epsilon: 73 | r = epsilon * r / torch.norm(r) 74 | return self.emb_backup[param_name] + r 75 | 76 | def backup_grad(self): 77 | for name, param in self.model.bert.named_parameters(): 78 | if param.requires_grad and param.grad is not None: 79 | self.grad_backup[name] = param.grad.clone() 80 | 81 | def restore_grad(self): 82 | for name, param in self.model.bert.named_parameters(): 83 | if param.requires_grad and param.grad is not None: 84 | param.grad = self.grad_backup[name] 85 | 86 | 87 | class Lookahead(Optimizer): 88 | def __init__(self, optimizer, k=5, alpha=0.5): 89 | self.optimizer = optimizer 90 | self.k = k 91 | self.alpha = alpha 92 | self.param_groups = self.optimizer.param_groups 93 | self.state = defaultdict(dict) 94 | self.fast_state = self.optimizer.state 95 | for group in self.param_groups: 96 | group["counter"] = 0 97 | 98 | def update(self, group): 99 | for fast in group["params"]: 100 | param_state = self.state[fast] 101 | if "slow_param" not in param_state: 102 | param_state["slow_param"] = torch.zeros_like(fast.data) 103 | param_state["slow_param"].copy_(fast.data) 104 | slow = param_state["slow_param"] 105 | slow += (fast.data - slow) * self.alpha 106 | fast.data.copy_(slow) 107 | 108 | def update_lookahead(self): 109 | for group in self.param_groups: 110 | self.update(group) 111 | 112 | def step(self, closure=None): 113 | loss = self.optimizer.step(closure) 114 | for group in self.param_groups: 115 | if group["counter"] == 0: 116 | self.update(group) 117 | 
group["counter"] += 1 118 | if group["counter"] >= self.k: 119 | group["counter"] = 0 120 | return loss 121 | 122 | def state_dict(self): 123 | fast_state_dict = self.optimizer.state_dict() 124 | slow_state = { 125 | (id(k) if isinstance(k, torch.Tensor) else k): v 126 | for k, v in self.state.items() 127 | } 128 | fast_state = fast_state_dict["state"] 129 | param_groups = fast_state_dict["param_groups"] 130 | return { 131 | "fast_state": fast_state, 132 | "slow_state": slow_state, 133 | "param_groups": param_groups, 134 | } 135 | 136 | def load_state_dict(self, state_dict): 137 | slow_state_dict = { 138 | "state": state_dict["slow_state"], 139 | "param_groups": state_dict["param_groups"], 140 | } 141 | fast_state_dict = { 142 | "state": state_dict["fast_state"], 143 | "param_groups": state_dict["param_groups"], 144 | } 145 | super(Lookahead, self).load_state_dict(slow_state_dict) 146 | self.optimizer.load_state_dict(fast_state_dict) 147 | self.fast_state = self.optimizer.state 148 | 149 | def add_param_group(self, param_group): 150 | param_group["counter"] = 0 151 | self.optimizer.add_param_group(param_group) 152 | 153 | 154 | class EMA: 155 | def __init__(self, model, decay): 156 | self.model = model 157 | self.decay = decay 158 | self.shadow = {} 159 | self.backup = {} 160 | self.register() 161 | 162 | def register(self): 163 | for name, param in self.model.named_parameters(): 164 | if param.requires_grad: 165 | self.shadow[name] = param.data.clone() 166 | 167 | def update(self): 168 | for name, param in self.model.named_parameters(): 169 | if param.requires_grad: 170 | assert name in self.shadow 171 | new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name] 172 | self.shadow[name] = new_average.clone() 173 | 174 | def apply_shadow(self): 175 | for name, param in self.model.named_parameters(): 176 | if param.requires_grad: 177 | assert name in self.shadow 178 | self.backup[name] = param.data 179 | param.data = self.shadow[name] 180 | 181 | def restore(self): 182 | for name, param in self.model.named_parameters(): 183 | if param.requires_grad: 184 | assert name in self.backup 185 | param.data = self.backup[name] 186 | self.backup = {} 187 | 188 | 189 | class WarmupLinearSchedule(LambdaLR): 190 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): 191 | self.warmup_steps = warmup_steps 192 | self.t_total = t_total 193 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 194 | 195 | def lr_lambda(self, step): 196 | if step < self.warmup_steps: 197 | return float(step) / float(max(1, self.warmup_steps)) 198 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) 199 | 200 | 201 | def build_optimizer(args, model, train_steps): 202 | no_decay = ['bias', 'LayerNorm.weight'] 203 | 204 | bert_model_param, bert_downstream_param = [], [] 205 | 206 | for items in model.named_parameters(): 207 | if "bert" in items: 208 | bert_model_param.append(items) 209 | else: 210 | bert_downstream_param.append(items) 211 | 212 | optimizer_grouped_parameters = [ 213 | {"params": [p for n, p in bert_model_param if 214 | not any(nd in n for nd in no_decay)], 215 | 'weight_decay_rate': args.weight_decay, "lr": args.learning_rate}, 216 | {'params': [p for n, p in bert_model_param if 217 | any(nd in n for nd in no_decay)], 218 | 'weight_decay_rate': 0.0, 'lr': args.learning_rate}, 219 | 220 | {"params": [p for n, p in bert_downstream_param if 221 | not any(nd in n for nd in no_decay)], 222 | 
'weight_decay_rate': args.weight_decay, "lr": args.downstream_learning_rate}, 223 | {'params': [p for n, p in bert_downstream_param if 224 | any(nd in n for nd in no_decay)], 225 | 'weight_decay_rate': 0.0, 'lr': args.downstream_learning_rate} 226 | ] 227 | 228 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.eps) 229 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup_ratio, 230 | t_total=train_steps) 231 | optimizer = Lookahead(optimizer, args.lookahead_k, args.lookahead_alpha) 232 | 233 | return optimizer, scheduler 234 | 235 | 236 | def save_model(args, model, tokenizer): 237 | model_to_save = model.module if hasattr(model, 'module') else model 238 | model_save_path = os.path.join(args.output_path, f'last-checkpoint') 239 | model_to_save.save_pretrained(model_save_path) 240 | tokenizer.save_vocabulary(model_save_path) 241 | 242 | print(f'model saved in : {model_save_path} .') 243 | 244 | 245 | def get_tsa_thresh(args, global_step, num_train_steps, start, end): 246 | training_progress = torch.tensor(float(global_step) / float(num_train_steps)) 247 | 248 | if args.schedule == 'linear': 249 | threshold = training_progress 250 | elif args.schedule == 'exp': 251 | scale = 5 252 | threshold = torch.exp((training_progress - 1) * scale) 253 | elif args.schedule == 'log': 254 | scale = 5 255 | threshold = 1 - torch.exp((-training_progress) * scale) 256 | 257 | output = threshold * (end - start) + start 258 | 259 | return output.to(args.device) 260 | 261 | 262 | def read_data(args, tokenizer): 263 | train_df = pd.read_csv(args.train_path, header=None, sep='\t') 264 | 265 | inputs = defaultdict(list) 266 | for i, row in tqdm(train_df.iterrows(), desc=f'Preprocessing train data', total=len(train_df)): 267 | sentence, label, level1_label = row 268 | build_bert_inputs(inputs, label, level1_label, sentence, tokenizer) 269 | 270 | data_cache_path = args.data_cache_path 271 | if not os.path.exists(data_cache_path): 272 | os.makedirs(data_cache_path) 273 | 274 | cache_pkl_path = os.path.join(data_cache_path, 'train.pkl') 275 | with open(cache_pkl_path, 'wb') as f: 276 | pickle.dump(inputs, f) 277 | 278 | return cache_pkl_path 279 | 280 | 281 | def build_bert_inputs(inputs, label, level1_label, sentence, tokenizer): 282 | inputs_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, 283 | return_token_type_ids=True, return_attention_mask=True) 284 | inputs['input_ids'].append(inputs_dict['input_ids']) 285 | inputs['token_type_ids'].append(inputs_dict['token_type_ids']) 286 | inputs['attention_mask'].append(inputs_dict['attention_mask']) 287 | inputs['labels'].append(label) 288 | inputs['level1_labels'].append(level1_label) 289 | 290 | 291 | class DGDataset(Dataset): 292 | def __init__(self, data_dict: dict, tokenizer: BertTokenizer): 293 | super(DGDataset, self).__init__() 294 | self.data_dict = data_dict 295 | self.tokenizer = tokenizer 296 | 297 | def __getitem__(self, index: int) -> tuple: 298 | data = ( 299 | self.data_dict['input_ids'][index], 300 | self.data_dict['token_type_ids'][index], 301 | self.data_dict['attention_mask'][index], 302 | self.data_dict['labels'][index], 303 | self.data_dict['level1_labels'][index] 304 | ) 305 | 306 | return data 307 | 308 | def __len__(self) -> int: 309 | return len(self.data_dict['input_ids']) 310 | 311 | 312 | class Collator: 313 | def __init__(self, max_seq_len: int, tokenizer: BertTokenizer): 314 | self.max_seq_len = max_seq_len 315 | self.tokenizer = tokenizer 316 | 317 | def 
pad_and_truncate(self, input_ids_list, token_type_ids_list, 318 | attention_mask_list, labels_list, level1_labels_list, max_seq_len): 319 | input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long) 320 | token_type_ids = torch.zeros_like(input_ids) 321 | attention_mask = torch.zeros_like(input_ids) 322 | for i in range(len(input_ids_list)): 323 | seq_len = len(input_ids_list[i]) 324 | if seq_len <= max_seq_len: 325 | input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long) 326 | token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long) 327 | attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long) 328 | else: 329 | input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id], 330 | dtype=torch.long) 331 | token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long) 332 | attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long) 333 | 334 | labels = torch.tensor(labels_list, dtype=torch.long) 335 | level1_labels = torch.tensor(level1_labels_list, dtype=torch.long) 336 | return input_ids, token_type_ids, attention_mask, labels, level1_labels 337 | 338 | def __call__(self, examples: list) -> dict: 339 | input_ids_list, token_type_ids_list, attention_mask_list, labels_list, level1_labels_list = list(zip(*examples)) 340 | cur_max_seq_len = max(len(input_id) for input_id in input_ids_list) 341 | max_seq_len = min(cur_max_seq_len, self.max_seq_len) 342 | 343 | input_ids, token_type_ids, attention_mask, labels, level1_labels = \ 344 | self.pad_and_truncate(input_ids_list, token_type_ids_list, attention_mask_list, 345 | labels_list, level1_labels_list, max_seq_len) 346 | 347 | data_dict = { 348 | 'input_ids': input_ids, 349 | 'token_type_ids': token_type_ids, 350 | 'attention_mask': attention_mask, 351 | 'labels': labels, 352 | 'level1_labels': level1_labels 353 | } 354 | 355 | return data_dict 356 | 357 | 358 | def load_data(args, tokenizer): 359 | cache_pkl_path = os.path.join(args.data_cache_path, 'train.pkl') 360 | 361 | with open(cache_pkl_path, 'rb') as f: 362 | train_data = pickle.load(f) 363 | 364 | collate_fn = Collator(args.max_seq_len, tokenizer) 365 | train_dataset = DGDataset(train_data, tokenizer) 366 | train_dataloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, 367 | num_workers=0, collate_fn=collate_fn) 368 | return train_dataloader 369 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 317 | -------------------------------------------------------------------------------- /data/code/util/pretrain_utils/trainer_args.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import json 4 | import os 5 | from dataclasses import asdict, dataclass, field 6 | from enum import Enum 7 | from typing import Any, Dict, List, Optional 8 | 9 | from transformers.file_utils import ( 10 | cached_property, 11 | is_torch_available, 12 | is_torch_tpu_available, 13 | torch_required, 14 | ) 15 | from transformers.trainer_utils import EvaluationStrategy, SchedulerType 16 | from transformers.utils import logging 17 | 18 | 19 | if is_torch_available(): 20 | import torch 21 | 22 | if is_torch_tpu_available(): 23 | import torch_xla.core.xla_model as xm 24 | 25 | 26 | 
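Taken together, the helpers in `data/code/util/tools/finetune_tools.py` above (seeding, `build_optimizer` with a Lookahead-wrapped AdamW and linear warmup, `EMA`, and the `Collator`-based dataloader) are usually wired up roughly as in the sketch below. This is not the repository's actual `run_classify.py`; the `args` fields it touches (`num_epochs`, `ema_decay`, ...) are assumptions, and it presumes the classification model returns the loss as its first output. Note also that `build_optimizer` tests `"bert" in items` against the whole `(name, parameter)` tuple yielded by `named_parameters()`, which is tuple membership rather than a substring match on the parameter name; matching on `items[0]` is presumably the intent.

```python
def finetune(args):
    # hedged sketch of a fine-tuning loop; evaluation, metrics and checkpoint selection omitted
    seed_everything(args.seed)
    tokenizer, model = build_model_and_tokenizer(args)

    read_data(args, tokenizer)                           # tokenize once and cache train.pkl
    train_dataloader = load_data(args, tokenizer)

    total_steps = args.num_epochs * len(train_dataloader)
    optimizer, scheduler = build_optimizer(args, model, total_steps)
    ema = EMA(model, decay=args.ema_decay)               # assumed field; shadow weights for saving

    model.train()
    for _ in range(args.num_epochs):
        for batch in train_dataloader:
            batch = batch2cuda(args, batch)
            loss = model(**batch)[0]                     # assumes loss is the first model output
            loss.backward()
            optimizer.step()                             # Lookahead-wrapped AdamW
            scheduler.step()                             # linear warmup/decay on the inner optimizer
            model.zero_grad()
            ema.update()

    ema.apply_shadow()                                   # export the EMA weights
    save_model(args, model, tokenizer)
    ema.restore()
```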
logger = logging.get_logger(__name__) 27 | 28 | 29 | def default_logdir() -> str: 30 | """ 31 | Same default as PyTorch 32 | """ 33 | import socket 34 | from datetime import datetime 35 | 36 | current_time = datetime.now().strftime("%b%d_%H-%M-%S") 37 | return os.path.join("runs", current_time + "_" + socket.gethostname()) 38 | 39 | 40 | @dataclass 41 | class TrainingArguments: 42 | """ 43 | TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop 44 | itself**. 45 | 46 | Using :class:`~transformers.HfArgumentParser` we can turn this class into `argparse 47 | `__ arguments that can be specified on the command 48 | line. 49 | 50 | 51 | 52 | 53 | Parameters: 54 | output_dir (:obj:`str`): 55 | The output directory where the model predictions and checkpoints will be written. 56 | overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`): 57 | If :obj:`True`, overwrite the content of the output directory. Use this to continue training if 58 | :obj:`output_dir` points to a checkpoint directory. 59 | do_train (:obj:`bool`, `optional`, defaults to :obj:`False`): 60 | Whether to run training or not. This argument is not directly used by :class:`~transformers.Trainer`, it's 61 | intended to be used by your training/evaluation scripts instead. See the `example scripts 62 | `__ for more details. 63 | do_eval (:obj:`bool`, `optional`): 64 | Whether to run evaluation on the validation set or not. Will be set to :obj:`True` if 65 | :obj:`evaluation_strategy` is different from :obj:`"no"`. This argument is not directly used by 66 | :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See 67 | the `example scripts `__ for more 68 | details. 69 | do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`): 70 | Whether to run predictions on the test set or not. This argument is not directly used by 71 | :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See 72 | the `example scripts `__ for more 73 | details. 74 | evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.EvaluationStrategy`, `optional`, defaults to :obj:`"no"`): 75 | The evaluation strategy to adopt during training. Possible values are: 76 | 77 | * :obj:`"no"`: No evaluation is done during training. 78 | * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`. 79 | * :obj:`"epoch"`: Evaluation is done at the end of each epoch. 80 | 81 | prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`): 82 | When performing evaluation and generating predictions, only returns the loss. 83 | per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8): 84 | The batch size per GPU/TPU core/CPU for training. 85 | per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8): 86 | The batch size per GPU/TPU core/CPU for evaluation. 87 | gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1): 88 | Number of updates steps to accumulate the gradients for, before performing a backward/update pass. 89 | 90 | .. warning:: 91 | 92 | When using gradient accumulation, one step is counted as one step with backward pass. Therefore, 93 | logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training 94 | examples. 95 | eval_accumulation_steps (:obj:`int`, `optional`): 96 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. 
If 97 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but 98 | requires more memory). 99 | learning_rate (:obj:`float`, `optional`, defaults to 5e-5): 100 | The initial learning rate for :class:`~transformers.AdamW` optimizer. 101 | weight_decay (:obj:`float`, `optional`, defaults to 0): 102 | The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in 103 | :class:`~transformers.AdamW` optimizer. 104 | adam_beta1 (:obj:`float`, `optional`, defaults to 0.9): 105 | The beta1 hyperparameter for the :class:`~transformers.AdamW` optimizer. 106 | adam_beta2 (:obj:`float`, `optional`, defaults to 0.999): 107 | The beta2 hyperparameter for the :class:`~transformers.AdamW` optimizer. 108 | adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8): 109 | The epsilon hyperparameter for the :class:`~transformers.AdamW` optimizer. 110 | max_grad_norm (:obj:`float`, `optional`, defaults to 1.0): 111 | Maximum gradient norm (for gradient clipping). 112 | num_train_epochs(:obj:`float`, `optional`, defaults to 3.0): 113 | Total number of training epochs to perform (if not an integer, will perform the decimal part percents of 114 | the last epoch before stopping training). 115 | max_steps (:obj:`int`, `optional`, defaults to -1): 116 | If set to a positive number, the total number of training steps to perform. Overrides 117 | :obj:`num_train_epochs`. 118 | lr_scheduler_type (:obj:`str` or :class:`~transformers.SchedulerType`, `optional`, defaults to :obj:`"linear"`): 119 | The scheduler type to use. See the documentation of :class:`~transformers.SchedulerType` for all possible 120 | values. 121 | warmup_steps (:obj:`int`, `optional`, defaults to 0): 122 | Number of steps used for a linear warmup from 0 to :obj:`learning_rate`. 123 | logging_dir (:obj:`str`, `optional`): 124 | `TensorBoard `__ log directory. Will default to 125 | `runs/**CURRENT_DATETIME_HOSTNAME**`. 126 | logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`): 127 | Whether to log and evaluate the first :obj:`global_step` or not. 128 | logging_steps (:obj:`int`, `optional`, defaults to 500): 129 | Number of update steps between two logs. 130 | save_steps (:obj:`int`, `optional`, defaults to 500): 131 | Number of updates steps before two checkpoint saves. 132 | save_total_limit (:obj:`int`, `optional`): 133 | If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in 134 | :obj:`output_dir`. 135 | no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`): 136 | Whether to not use CUDA even when it is available or not. 137 | seed (:obj:`int`, `optional`, defaults to 42): 138 | Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the 139 | :func:`~transformers.Trainer.model_init` function to instantiate the model if it has some randomly 140 | initialized parameters. 141 | fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`): 142 | Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training. 143 | fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'): 144 | For :obj:`fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details 145 | on the `Apex documentation `__. 146 | fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`): 147 | The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or 148 | :obj:`"apex"`. 
:obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the 149 | other choices will force the requested backend. 150 | local_rank (:obj:`int`, `optional`, defaults to -1): 151 | Rank of the process during distributed training. 152 | tpu_num_cores (:obj:`int`, `optional`): 153 | When training on TPU, the number of TPU cores (automatically passed by launcher script). 154 | debug (:obj:`bool`, `optional`, defaults to :obj:`False`): 155 | When training on TPU, whether to print debug metrics or not. 156 | dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`): 157 | Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) 158 | or not. 159 | eval_steps (:obj:`int`, `optional`): 160 | Number of update steps between two evaluations if :obj:`evaluation_strategy="steps"`. Will default to the 161 | same value as :obj:`logging_steps` if not set. 162 | dataloader_num_workers (:obj:`int`, `optional`, defaults to 0): 163 | Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the 164 | main process. 165 | past_index (:obj:`int`, `optional`, defaults to -1): 166 | Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc`XLNet <../model_doc/xlnet>` can 167 | make use of the past hidden states for their predictions. If this argument is set to a positive int, the 168 | ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model 169 | at the next training step under the keyword argument ``mems``. 170 | run_name (:obj:`str`, `optional`): 171 | A descriptor for the run. Typically used for `wandb `_ logging. 172 | disable_tqdm (:obj:`bool`, `optional`): 173 | Whether or not to disable the tqdm progress bars and table of metrics produced by 174 | :class:`~transformers.notebook.NotebookTrainingTracker` in Jupyter Notebooks. Will default to :obj:`True` 175 | if the logging level is set to warn or lower (default), :obj:`False` otherwise. 176 | remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`): 177 | If using :obj:`datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the 178 | model forward method. 179 | 180 | (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.) 181 | label_names (:obj:`List[str]`, `optional`): 182 | The list of keys in your dictionary of inputs that correspond to the labels. 183 | 184 | Will eventually default to :obj:`["labels"]` except if the model used is one of the 185 | :obj:`XxxForQuestionAnswering` in which case it will default to :obj:`["start_positions", 186 | "end_positions"]`. 187 | load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`): 188 | Whether or not to load the best model found during training at the end of training. 189 | 190 | .. note:: 191 | 192 | When set to :obj:`True`, the parameters :obj:`save_steps` will be ignored and the model will be saved 193 | after each evaluation. 194 | metric_for_best_model (:obj:`str`, `optional`): 195 | Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different 196 | models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`. 197 | Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation 198 | loss). 
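As a concrete aside to the arguments documented above, the way per-device batch size, gradient accumulation, `num_train_epochs`/`max_steps` and warmup interact is easiest to see with numbers; everything below is illustrative and not taken from this repository's configuration.

```python
import math

num_examples = 14_009                 # hypothetical training-set size
per_device_bs, n_gpu = 8, 1
grad_accum_steps = 4
num_train_epochs = 3.0
warmup_ratio = 0.1                    # warmup_steps could equally be given as an absolute count

samples_per_update = per_device_bs * n_gpu * grad_accum_steps        # effective batch size per optimizer step
updates_per_epoch = math.ceil(num_examples / samples_per_update)
t_total = int(updates_per_epoch * num_train_epochs)                  # what a positive max_steps would override
warmup_steps = int(t_total * warmup_ratio)                           # length of the linear warmup phase
print(samples_per_update, updates_per_epoch, t_total, warmup_steps)  # 32 438 1314 131
```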
199 | 200 | If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to 201 | :obj:`False` if your metric is better when lower. 202 | greater_is_better (:obj:`bool`, `optional`): 203 | Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better 204 | models should have a greater metric or not. Will default to: 205 | 206 | - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or 207 | :obj:`"eval_loss"`. 208 | - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`. 209 | ignore_skip_data (:obj:`bool`, `optional`, defaults to :obj:`False`): 210 | When resuming training, whether or not to skip the epochs and batches to get the data loading at the same 211 | stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping 212 | step can take a long time) but will not yield the same results as the interrupted training would have. 213 | sharded_ddp (:obj:`bool`, `optional`, defaults to :obj:`False`): 214 | Use Sharded DDP training from `FairScale `__ (in distributed 215 | training only). This is an experimental feature. 216 | deepspeed (:obj:`str`, `optional`): 217 | Use `Deepspeed `__. This is an experimental feature and its API may 218 | evolve in the future. The value is the location of its json config file (usually ``ds_config.json``). 219 | label_smoothing_factor (:obj:`float`, `optional`, defaults to 0.0): 220 | The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded 221 | labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 - 222 | label_smoothing_factor + label_smoothing_factor/num_labels` respectively. 223 | adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`): 224 | Whether or not to use the :class:`~transformers.Adafactor` optimizer instead of 225 | :class:`~transformers.AdamW`. 226 | group_by_length (:obj:`bool`, `optional`, defaults to :obj:`False`): 227 | Whether or not to group together samples of roughly the same legnth in the training dataset (to minimize 228 | padding applied and be more efficient). Only useful if applying dynamic padding. 229 | report_to (:obj:`List[str]`, `optional`, defaults to the list of integrations platforms installed): 230 | The list of integrations to report the results and logs to. Supported platforms are :obj:`"azure_ml"`, 231 | :obj:`"comet_ml"`, :obj:`"mlflow"`, :obj:`"tensorboard"` and :obj:`"wandb"`. 232 | ddp_find_unused_parameters (:obj:`bool`, `optional`): 233 | When using distributed training, the value of the flag :obj:`find_unused_parameters` passed to 234 | :obj:`DistributedDataParallel`. Will default to :obj:`False` if gradient checkpointing is used, :obj:`True` 235 | otherwise. 236 | dataloader_pin_memory (:obj:`bool`, `optional`, defaults to :obj:`True`)): 237 | Whether you want to pin memory in data loaders or not. Will default to :obj:`True`. 238 | """ 239 | 240 | output_dir: Optional[str] = field( 241 | default=None, 242 | metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, 243 | ) 244 | overwrite_output_dir: bool = field( 245 | default=False, 246 | metadata={ 247 | "help": ( 248 | "Overwrite the content of the output directory." 249 | "Use this to continue training if output_dir points to a checkpoint directory." 
250 | ) 251 | }, 252 | ) 253 | 254 | do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) 255 | do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."}) 256 | do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) 257 | evaluation_strategy: EvaluationStrategy = field( 258 | default="no", 259 | metadata={"help": "The evaluation strategy to use."}, 260 | ) 261 | prediction_loss_only: bool = field( 262 | default=False, 263 | metadata={"help": "When performing evaluation and predictions, only returns the loss."}, 264 | ) 265 | 266 | per_device_train_batch_size: int = field( 267 | default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."} 268 | ) 269 | per_device_eval_batch_size: int = field( 270 | default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."} 271 | ) 272 | 273 | per_gpu_train_batch_size: Optional[int] = field( 274 | default=None, 275 | metadata={ 276 | "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. " 277 | "Batch size per GPU/TPU core/CPU for training." 278 | }, 279 | ) 280 | per_gpu_eval_batch_size: Optional[int] = field( 281 | default=None, 282 | metadata={ 283 | "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred." 284 | "Batch size per GPU/TPU core/CPU for evaluation." 285 | }, 286 | ) 287 | 288 | gradient_accumulation_steps: int = field( 289 | default=1, 290 | metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, 291 | ) 292 | eval_accumulation_steps: Optional[int] = field( 293 | default=None, 294 | metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."}, 295 | ) 296 | 297 | learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) 298 | weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."}) 299 | adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) 300 | adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) 301 | adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) 302 | max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) 303 | 304 | num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."}) 305 | max_steps: int = field( 306 | default=-1, 307 | metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, 308 | ) 309 | lr_scheduler_type: SchedulerType = field( 310 | default="linear", 311 | metadata={"help": "The scheduler type to use."}, 312 | ) 313 | warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) 314 | 315 | logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."}) 316 | logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) 317 | logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) 318 | save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) 319 | save_total_limit: Optional[int] = field( 320 | default=None, 321 | metadata={ 322 | "help": ( 323 | "Limit the total amount of checkpoints." 
324 | "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" 325 | ) 326 | }, 327 | ) 328 | no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) 329 | seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) 330 | 331 | fp16: bool = field( 332 | default=False, 333 | metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA Apex) instead of 32-bit"}, 334 | ) 335 | fp16_opt_level: str = field( 336 | default="O1", 337 | metadata={ 338 | "help": ( 339 | "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 340 | "See details at https://nvidia.github.io/apex/amp.html" 341 | ) 342 | }, 343 | ) 344 | fp16_backend: str = field( 345 | default="auto", 346 | metadata={"help": "The backend to be used for mixed precision.", "choices": ["auto", "amp", "apex"]}, 347 | ) 348 | local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) 349 | 350 | tpu_num_cores: Optional[int] = field( 351 | default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"} 352 | ) 353 | tpu_metrics_debug: bool = field( 354 | default=False, 355 | metadata={"help": "Deprecated, the use of `--debug` is preferred. TPU: Whether to print debug metrics"}, 356 | ) 357 | debug: bool = field(default=False, metadata={"help": "Whether to print debug metrics on TPU"}) 358 | 359 | dataloader_drop_last: bool = field( 360 | default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} 361 | ) 362 | eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."}) 363 | dataloader_num_workers: int = field( 364 | default=0, 365 | metadata={ 366 | "help": "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process." 367 | }, 368 | ) 369 | 370 | past_index: int = field( 371 | default=-1, 372 | metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."}, 373 | ) 374 | 375 | run_name: Optional[str] = field( 376 | default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."} 377 | ) 378 | disable_tqdm: Optional[bool] = field( 379 | default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."} 380 | ) 381 | 382 | remove_unused_columns: Optional[bool] = field( 383 | default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} 384 | ) 385 | label_names: Optional[List[str]] = field( 386 | default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."} 387 | ) 388 | 389 | load_best_model_at_end: Optional[bool] = field( 390 | default=False, 391 | metadata={"help": "Whether or not to load the best model found during training at the end of training."}, 392 | ) 393 | metric_for_best_model: Optional[str] = field( 394 | default=None, metadata={"help": "The metric to use to compare two different models."} 395 | ) 396 | greater_is_better: Optional[bool] = field( 397 | default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."} 398 | ) 399 | ignore_data_skip: bool = field( 400 | default=False, 401 | metadata={ 402 | "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data." 
403 | }, 404 | ) 405 | sharded_ddp: bool = field( 406 | default=False, 407 | metadata={"help": "Whether or not to use sharded DDP training (in distributed training only)."}, 408 | ) 409 | deepspeed: Optional[str] = field( 410 | default=None, 411 | metadata={"help": "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json)"}, 412 | ) 413 | label_smoothing_factor: float = field( 414 | default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."} 415 | ) 416 | adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."}) 417 | group_by_length: bool = field( 418 | default=False, 419 | metadata={"help": "Whether or not to group samples of roughly the same length together when batching."}, 420 | ) 421 | report_to: Optional[List[str]] = field( 422 | default=None, metadata={"help": "The list of integrations to report the results and logs to."} 423 | ) 424 | ddp_find_unused_parameters: Optional[bool] = field( 425 | default=None, 426 | metadata={ 427 | "help": "When using distributed training, the value of the flag `find_unused_parameters` passed to " 428 | "`DistributedDataParallel`." 429 | }, 430 | ) 431 | dataloader_pin_memory: bool = field( 432 | default=True, metadata={"help": "Whether or not to pin memory for DataLoader."} 433 | ) 434 | _n_gpu: int = field(init=False, repr=False, default=-1) 435 | 436 | def __post_init__(self): 437 | if self.output_dir is None and os.getenv("SM_OUTPUT_DATA_DIR") is None: 438 | raise ValueError( 439 | "`output_dir` is only optional if it can get inferred from the environment. Please set a value for " 440 | "`output_dir`." 441 | ) 442 | elif os.getenv("SM_OUTPUT_DATA_DIR") is not None: 443 | if self.output_dir is not None: 444 | logger.warn( 445 | "`output_dir` is overwritten by the env variable 'SM_OUTPUT_DATA_DIR' " 446 | f"({os.getenv('SM_OUTPUT_DATA_DIR')})." 447 | ) 448 | self.output_dir = os.getenv("SM_OUTPUT_DATA_DIR") 449 | if self.disable_tqdm is None: 450 | self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN 451 | self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy) 452 | self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) 453 | if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO: 454 | self.do_eval = True 455 | if self.eval_steps is None: 456 | self.eval_steps = self.logging_steps 457 | 458 | if self.load_best_model_at_end and self.metric_for_best_model is None: 459 | self.metric_for_best_model = "loss" 460 | if self.greater_is_better is None and self.metric_for_best_model is not None: 461 | self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"] 462 | if self.run_name is None: 463 | self.run_name = self.output_dir 464 | 465 | if is_torch_available() and self.device.type != "cuda" and self.fp16: 466 | raise ValueError("Mixed precision training with AMP or APEX (`--fp16`) can only be used on CUDA devices.") 467 | if self.report_to is None: 468 | # Import at runtime to avoid a circular import. 469 | from transformers.integrations import get_available_reporting_integrations 470 | 471 | self.report_to = get_available_reporting_integrations() 472 | 473 | def __repr__(self): 474 | # We override the default repr to remove deprecated arguments from the repr. This method should be removed once 475 | # those deprecated arguments are removed form TrainingArguments. 
(TODO: v5) 476 | self_as_dict = asdict(self) 477 | del self_as_dict["per_gpu_train_batch_size"] 478 | del self_as_dict["per_gpu_eval_batch_size"] 479 | attrs_as_str = [f"{k}={v}" for k, v in self_as_dict.items()] 480 | return f"{self.__class__.__name__}({', '.join(attrs_as_str)})" 481 | 482 | @property 483 | def train_batch_size(self) -> int: 484 | """ 485 | The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training). 486 | """ 487 | if self.per_gpu_train_batch_size: 488 | logger.warning( 489 | "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " 490 | "version. Using `--per_device_train_batch_size` is preferred." 491 | ) 492 | per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size 493 | train_batch_size = per_device_batch_size * max(1, self.n_gpu) 494 | return train_batch_size 495 | 496 | @property 497 | def eval_batch_size(self) -> int: 498 | """ 499 | The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training). 500 | """ 501 | if self.per_gpu_eval_batch_size: 502 | logger.warning( 503 | "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " 504 | "version. Using `--per_device_eval_batch_size` is preferred." 505 | ) 506 | per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size 507 | eval_batch_size = per_device_batch_size * max(1, self.n_gpu) 508 | return eval_batch_size 509 | 510 | @cached_property 511 | @torch_required 512 | def _setup_devices(self) -> "torch.device": 513 | logger.info("PyTorch: setting up devices") 514 | if self.no_cuda: 515 | device = torch.device("cpu") 516 | self._n_gpu = 0 517 | elif is_torch_tpu_available(): 518 | device = xm.xla_device() 519 | self._n_gpu = 0 520 | elif self.deepspeed: 521 | # deepspeed performs its own DDP internally, and requires the program to be started with: 522 | # deepspeed ./program.py 523 | # rather than: 524 | # python -m torch.distributed.launch --nproc_per_node=2 ./program.py 525 | from transformers.integrations import is_deepspeed_available 526 | 527 | if not is_deepspeed_available(): 528 | raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.") 529 | import deepspeed 530 | 531 | deepspeed.init_distributed() 532 | device = torch.device("cuda", self.local_rank) 533 | self._n_gpu = 1 534 | elif self.local_rank == -1: 535 | # if n_gpu is > 1 we'll use nn.DataParallel. 536 | # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` 537 | # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will 538 | # trigger an error that a device index is missing. Index 0 takes into account the 539 | # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` 540 | # will use the first GPU in that env, i.e. GPU#1 541 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 542 | # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at 543 | # the default value. 544 | self._n_gpu = torch.cuda.device_count() 545 | else: 546 | # Here, we'll use torch.distributed. 
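One consequence of the `train_batch_size`/`eval_batch_size` properties above is that the deprecated `per_gpu_*` arguments, when set, still take precedence over the `per_device_*` ones, with only a logged warning. A quick check, assuming the repository root is on `sys.path`:

```python
from data.code.util.pretrain_utils.trainer_args import TrainingArguments

args = TrainingArguments(output_dir="tmp",                 # placeholder output directory
                         per_device_train_batch_size=16,
                         per_gpu_train_batch_size=8)
# the deprecated per_gpu value wins and a deprecation warning is logged
print(args.train_batch_size)                               # 8 * max(1, n_gpu)
```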
547 | # Initializes the distributed backend which will take care of synchronizing nodes/GPUs 548 | torch.distributed.init_process_group(backend="nccl") 549 | device = torch.device("cuda", self.local_rank) 550 | self._n_gpu = 1 551 | 552 | if device.type == "cuda": 553 | torch.cuda.set_device(device) 554 | 555 | return device 556 | 557 | @property 558 | @torch_required 559 | def device(self) -> "torch.device": 560 | """ 561 | The device used by this process. 562 | """ 563 | return self._setup_devices 564 | 565 | @property 566 | @torch_required 567 | def n_gpu(self): 568 | """ 569 | The number of GPUs used by this process. 570 | 571 | Note: 572 | This will only be greater than one when you have multiple GPUs available but are not using distributed 573 | training. For distributed training, it will always be 1. 574 | """ 575 | # Make sure `self._n_gpu` is properly setup. 576 | _ = self._setup_devices 577 | return self._n_gpu 578 | 579 | @property 580 | @torch_required 581 | def parallel_mode(self): 582 | """ 583 | The current mode used for parallelism if multiple GPUs/TPU cores are available. One of: 584 | 585 | - :obj:`ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU). 586 | - :obj:`ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses :obj:`torch.nn.DataParallel`). 587 | - :obj:`ParallelMode.DISTRIBUTED`: several GPUs, each ahving its own process (uses 588 | :obj:`torch.nn.DistributedDataParallel`). 589 | - :obj:`ParallelMode.TPU`: several TPU cores. 590 | """ 591 | if is_torch_tpu_available(): 592 | return ParallelMode.TPU 593 | elif self.local_rank != -1: 594 | return ParallelMode.DISTRIBUTED 595 | elif self.n_gpu > 1: 596 | return ParallelMode.NOT_DISTRIBUTED 597 | else: 598 | return ParallelMode.NOT_PARALLEL 599 | 600 | def to_dict(self): 601 | """ 602 | Serializes this instance while replace `Enum` by their values (for JSON serialization support). 603 | """ 604 | d = asdict(self) 605 | for k, v in d.items(): 606 | if isinstance(v, Enum): 607 | d[k] = v.value 608 | return d 609 | 610 | def to_json_string(self): 611 | """ 612 | Serializes this instance to a JSON string. 
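A minimal sketch of instantiating this local `TrainingArguments` copy directly and reading the derived properties shown above; the field values are illustrative and the import assumes the repository root is on `sys.path`:

```python
from data.code.util.pretrain_utils.trainer_args import TrainingArguments

args = TrainingArguments(
    output_dir="output/pretrain",          # placeholder path
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=3.0,
    logging_steps=100,
)
print(args.device, args.n_gpu, args.parallel_mode)   # resolved by _setup_devices above
print(args.train_batch_size)                         # per-device size scaled by max(1, n_gpu)
print(args.to_json_string())                         # JSON view of the (non-deprecated) fields
```

In the pre-training scripts these arguments would normally be populated from the command line via `transformers.HfArgumentParser`, as the class docstring notes.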
613 | """ 614 | return json.dumps(self.to_dict(), indent=2) 615 | 616 | def to_sanitized_dict(self) -> Dict[str, Any]: 617 | """ 618 | Sanitized serialization to use with TensorBoard’s hparams 619 | """ 620 | d = self.to_dict() 621 | d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}} 622 | 623 | valid_types = [bool, int, float, str] 624 | if is_torch_available(): 625 | valid_types.append(torch.Tensor) 626 | 627 | return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} 628 | 629 | 630 | class ParallelMode(Enum): 631 | NOT_PARALLEL = "not_parallel" 632 | NOT_DISTRIBUTED = "not_distributed" 633 | DISTRIBUTED = "distributed" 634 | SAGEMAKER_DISTRIBUTED = "sm_distributed" 635 | TPU = "tpu" 636 | -------------------------------------------------------------------------------- /data/code/util/modeling/modeling_nezha/modeling.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import logging 4 | import torch 5 | 6 | from torch import nn 7 | from torch.nn import CrossEntropyLoss, MSELoss 8 | 9 | from .configuration import NeZhaConfig 10 | from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward 11 | from transformers.modeling_utils import PreTrainedModel, prune_linear_layer 12 | from transformers.models.bert.modeling_bert import ( 13 | BertOutput, 14 | BertPooler, 15 | BertSelfOutput, 16 | BertIntermediate, 17 | BertOnlyMLMHead, 18 | BertOnlyNSPHead, 19 | BertLMPredictionHead, 20 | BERT_START_DOCSTRING, 21 | BERT_INPUTS_DOCSTRING, 22 | ) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | _CONFIG_FOR_DOC = "NeZhaConfig" 27 | _TOKENIZER_FOR_DOC = "NeZhaTokenizer" 28 | 29 | NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = [] 30 | NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP = {} 31 | 32 | 33 | def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): 34 | """Load tf checkpoints in a pytorch model.""" 35 | try: 36 | import re 37 | import numpy as np 38 | import tensorflow as tf 39 | except ImportError: 40 | logger.error( 41 | "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " 42 | "https://www.tensorflow.org/install/ for installation instructions." 
43 | ) 44 | raise 45 | 46 | tf_path = os.path.abspath(tf_checkpoint_path) 47 | logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) 48 | # Load weights from TF model 49 | init_vars = tf.train.list_variables(tf_path) 50 | names = [] 51 | arrays = [] 52 | for name, shape in init_vars: 53 | # logger.info("Loading TF weight {} with shape {}".format(name, shape)) 54 | array = tf.train.load_variable(tf_path, name) 55 | names.append(name) 56 | arrays.append(array) 57 | 58 | for name, array in zip(names, arrays): 59 | name = name.split("/") 60 | # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v 61 | # which are not required for using pretrained model 62 | if any( 63 | n in ["adam_v", "adam_m", "lamb_m", "lamb_v", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", 64 | "global_step", "good_steps", "loss_scale", 'bad_steps'] 65 | for n in name 66 | ): 67 | logger.info("Skipping {}".format("/".join(name))) 68 | continue 69 | pointer = model 70 | for m_name in name: 71 | if re.fullmatch(r"[A-Za-z]+_\d+", m_name): 72 | scope_names = re.split(r"_(\d+)", m_name) 73 | else: 74 | scope_names = [m_name] 75 | if scope_names[0] == "kernel" or scope_names[0] == "gamma": 76 | pointer = getattr(pointer, "weight") 77 | elif scope_names[0] == "output_bias" or scope_names[0] == "beta": 78 | pointer = getattr(pointer, "bias") 79 | elif scope_names[0] == "output_weights": 80 | pointer = getattr(pointer, "weight") 81 | elif scope_names[0] == "squad": 82 | pointer = getattr(pointer, "classifier") 83 | else: 84 | try: 85 | pointer = getattr(pointer, scope_names[0]) 86 | except AttributeError: 87 | logger.info("Skipping {}".format("/".join(name))) 88 | continue 89 | if len(scope_names) >= 2: 90 | num = int(scope_names[1]) 91 | pointer = pointer[num] 92 | if m_name[-11:] == "_embeddings": 93 | pointer = getattr(pointer, "weight") 94 | elif m_name == "kernel": 95 | array = np.transpose(array) 96 | try: 97 | assert ( 98 | pointer.shape == array.shape 99 | ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" 100 | except AssertionError as e: 101 | e.args += (pointer.shape, array.shape) 102 | raise 103 | logger.info("Initialize PyTorch weight {}".format(name)) 104 | pointer.data = torch.from_numpy(array) 105 | return model 106 | 107 | 108 | class NeZhaEmbeddings(nn.Module): 109 | """ 110 | Construct the embeddings from word, position and token_type embeddings. 
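`load_tf_weights_in_nezha` above maps a TensorFlow NEZHA checkpoint onto the PyTorch module tree. A hedged sketch of a one-off conversion; the file names are placeholders, `NeZhaModel` is defined further down in this file, and `NeZhaConfig` is assumed to behave like a standard `transformers` configuration class:

```python
import torch

from data.code.util.modeling.modeling_nezha.configuration import NeZhaConfig
from data.code.util.modeling.modeling_nezha.modeling import NeZhaModel, load_tf_weights_in_nezha

config = NeZhaConfig.from_json_file("nezha-tf/bert_config.json")   # placeholder path
model = NeZhaModel(config)
load_tf_weights_in_nezha(model, config, "nezha-tf/model.ckpt")     # TF checkpoint prefix (placeholder)
torch.save(model.state_dict(), "nezha-pytorch/pytorch_model.bin")  # placeholder output path
```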
111 | """ 112 | 113 | def __init__(self, config): 114 | super().__init__() 115 | self.use_relative_position = config.use_relative_position 116 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) 117 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) 118 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load 119 | # any TensorFlow checkpoint file 120 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 121 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 122 | 123 | def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): 124 | if input_ids is not None: 125 | input_shape = input_ids.size() 126 | else: 127 | input_shape = inputs_embeds.size()[:-1] 128 | device = input_ids.device if input_ids is not None else inputs_embeds.device 129 | if token_type_ids is None: 130 | token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) 131 | if inputs_embeds is None: 132 | inputs_embeds = self.word_embeddings(input_ids) 133 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 134 | 135 | embeddings = inputs_embeds + token_type_embeddings 136 | 137 | # embeddings = inputs_embeds + token_type_embeddings 138 | embeddings = self.LayerNorm(embeddings) 139 | embeddings = self.dropout(embeddings) 140 | return embeddings 141 | 142 | 143 | def relative_position_encoding(depth, max_length=512, max_relative_position=127): 144 | vocab_size = max_relative_position * 2 + 1 145 | range_vec = torch.arange(max_length) 146 | range_mat = range_vec.repeat(max_length).view(max_length, max_length) 147 | distance_mat = range_mat - torch.t(range_mat) 148 | distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) 149 | final_mat = distance_mat_clipped + max_relative_position 150 | 151 | embeddings_table = torch.zeros(vocab_size, depth) 152 | position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1) 153 | div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) 154 | embeddings_table[:, 0::2] = torch.sin(position * div_term) 155 | embeddings_table[:, 1::2] = torch.cos(position * div_term) 156 | embeddings_table = embeddings_table.unsqueeze(0).transpose(0, 1).squeeze(1) 157 | 158 | flat_relative_positions_matrix = final_mat.view(-1) 159 | one_hot_relative_positions_matrix = torch.nn.functional.one_hot(flat_relative_positions_matrix, 160 | num_classes=vocab_size).float() 161 | positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) 162 | my_shape = list(final_mat.size()) 163 | my_shape.append(depth) 164 | positions_encoding = positions_encoding.view(my_shape) 165 | return positions_encoding 166 | 167 | 168 | class NeZhaSelfAttention(nn.Module): 169 | def __init__(self, config): 170 | super().__init__() 171 | if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): 172 | raise ValueError( 173 | "The hidden size (%d) is not a multiple of the number of attention " 174 | "heads (%d)" % (config.hidden_size, config.num_attention_heads) 175 | ) 176 | self.output_attentions = config.output_attentions 177 | 178 | self.num_attention_heads = config.num_attention_heads 179 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 180 | self.all_head_size = self.num_attention_heads * self.attention_head_size 181 | 182 | self.query = 
nn.Linear(config.hidden_size, self.all_head_size) 183 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 184 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 185 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 186 | 187 | self.relative_positions_encoding = relative_position_encoding(max_length=config.max_position_embeddings, 188 | depth=self.attention_head_size, 189 | max_relative_position=config.max_relative_position) 190 | 191 | def transpose_for_scores(self, x): 192 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) 193 | x = x.view(*new_x_shape) 194 | return x.permute(0, 2, 1, 3) 195 | 196 | def forward( 197 | self, 198 | hidden_states, 199 | attention_mask=None, 200 | head_mask=None, 201 | encoder_hidden_states=None, 202 | encoder_attention_mask=None, 203 | ): 204 | 205 | mixed_query_layer = self.query(hidden_states) 206 | 207 | # If this is instantiated as a cross-attention module, the keys 208 | # and values come from an encoder; the attention mask needs to be 209 | # such that the encoder's padding tokens are not attended to. 210 | if encoder_hidden_states is not None: 211 | mixed_key_layer = self.key(encoder_hidden_states) 212 | mixed_value_layer = self.value(encoder_hidden_states) 213 | attention_mask = encoder_attention_mask 214 | else: 215 | mixed_key_layer = self.key(hidden_states) 216 | mixed_value_layer = self.value(hidden_states) 217 | 218 | query_layer = self.transpose_for_scores(mixed_query_layer) 219 | key_layer = self.transpose_for_scores(mixed_key_layer) 220 | value_layer = self.transpose_for_scores(mixed_value_layer) 221 | 222 | # Take the dot product between "query" and "key" to get the raw attention scores. 223 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) 224 | 225 | batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() 226 | 227 | relations_keys = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device) 228 | query_layer_t = query_layer.permute(2, 0, 1, 3) 229 | 230 | query_layer_r = query_layer_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, 231 | self.attention_head_size) 232 | key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) 233 | key_position_scores_r = key_position_scores.view(from_seq_length, batch_size, 234 | num_attention_heads, from_seq_length) 235 | key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) 236 | attention_scores = attention_scores + key_position_scores_r_t 237 | 238 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 239 | if attention_mask is not None: 240 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 241 | attention_scores = attention_scores + attention_mask 242 | 243 | # Normalize the attention scores to probabilities. 244 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 245 | 246 | # This is actually dropping out entire tokens to attend to, which might 247 | # seem a bit unusual, but is taken from the original Transformer paper. 
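The relative-position bias used above comes from `relative_position_encoding`, whose table depends only on the clipped offset j − i; the self-attention adds it to both the key scores and the value aggregation. A quick shape and behaviour check (import assumes the repository root is on `sys.path`):

```python
import torch

from data.code.util.modeling.modeling_nezha.modeling import relative_position_encoding

# pe[i, j] is the sinusoidal embedding of clamp(j - i, -64, +64)
pe = relative_position_encoding(depth=64, max_length=128, max_relative_position=64)
print(pe.shape)                               # torch.Size([128, 128, 64])
print(torch.allclose(pe[0, 10], pe[5, 15]))   # True: only the relative offset matters
print(torch.allclose(pe[0, 64], pe[0, 127]))  # True: offsets beyond +/-64 are clipped
```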
248 | attention_probs = self.dropout(attention_probs) 249 | 250 | # Mask heads if we want to 251 | if head_mask is not None: 252 | attention_probs = attention_probs * head_mask 253 | 254 | context_layer = torch.matmul(attention_probs, value_layer) 255 | 256 | relations_values = self.relative_positions_encoding[:to_seq_length, :to_seq_length, :].to(hidden_states.device) 257 | attention_probs_t = attention_probs.permute(2, 0, 1, 3) 258 | attentions_probs_r = attention_probs_t.contiguous().view(from_seq_length, batch_size * num_attention_heads, 259 | to_seq_length) 260 | value_position_scores = torch.matmul(attentions_probs_r, relations_values) 261 | value_position_scores_r = value_position_scores.view(from_seq_length, batch_size, 262 | num_attention_heads, self.attention_head_size) 263 | value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) 264 | context_layer = context_layer + value_position_scores_r_t 265 | 266 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 267 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 268 | context_layer = context_layer.view(*new_context_layer_shape) 269 | 270 | outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) 271 | return outputs 272 | 273 | 274 | class NeZhaAttention(nn.Module): 275 | def __init__(self, config): 276 | super().__init__() 277 | self.self = NeZhaSelfAttention(config) 278 | self.output = BertSelfOutput(config) 279 | self.pruned_heads = set() 280 | 281 | def prune_heads(self, heads): 282 | if len(heads) == 0: 283 | return 284 | mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) 285 | heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads 286 | for head in heads: 287 | # Compute how many pruned heads are before the head and move the index accordingly 288 | head = head - sum(1 if h < head else 0 for h in self.pruned_heads) 289 | mask[head] = 0 290 | mask = mask.view(-1).contiguous().eq(1) 291 | index = torch.arange(len(mask))[mask].long() 292 | # Prune linear layers 293 | self.self.query = prune_linear_layer(self.self.query, index) 294 | self.self.key = prune_linear_layer(self.self.key, index) 295 | self.self.value = prune_linear_layer(self.self.value, index) 296 | self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) 297 | # Update hyper params and store pruned heads 298 | self.self.num_attention_heads = self.self.num_attention_heads - len(heads) 299 | self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads 300 | self.pruned_heads = self.pruned_heads.union(heads) 301 | 302 | def forward( 303 | self, 304 | hidden_states, 305 | attention_mask=None, 306 | head_mask=None, 307 | encoder_hidden_states=None, 308 | encoder_attention_mask=None, 309 | ): 310 | self_outputs = self.self( 311 | hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask 312 | ) 313 | attention_output = self.output(self_outputs[0], hidden_states) 314 | outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them 315 | return outputs 316 | 317 | 318 | class NeZhaLayer(nn.Module): 319 | def __init__(self, config): 320 | super().__init__() 321 | self.attention = NeZhaAttention(config) 322 | self.is_decoder = config.is_decoder 323 | if self.is_decoder: 324 | self.crossattention = NeZhaAttention(config) 325 | self.intermediate = BertIntermediate(config) 326 | self.output = BertOutput(config) 327 | 328 | def forward( 329 | 
self, 330 | hidden_states, 331 | attention_mask=None, 332 | head_mask=None, 333 | encoder_hidden_states=None, 334 | encoder_attention_mask=None, 335 | ): 336 | self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) 337 | attention_output = self_attention_outputs[0] 338 | outputs = self_attention_outputs[1:] # add self attentions if we output attention weights 339 | 340 | if self.is_decoder and encoder_hidden_states is not None: 341 | cross_attention_outputs = self.crossattention( 342 | attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask 343 | ) 344 | attention_output = cross_attention_outputs[0] 345 | outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights 346 | 347 | intermediate_output = self.intermediate(attention_output) 348 | layer_output = self.output(intermediate_output, attention_output) 349 | outputs = (layer_output,) + outputs 350 | return outputs 351 | 352 | 353 | class NeZhaEncoder(nn.Module): 354 | def __init__(self, config): 355 | super().__init__() 356 | self.output_attentions = config.output_attentions 357 | # self.output_hidden_states = config.output_hidden_states 358 | self.output_hidden_states = True 359 | self.layer = nn.ModuleList([NeZhaLayer(config) for _ in range(config.num_hidden_layers)]) 360 | 361 | def forward( 362 | self, 363 | hidden_states, 364 | attention_mask=None, 365 | head_mask=None, 366 | encoder_hidden_states=None, 367 | encoder_attention_mask=None, 368 | ): 369 | all_hidden_states = () 370 | all_attentions = () 371 | for i, layer_module in enumerate(self.layer): 372 | if self.output_hidden_states: 373 | all_hidden_states = all_hidden_states + (hidden_states,) 374 | layer_outputs = layer_module( 375 | hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask 376 | ) 377 | hidden_states = layer_outputs[0] 378 | if self.output_attentions: 379 | all_attentions = all_attentions + (layer_outputs[1],) 380 | # Add last layer 381 | if self.output_hidden_states: 382 | all_hidden_states = all_hidden_states + (hidden_states,) 383 | 384 | outputs = (hidden_states,) 385 | if self.output_hidden_states: 386 | outputs = outputs + (all_hidden_states,) 387 | if self.output_attentions: 388 | outputs = outputs + (all_attentions,) 389 | return outputs # last-layer hidden state, (all hidden states), (all attentions) 390 | 391 | 392 | class NeZhaPreTrainedModel(PreTrainedModel): 393 | """ An abstract class to handle weights initialization and 394 | a simple interface for downloading and loading pretrained models. 
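Two details of `NeZhaEncoder` above are easy to miss: `output_hidden_states` is hard-coded to `True`, and `head_mask` is indexed per layer, so callers must pass one entry per layer even when no heads are masked. A minimal smoke test, assuming `NeZhaConfig` accepts the usual BERT-style keyword arguments and provides defaults for the relative-position fields:

```python
import torch

from data.code.util.modeling.modeling_nezha.configuration import NeZhaConfig
from data.code.util.modeling.modeling_nezha.modeling import NeZhaEncoder

# small config purely for a shape check; the keyword names are assumed to match
config = NeZhaConfig(hidden_size=128, num_hidden_layers=2,
                     num_attention_heads=4, intermediate_size=256)
encoder = NeZhaEncoder(config)

hidden = torch.randn(2, 16, config.hidden_size)
outputs = encoder(hidden, attention_mask=None,
                  head_mask=[None] * config.num_hidden_layers)   # one (empty) mask per layer
last_hidden, all_hidden = outputs[0], outputs[1]
print(last_hidden.shape, len(all_hidden))                        # (2, 16, 128), num_hidden_layers + 1
```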
395 | """ 396 | config_class = NeZhaConfig 397 | pretrained_model_archive_map = NEZHA_PRETRAINED_MODEL_ARCHIVE_MAP 398 | load_tf_weights = load_tf_weights_in_nezha 399 | base_model_prefix = "bert" 400 | 401 | def _init_weights(self, module): 402 | """ Initialize the weights """ 403 | if isinstance(module, (nn.Linear, nn.Embedding)): 404 | # Slightly different from the TF version which uses truncated_normal for initialization 405 | # cf https://github.com/pytorch/pytorch/pull/5617 406 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 407 | elif isinstance(module, nn.LayerNorm): 408 | module.bias.data.zero_() 409 | module.weight.data.fill_(1.0) 410 | if isinstance(module, nn.Linear) and module.bias is not None: 411 | module.bias.data.zero_() 412 | 413 | 414 | @add_start_docstrings( 415 | "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", 416 | BERT_START_DOCSTRING, 417 | ) 418 | class NeZhaModel(NeZhaPreTrainedModel): 419 | """ 420 | The model can behave as an encoder (with only self-attention) as well 421 | as a decoder, in which case a layer of cross-attention is added between 422 | the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, 423 | Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. 424 | 425 | To behave as an decoder the model needs to be initialized with the 426 | :obj:`is_decoder` argument of the configuration set to :obj:`True`; an 427 | :obj:`encoder_hidden_states` is expected as an input to the forward pass. 428 | 429 | .. _`Attention is all you need`: 430 | https://arxiv.org/abs/1706.03762 431 | 432 | """ 433 | 434 | def __init__(self, config): 435 | super().__init__(config) 436 | self.config = config 437 | self.embeddings = NeZhaEmbeddings(config) 438 | self.encoder = NeZhaEncoder(config) 439 | self.pooler = BertPooler(config) 440 | self.init_weights() 441 | 442 | def get_input_embeddings(self): 443 | return self.embeddings.word_embeddings 444 | 445 | def set_input_embeddings(self, value): 446 | self.embeddings.word_embeddings = value 447 | 448 | def _prune_heads(self, heads_to_prune): 449 | """ Prunes heads of the model. 450 | heads_to_prune: dict of {layer_num: list of heads to prune in this layer} 451 | See base class PreTrainedModel 452 | """ 453 | for layer, heads in heads_to_prune.items(): 454 | self.encoder.layer[layer].attention.prune_heads(heads) 455 | 456 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 457 | def forward( 458 | self, 459 | input_ids=None, 460 | attention_mask=None, 461 | token_type_ids=None, 462 | head_mask=None, 463 | position_ids=None, 464 | inputs_embeds=None, 465 | encoder_hidden_states=None, 466 | encoder_attention_mask=None, 467 | ): 468 | r""" 469 | Return: 470 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 471 | last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): 472 | Sequence of hidden-states at the output of the last layer of the model. 473 | pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): 474 | Last layer hidden-state of the first token of the sequence (classification token) 475 | further processed by a Linear layer and a Tanh activation function. 
The Linear 476 | layer weights are trained from the next sentence prediction (classification) 477 | objective during pre-training. 478 | 479 | This output is usually *not* a good summary 480 | of the semantic content of the input, you're often better with averaging or pooling 481 | the sequence of hidden-states for the whole input sequence. 482 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 483 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 484 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 485 | 486 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 487 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 488 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 489 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 490 | 491 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 492 | heads. 493 | 494 | Examples:: 495 | 496 | from transformers import BertModel, BertTokenizer 497 | import torch 498 | 499 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 500 | model = BertModel.from_pretrained('bert-base-uncased') 501 | 502 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 503 | outputs = model(input_ids) 504 | 505 | last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 506 | 507 | """ 508 | 509 | if input_ids is not None and inputs_embeds is not None: 510 | raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") 511 | elif input_ids is not None: 512 | input_shape = input_ids.size() 513 | elif inputs_embeds is not None: 514 | input_shape = inputs_embeds.size()[:-1] 515 | else: 516 | raise ValueError("You have to specify either input_ids or inputs_embeds") 517 | 518 | device = input_ids.device if input_ids is not None else inputs_embeds.device 519 | 520 | if attention_mask is None: 521 | attention_mask = torch.ones(input_shape, device=device) 522 | if token_type_ids is None: 523 | token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) 524 | 525 | # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] 526 | # ourselves in which case we just need to make it broadcastable to all heads. 
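        # (Editor's note, not part of the original file.) Illustrative shape sketch, assuming the
        # standard `get_extended_attention_mask` behaviour inherited from `PreTrainedModel`: a 2D
        # padding mask such as
        #     attention_mask = torch.tensor([[1, 1, 1, 0]])        # (batch_size, seq_length)
        # is expanded to shape (batch_size, 1, 1, seq_length) holding 0.0 for kept positions and a
        # large negative value (e.g. -10000.0) for padded ones, so that it can simply be added to
        # the raw attention scores before the softmax.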
527 | extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( 528 | attention_mask, input_shape, self.device 529 | ) 530 | 531 | # If a 2D or 3D attention mask is provided for the cross-attention 532 | # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length] 533 | if self.config.is_decoder and encoder_hidden_states is not None: 534 | encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() 535 | encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) 536 | if encoder_attention_mask is None: 537 | encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) 538 | encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) 539 | else: 540 | encoder_extended_attention_mask = None 541 | 542 | # Prepare head mask if needed 543 | # 1.0 in head_mask indicates we keep the head 544 | # attention_probs has shape bsz x n_heads x N x N 545 | # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] 546 | # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] 547 | head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) 548 | 549 | embedding_output = self.embeddings( 550 | input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds 551 | ) 552 | encoder_outputs = self.encoder( 553 | embedding_output, 554 | attention_mask=extended_attention_mask, 555 | head_mask=head_mask, 556 | encoder_hidden_states=encoder_hidden_states, 557 | encoder_attention_mask=encoder_extended_attention_mask, 558 | ) 559 | sequence_output = encoder_outputs[0] 560 | pooled_output = self.pooler(sequence_output) 561 | 562 | outputs = (sequence_output, pooled_output,) + encoder_outputs[ 563 | 1: 564 | ] # add hidden_states and attentions if they are here 565 | return outputs # sequence_output, pooled_output, (hidden_states), (attentions) 566 | 567 | 568 | class BertPreTrainingHeads(nn.Module): 569 | def __init__(self, config): 570 | super().__init__() 571 | self.predictions = BertLMPredictionHead(config) 572 | self.seq_relationship = nn.Linear(config.hidden_size, 2) 573 | 574 | def forward(self, sequence_output, pooled_output): 575 | prediction_scores = self.predictions(sequence_output) 576 | seq_relationship_score = self.seq_relationship(pooled_output) 577 | return prediction_scores, seq_relationship_score 578 | 579 | 580 | @add_start_docstrings( 581 | """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and 582 | a `next sentence prediction (classification)` head.
""", 583 | BERT_START_DOCSTRING, 584 | ) 585 | class NeZhaForPreTraining(NeZhaPreTrainedModel): 586 | def __init__(self, config): 587 | super().__init__(config) 588 | self.bert = NeZhaModel(config) 589 | self.cls = BertPreTrainingHeads(config) 590 | self.init_weights() 591 | 592 | def get_output_embeddings(self): 593 | return self.cls.predictions.decoder 594 | 595 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 596 | def forward( 597 | self, 598 | input_ids=None, 599 | attention_mask=None, 600 | token_type_ids=None, 601 | head_mask=None, 602 | position_ids=None, 603 | inputs_embeds=None, 604 | labels=None, 605 | sentence_span_labels=None, 606 | ): 607 | 608 | outputs = self.bert( 609 | input_ids, 610 | attention_mask=attention_mask, 611 | token_type_ids=token_type_ids, 612 | head_mask=head_mask, 613 | inputs_embeds=inputs_embeds, 614 | ) 615 | 616 | sequence_output, pooled_output = outputs[:2] 617 | prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) 618 | # add hidden states and attention if they are here 619 | outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] 620 | 621 | if labels is not None and sentence_span_labels is not None: 622 | loss_fct = CrossEntropyLoss() 623 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) 624 | 625 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), sentence_span_labels.view(-1)) 626 | 627 | pseudo_labels = torch.argmax(torch.softmax(seq_relationship_score, -1), 1) 628 | pseudo_loss = loss_fct(seq_relationship_score.view(-1, 2), pseudo_labels.view(-1)) 629 | next_sentence_loss = next_sentence_loss + 0.5 * pseudo_loss 630 | 631 | total_loss = masked_lm_loss + next_sentence_loss 632 | outputs = (total_loss,) + outputs 633 | 634 | return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) 635 | 636 | 637 | @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) 638 | class NeZhaForMaskedLM(NeZhaPreTrainedModel): 639 | def __init__(self, config): 640 | super().__init__(config) 641 | self.bert = NeZhaModel(config) 642 | self.cls = BertOnlyMLMHead(config) 643 | self.init_weights() 644 | 645 | def get_output_embeddings(self): 646 | return self.cls.predictions.decoder 647 | 648 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 649 | def forward( 650 | self, 651 | input_ids=None, 652 | attention_mask=None, 653 | token_type_ids=None, 654 | head_mask=None, 655 | position_ids=None, 656 | inputs_embeds=None, 657 | encoder_hidden_states=None, 658 | encoder_attention_mask=None, 659 | labels=None, 660 | ): 661 | r""" 662 | masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 663 | Labels for computing the masked language modeling loss. 664 | Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) 665 | Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels 666 | in ``[0, ..., config.vocab_size]`` 667 | lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 668 | Labels for computing the left-to-right language modeling loss (next word prediction). 
669 | Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) 670 | Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels 671 | in ``[0, ..., config.vocab_size]`` 672 | 673 | Returns: 674 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 675 | masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 676 | Masked language modeling loss. 677 | ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): 678 | Next token prediction loss. 679 | prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) 680 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 681 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 682 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 683 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 684 | 685 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 686 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 687 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 688 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 689 | 690 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 691 | heads. 692 | 693 | Examples:: 694 | 695 | from transformers import BertTokenizer, BertForMaskedLM 696 | import torch 697 | 698 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 699 | model = BertForMaskedLM.from_pretrained('bert-base-uncased') 700 | 701 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 702 | outputs = model(input_ids, masked_lm_labels=input_ids) 703 | 704 | loss, prediction_scores = outputs[:2] 705 | 706 | """ 707 | outputs = self.bert( 708 | input_ids, 709 | attention_mask=attention_mask, 710 | token_type_ids=token_type_ids, 711 | head_mask=head_mask, 712 | inputs_embeds=inputs_embeds, 713 | encoder_hidden_states=encoder_hidden_states, 714 | encoder_attention_mask=encoder_attention_mask, 715 | ) 716 | 717 | sequence_output = outputs[0] 718 | prediction_scores = self.cls(sequence_output) 719 | outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here 720 | 721 | # Although this may seem awkward, BertForMaskedLM supports two scenarios: 722 | # 1. If a tensor that contains the indices of masked labels is provided, 723 | # the cross-entropy is the MLM cross-entropy that measures the likelihood 724 | # of predictions for masked words. 725 | # 2. If `lm_labels` is provided we are in a causal scenario where we 726 | # try to predict the next token for each input in the decoder. 
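        # (Editor's note, not part of the original file.) A small worked example of scenario 1,
        # assuming `labels` follows the usual MLM convention of -100 at non-masked positions
        # (the default `ignore_index` of `CrossEntropyLoss`): for a 6-token sequence in which only
        # position 3 was masked,
        #     labels = torch.tensor([[-100, -100, -100, 2045, -100, -100]])
        #     loss = CrossEntropyLoss()(prediction_scores.view(-1, vocab_size), labels.view(-1))
        # the loss reduces to the cross-entropy at that single masked position.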
727 | masked_lm_labels = None 728 | if labels is not None: 729 | loss_fct = CrossEntropyLoss() # -100 index = padding token 730 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) 731 | outputs = (masked_lm_loss,) + outputs 732 | return outputs # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions) 733 | 734 | def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): 735 | input_shape = input_ids.shape 736 | effective_batch_size = input_shape[0] 737 | 738 | # if the model is used as a decoder in an encoder-decoder model, the decoder attention mask is created on the fly 739 | if attention_mask is None: 740 | attention_mask = input_ids.new_ones(input_shape) 741 | 742 | # if the model does not use a causal mask, add a dummy token 743 | if self.config.is_decoder is False: 744 | assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" 745 | attention_mask = torch.cat( 746 | [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1 747 | ) 748 | 749 | dummy_token = torch.full( 750 | (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device 751 | ) 752 | input_ids = torch.cat([input_ids, dummy_token], dim=1) 753 | 754 | return {"input_ids": input_ids, "attention_mask": attention_mask} 755 | 756 | 757 | @add_start_docstrings( 758 | """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, 759 | ) 760 | class NeZhaForNextSentencePrediction(NeZhaPreTrainedModel): 761 | def __init__(self, config): 762 | super().__init__(config) 763 | self.bert = NeZhaModel(config) 764 | self.cls = BertOnlyNSPHead(config) 765 | self.init_weights() 766 | 767 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 768 | def forward( 769 | self, 770 | input_ids=None, 771 | attention_mask=None, 772 | token_type_ids=None, 773 | head_mask=None, 774 | position_ids=None, 775 | inputs_embeds=None, 776 | next_sentence_label=None, 777 | ): 778 | r""" 779 | next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 780 | Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) 781 | Indices should be in ``[0, 1]``. 782 | ``0`` indicates sequence B is a continuation of sequence A, 783 | ``1`` indicates sequence B is a random sequence. 784 | 785 | Returns: 786 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 787 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): 788 | Next sequence prediction (classification) loss. 789 | seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): 790 | Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). 791 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 792 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 793 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 794 | 795 | Hidden-states of the model at the output of each layer plus the initial embedding outputs.
796 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 797 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 798 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 799 | 800 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 801 | heads. 802 | 803 | Examples:: 804 | 805 | from transformers import BertTokenizer, BertForNextSentencePrediction 806 | import torch 807 | 808 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 809 | model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') 810 | 811 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 812 | outputs = model(input_ids) 813 | 814 | seq_relationship_scores = outputs[0] 815 | 816 | """ 817 | 818 | outputs = self.bert( 819 | input_ids, 820 | attention_mask=attention_mask, 821 | token_type_ids=token_type_ids, 822 | head_mask=head_mask, 823 | inputs_embeds=inputs_embeds, 824 | ) 825 | 826 | pooled_output = outputs[1] 827 | seq_relationship_score = self.cls(pooled_output) 828 | outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here 829 | if next_sentence_label is not None: 830 | loss_fct = CrossEntropyLoss() 831 | next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) 832 | outputs = (next_sentence_loss,) + outputs 833 | 834 | return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) 835 | 836 | 837 | @add_start_docstrings( 838 | """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of 839 | the pooled output) e.g. for GLUE tasks. """, 840 | BERT_START_DOCSTRING, 841 | ) 842 | class NeZhaForSequenceClassification(NeZhaPreTrainedModel): 843 | def __init__(self, config): 844 | super().__init__(config) 845 | self.num_labels = config.num_labels 846 | self.bert = NeZhaModel(config) 847 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 848 | self.classifier = nn.Linear(config.hidden_size, config.num_labels) 849 | self.init_weights() 850 | 851 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 852 | def forward( 853 | self, 854 | input_ids=None, 855 | attention_mask=None, 856 | token_type_ids=None, 857 | position_ids=None, 858 | head_mask=None, 859 | inputs_embeds=None, 860 | labels=None, 861 | ): 862 | r""" 863 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 864 | Labels for computing the sequence classification/regression loss. 865 | Indices should be in :obj:`[0, ..., config.num_labels - 1]`. 866 | If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), 867 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 868 | 869 | Returns: 870 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 871 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): 872 | Classification (or regression if config.num_labels==1) loss. 873 | logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): 874 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 
875 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 876 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 877 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 878 | 879 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 880 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 881 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 882 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 883 | 884 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 885 | heads. 886 | 887 | Examples:: 888 | 889 | from transformers import BertTokenizer, BertForSequenceClassification 890 | import torch 891 | 892 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 893 | model = BertForSequenceClassification.from_pretrained('bert-base-uncased') 894 | 895 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 896 | labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 897 | outputs = model(input_ids, labels=labels) 898 | 899 | loss, logits = outputs[:2] 900 | 901 | """ 902 | 903 | outputs = self.bert( 904 | input_ids, 905 | attention_mask=attention_mask, 906 | token_type_ids=token_type_ids, 907 | head_mask=head_mask, 908 | inputs_embeds=inputs_embeds, 909 | ) 910 | 911 | pooled_output = outputs[1] 912 | 913 | pooled_output = self.dropout(pooled_output) 914 | logits = self.classifier(pooled_output) 915 | 916 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 917 | 918 | if labels is not None: 919 | if self.num_labels == 1: 920 | # We are doing regression 921 | loss_fct = MSELoss() 922 | loss = loss_fct(logits.view(-1), labels.view(-1)) 923 | else: 924 | loss_fct = CrossEntropyLoss() 925 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 926 | outputs = (loss,) + outputs 927 | 928 | return outputs # (loss), logits, (hidden_states), (attentions) 929 | 930 | 931 | @add_start_docstrings( 932 | """Bert Model with a multiple choice classification head on top (a linear layer on top of 933 | the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, 934 | BERT_START_DOCSTRING, 935 | ) 936 | class NeZhaForMultipleChoice(NeZhaPreTrainedModel): 937 | def __init__(self, config): 938 | super().__init__(config) 939 | self.bert = NeZhaModel(config) 940 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 941 | self.classifier = nn.Linear(config.hidden_size, 1) 942 | self.init_weights() 943 | 944 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 945 | def forward( 946 | self, 947 | input_ids=None, 948 | attention_mask=None, 949 | token_type_ids=None, 950 | head_mask=None, 951 | position_ids=None, 952 | inputs_embeds=None, 953 | labels=None, 954 | ): 955 | r""" 956 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 957 | Labels for computing the multiple choice classification loss. 958 | Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension 959 | of the input tensors. 
(see `input_ids` above) 960 | 961 | Returns: 962 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 963 | loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): 964 | Classification loss. 965 | classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): 966 | `num_choices` is the second dimension of the input tensors. (see `input_ids` above). 967 | 968 | Classification scores (before SoftMax). 969 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 970 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 971 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 972 | 973 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 974 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 975 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 976 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 977 | 978 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 979 | heads. 980 | 981 | Examples:: 982 | 983 | from transformers import BertTokenizer, BertForMultipleChoice 984 | import torch 985 | 986 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 987 | model = BertForMultipleChoice.from_pretrained('bert-base-uncased') 988 | choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] 989 | 990 | input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices 991 | labels = torch.tensor(1).unsqueeze(0) # Batch size 1 992 | outputs = model(input_ids, labels=labels) 993 | 994 | loss, classification_scores = outputs[:2] 995 | 996 | """ 997 | num_choices = input_ids.shape[1] 998 | 999 | input_ids = input_ids.view(-1, input_ids.size(-1)) 1000 | attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None 1001 | token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None 1002 | 1003 | outputs = self.bert( 1004 | input_ids, 1005 | attention_mask=attention_mask, 1006 | token_type_ids=token_type_ids, 1007 | head_mask=head_mask, 1008 | inputs_embeds=inputs_embeds, 1009 | ) 1010 | 1011 | pooled_output = outputs[1] 1012 | 1013 | pooled_output = self.dropout(pooled_output) 1014 | logits = self.classifier(pooled_output) 1015 | reshaped_logits = logits.view(-1, num_choices) 1016 | 1017 | outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here 1018 | 1019 | if labels is not None: 1020 | loss_fct = CrossEntropyLoss() 1021 | loss = loss_fct(reshaped_logits, labels) 1022 | outputs = (loss,) + outputs 1023 | 1024 | return outputs # (loss), reshaped_logits, (hidden_states), (attentions) 1025 | 1026 | 1027 | @add_start_docstrings( 1028 | """Bert Model with a token classification head on top (a linear layer on top of 1029 | the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", 1030 | BERT_START_DOCSTRING, 1031 | ) 1032 | class NeZhaForTokenClassification(NeZhaPreTrainedModel): 1033 | def __init__(self, config): 1034 | super().__init__(config) 1035 | self.num_labels = config.num_labels 1036 | self.bert = NeZhaModel(config) 1037 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 1038 | self.classifier = nn.Linear(config.hidden_size, config.num_labels) 1039 | self.init_weights() 1040 | 1041 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 1042 | def forward( 1043 | self, 1044 | input_ids=None, 1045 | attention_mask=None, 1046 | token_type_ids=None, 1047 | head_mask=None, 1048 | position_ids=None, 1049 | inputs_embeds=None, 1050 | labels=None, 1051 | ): 1052 | r""" 1053 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 1054 | Labels for computing the token classification loss. 1055 | Indices should be in ``[0, ..., config.num_labels - 1]``. 1056 | 1057 | Returns: 1058 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 1059 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : 1060 | Classification loss. 1061 | scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) 1062 | Classification scores (before SoftMax). 1063 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 1064 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 1065 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 1066 | 1067 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 1068 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 1069 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 1070 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 1071 | 1072 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 1073 | heads. 
1074 | 1075 | Examples:: 1076 | 1077 | from transformers import BertTokenizer, BertForTokenClassification 1078 | import torch 1079 | 1080 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 1081 | model = BertForTokenClassification.from_pretrained('bert-base-uncased') 1082 | 1083 | input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 1084 | labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 1085 | outputs = model(input_ids, labels=labels) 1086 | 1087 | loss, scores = outputs[:2] 1088 | 1089 | """ 1090 | 1091 | outputs = self.bert( 1092 | input_ids, 1093 | attention_mask=attention_mask, 1094 | token_type_ids=token_type_ids, 1095 | head_mask=head_mask, 1096 | inputs_embeds=inputs_embeds, 1097 | ) 1098 | 1099 | sequence_output = outputs[0] 1100 | 1101 | sequence_output = self.dropout(sequence_output) 1102 | logits = self.classifier(sequence_output) 1103 | 1104 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 1105 | if labels is not None: 1106 | loss_fct = CrossEntropyLoss() 1107 | # Only keep active parts of the loss 1108 | if attention_mask is not None: 1109 | active_loss = attention_mask.view(-1) == 1 1110 | active_logits = logits.view(-1, self.num_labels) 1111 | active_labels = torch.where( 1112 | active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) 1113 | ) 1114 | loss = loss_fct(active_logits, active_labels) 1115 | else: 1116 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 1117 | outputs = (loss,) + outputs 1118 | 1119 | return outputs # (loss), scores, (hidden_states), (attentions) 1120 | 1121 | 1122 | @add_start_docstrings( 1123 | """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear 1124 | layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, 1125 | BERT_START_DOCSTRING, 1126 | ) 1127 | class NeZhaForQuestionAnswering(NeZhaPreTrainedModel): 1128 | def __init__(self, config): 1129 | super().__init__(config) 1130 | self.num_labels = config.num_labels 1131 | self.bert = NeZhaModel(config) 1132 | self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) 1133 | self.init_weights() 1134 | 1135 | @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 1136 | def forward( 1137 | self, 1138 | input_ids=None, 1139 | attention_mask=None, 1140 | token_type_ids=None, 1141 | head_mask=None, 1142 | inputs_embeds=None, 1143 | position_ids=None, 1144 | start_positions=None, 1145 | end_positions=None, 1146 | ): 1147 | r""" 1148 | start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 1149 | Labels for position (index) of the start of the labelled span for computing the token classification loss. 1150 | Positions are clamped to the length of the sequence (`sequence_length`). 1151 | Position outside of the sequence are not taken into account for computing the loss. 1152 | end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): 1153 | Labels for position (index) of the end of the labelled span for computing the token classification loss. 1154 | Positions are clamped to the length of the sequence (`sequence_length`). 1155 | Position outside of the sequence are not taken into account for computing the loss. 
1156 | 1157 | Returns: 1158 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 1159 | loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): 1160 | Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. 1161 | start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): 1162 | Span-start scores (before SoftMax). 1163 | end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): 1164 | Span-end scores (before SoftMax). 1165 | hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): 1166 | Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) 1167 | of shape :obj:`(batch_size, sequence_length, hidden_size)`. 1168 | 1169 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 1170 | attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): 1171 | Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape 1172 | :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 1173 | 1174 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention 1175 | heads. 1176 | 1177 | Examples:: 1178 | 1179 | from transformers import BertTokenizer, BertForQuestionAnswering 1180 | import torch 1181 | 1182 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 1183 | model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') 1184 | 1185 | question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" 1186 | encoding = tokenizer.encode_plus(question, text) 1187 | input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] 1188 | start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) 1189 | 1190 | all_tokens = tokenizer.convert_ids_to_tokens(input_ids) 1191 | answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) 1192 | 1193 | assert answer == "a nice puppet" 1194 | 1195 | """ 1196 | 1197 | outputs = self.bert( 1198 | input_ids, 1199 | attention_mask=attention_mask, 1200 | token_type_ids=token_type_ids, 1201 | head_mask=head_mask, 1202 | inputs_embeds=inputs_embeds, 1203 | ) 1204 | 1205 | sequence_output = outputs[0] 1206 | 1207 | logits = self.qa_outputs(sequence_output) 1208 | start_logits, end_logits = logits.split(1, dim=-1) 1209 | start_logits = start_logits.squeeze(-1) 1210 | end_logits = end_logits.squeeze(-1) 1211 | 1212 | outputs = (start_logits, end_logits,) + outputs[2:] 1213 | if start_positions is not None and end_positions is not None: 1214 | # If we are on multi-GPU, split add a dimension 1215 | if len(start_positions.size()) > 1: 1216 | start_positions = start_positions.squeeze(-1) 1217 | if len(end_positions.size()) > 1: 1218 | end_positions = end_positions.squeeze(-1) 1219 | # sometimes the start/end positions are outside our model inputs, we ignore these terms 1220 | ignored_index = start_logits.size(1) 1221 | start_positions.clamp_(0, ignored_index) 1222 | end_positions.clamp_(0, ignored_index) 1223 | 1224 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 1225 | start_loss = loss_fct(start_logits, start_positions) 1226 | end_loss = 
loss_fct(end_logits, end_positions) 1227 | total_loss = (start_loss + end_loss) / 2 1228 | outputs = (total_loss,) + outputs 1229 | 1230 | return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) 1231 | --------------------------------------------------------------------------------
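The file above only defines the NeZha model classes. As a quick orientation, here is a minimal usage sketch (an editor's addition, not code from this repository); the checkpoint directory `./nezha_checkpoint`, the `num_labels` value, and the input text are placeholder assumptions, and the import path simply mirrors this repository's package layout.

# Hypothetical usage sketch for NeZhaForSequenceClassification (editor's addition).
import torch
from transformers import BertTokenizer

from data.code.util.modeling.modeling_nezha.configuration import NeZhaConfig
from data.code.util.modeling.modeling_nezha.modeling import NeZhaForSequenceClassification

# NeZhaConfig is assumed to expose the usual `from_pretrained` interface of a PretrainedConfig subclass.
config = NeZhaConfig.from_pretrained('./nezha_checkpoint', num_labels=35)
tokenizer = BertTokenizer.from_pretrained('./nezha_checkpoint')
model = NeZhaForSequenceClassification.from_pretrained('./nezha_checkpoint', config=config)
model.eval()

encoded = tokenizer("placeholder input text", return_tensors="pt")
labels = torch.tensor([0])  # placeholder class id, shape (batch_size,)

with torch.no_grad():
    outputs = model(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        token_type_ids=encoded["token_type_ids"],
        labels=labels,
    )

# With labels provided, the returned tuple is (loss, logits, hidden_states, ...) because the
# encoder hard-codes output_hidden_states=True.
loss, logits = outputs[:2]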