├── 0.prepare_data.py ├── 1.roberta_pretrain.py ├── 2.finetune.py ├── 3.pseudo_train.py ├── 4.predict.py ├── LICENSE ├── README.md ├── components ├── dataset.py ├── model.py ├── optimizer.py ├── predict.py ├── train.py └── util.py └── run_train.sh /0.prepare_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | def main(): 3 | with open('./extra_data/simple_english_wiki.txt') as f: 4 | contents = f.read() 5 | contents = contents.split('\n\n') 6 | for i in range(len(contents)): 7 | contents[i] = '\n'.join(contents[i].split('\n')[1:]) 8 | length = 1000 9 | all_data_split = [] 10 | for txt in contents: 11 | [all_data_split.append(txt[0+i:length+i]) for i in range(0, len(txt), length)] 12 | with open('./extra_data/cbt_valid.txt') as f: 13 | cbt_v = f.read() 14 | with open('./extra_data/cbt_test.txt') as f: 15 | cbt_te = f.read() 16 | with open('./extra_data/cbt_train.txt') as f: 17 | cbt_tr = f.read() 18 | cbt = cbt_v+cbt_te+cbt_tr 19 | cbt = cbt.replace('`',"'") 20 | cbt = cbt.replace("''",'"') 21 | _=[all_data_split.append(cbt[0+i:length+i]) for i in range(0, len(cbt), length)] 22 | df = pd.DataFrame() 23 | df['excerpt'] = all_data_split 24 | df.to_csv('./extra_data/extra_excerpt.csv',index=False) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /1.roberta_pretrain.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from transformers import AutoTokenizer, RobertaForMaskedLM, AutoConfig 3 | from transformers import Trainer, TrainingArguments 4 | from components.util import seed_everything 5 | from components.dataset import MLMDataset 6 | from components.optimizer import get_optimizer_robertaMLM, get_scheduler 7 | import torch 8 | import os 9 | import sys 10 | 11 | def main(): 12 | ### 13 | # MLM pretrain with training data 14 | ### 15 | device = "cuda:0" 16 | model_dir = './pretrained/roberta-large/' 17 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256, add_prefix_space=True) 18 | model = RobertaForMaskedLM.from_pretrained(model_dir, local_files_only=True).to(device) 19 | 20 | df = pd.read_csv('./data/train.csv')[['excerpt']] 21 | texts = df['excerpt'].tolist() 22 | df_val = pd.read_csv('./data/test.csv')[['excerpt']] 23 | test = df_val['excerpt'].tolist() 24 | texts = texts+test 25 | 26 | seed_everything(456982) 27 | 28 | train_dataset = MLMDataset(True,texts,tokenizer) 29 | val_dataset = MLMDataset(True,texts,tokenizer) 30 | 31 | config = { 32 | 'lr_type':'custom', 33 | 'base_lr':9e-5, 34 | 'head_lr':1.2e-4, 35 | 'min_lr':4e-5, 36 | 'low_lr':2e-5, 37 | 'n_epoch':5, 38 | 'bs':16, 39 | 'ga':1, 40 | 'lr_scheduler_mul_factor':2, 41 | 'weight_decay':0.01, 42 | 'warm_up_ratio':0.2, 43 | 'decline_1': 0.2, 44 | 'decline_2': 0.7, 45 | 'decline_3': 0.8, 46 | 'decline_4': 0.9, 47 | 'layerwise_decay_rate': 0.9**0.5, 48 | 'betas': (0.9,0.993), 49 | } 50 | 51 | train_len = len(train_dataset) 52 | total_train_steps = int(train_len * config['n_epoch'] / config['ga'] / config['bs']) 53 | optimizer = get_optimizer_robertaMLM(model,config) 54 | lr_scheduler = get_scheduler(optimizer, total_train_steps, config) 55 | 56 | training_args = TrainingArguments( 57 | output_dir='./', # output directory 58 | num_train_epochs=config['n_epoch'], # total number of training epochs 59 | overwrite_output_dir=True, 60 | per_device_train_batch_size=config['bs'], # batch 
size per device during training 61 | per_device_eval_batch_size=32, # batch size for evaluation 62 | weight_decay=0.01, # strength of weight decay 63 | logging_strategy='no', 64 | gradient_accumulation_steps = config['ga'], 65 | save_strategy = "no", 66 | evaluation_strategy= 'epoch', 67 | prediction_loss_only=True, 68 | learning_rate = config['base_lr'], 69 | ) 70 | 71 | trainer = Trainer( 72 | model=model, # the instantiated 🤗 Transformers model to be trained 73 | args=training_args, # training arguments, defined above 74 | train_dataset=train_dataset, # training dataset 75 | eval_dataset=val_dataset, # evaluation dataset 76 | optimizers = (optimizer, lr_scheduler) 77 | ) 78 | 79 | trainer.train() 80 | if not os.path.isdir('./models'): 81 | os.mkdir('./models') 82 | dict_ = model.state_dict() 83 | for key in list(dict_.keys()): 84 | dict_[key.replace('roberta.', 'base.')] = dict_.pop(key) 85 | torch.save(dict_, f'./models/roberta_large_pretrain.pt') 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /2.finetune.py: -------------------------------------------------------------------------------- 1 | #args: 2 | #1. type of model: 'ro' or 'de' 3 | #2. pretrained path 4 | #3. save path 5 | #4. lr type 'custom' or '3stage' 6 | #5. lr config type 1-3 7 | # 1.training from scratch (use lr type 'custom') 8 | # 2.pseudo pretrain (use lr type 'custom') 9 | # 3.pseudo finetune (use lr type '3stage') 10 | 11 | from components.train import train_ft 12 | from components.util import generate_config 13 | import sys 14 | import numpy as np 15 | 16 | def main(): 17 | ### 18 | # training using provided training data 19 | ### 20 | config = generate_config(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5]) 21 | losses = train_ft(config) 22 | print(np.mean(losses),'\n',losses) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /3.pseudo_train.py: -------------------------------------------------------------------------------- 1 | #args: 2 | #1. type of model: 'ro' or 'de' 3 | #2. label path 4 | #3. save path 5 | #4. mode: 0=mix 1=5fold 6 | 7 | from components.train import train_pseudo, train_pseudo_5fold 8 | from components.util import generate_config 9 | import sys 10 | import numpy as np 11 | 12 | def main(): 13 | ### 14 | # training using extra training data 15 | ### 16 | config = generate_config(sys.argv[1],'None',sys.argv[3],'custom','2') 17 | if sys.argv[4]=='0': 18 | min_valid_loss = train_pseudo(config,sys.argv[2]) 19 | print(min_valid_loss) 20 | else: 21 | min_valid_loss = train_pseudo_5fold(config,sys.argv[2]) 22 | print(min_valid_loss) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /4.predict.py: -------------------------------------------------------------------------------- 1 | #args: 2 | #1. source file 3 | #2. target file 4 | #3. num of models 5 | #4. model dir 6 | #5. mode: 0=label data 1=5fold labels 7 | #... 8 | 9 | import sys 10 | import numpy as np 11 | from components.predict import get_single_model 12 | import pandas as pd 13 | import os 14 | 15 | def main(): 16 | ### 17 | # generate prediction for 1. inference 2. 
5fold labels 18 | ### 19 | source_path = sys.argv[1] 20 | target_path = sys.argv[2] 21 | num_of_models = int(sys.argv[3]) 22 | model_dirs = [sys.argv[i+5] for i in range(num_of_models)] 23 | 24 | data = pd.read_csv(source_path) 25 | 26 | preds = [] 27 | for i in range(num_of_models): 28 | preds.append(get_single_model(model_dirs[i],data)) 29 | 30 | if sys.argv[4] == '1': 31 | #hard coded weight for when one of each of roberta and deberta is used to predict 32 | if num_of_models == 2 and 'roberta' in model_dirs[0] and 'deberta' in model_dirs[1]: 33 | preds_fold0 = [pred[0] for pred in preds] 34 | preds_fold1 = [pred[1] for pred in preds] 35 | preds_fold2 = [pred[2] for pred in preds] 36 | preds_fold3 = [pred[3] for pred in preds] 37 | preds_fold4 = [pred[4] for pred in preds] 38 | preds_0 = preds_fold0[0] * 0.33 + preds_fold0[1] * 0.67 39 | preds_1 = preds_fold1[0] * 0.33 + preds_fold1[1] * 0.67 40 | preds_2 = preds_fold2[0] * 0.33 + preds_fold2[1] * 0.67 41 | preds_3 = preds_fold3[0] * 0.33 + preds_fold3[1] * 0.67 42 | preds_4 = preds_fold4[0] * 0.33 + preds_fold4[1] * 0.67 43 | else: 44 | preds_0 = np.mean(np.concatenate([pred[0] for pred in preds],axis=1),axis=1) 45 | preds_1 = np.mean(np.concatenate([pred[1] for pred in preds],axis=1),axis=1) 46 | preds_2 = np.mean(np.concatenate([pred[2] for pred in preds],axis=1),axis=1) 47 | preds_3 = np.mean(np.concatenate([pred[3] for pred in preds],axis=1),axis=1) 48 | preds_4 = np.mean(np.concatenate([pred[4] for pred in preds],axis=1),axis=1) 49 | labeled_extra0 = data.copy() 50 | labeled_extra1 = data.copy() 51 | labeled_extra2 = data.copy() 52 | labeled_extra3 = data.copy() 53 | labeled_extra4 = data.copy() 54 | labeled_extra0['target'] = preds_0 55 | labeled_extra1['target'] = preds_1 56 | labeled_extra2['target'] = preds_2 57 | labeled_extra3['target'] = preds_3 58 | labeled_extra4['target'] = preds_4 59 | 60 | if not os.path.isdir(target_path): 61 | os.mkdir(target_path) 62 | labeled_extra0.to_csv(target_path + 'labeled_extra_0.csv',index=False) 63 | labeled_extra1.to_csv(target_path + 'labeled_extra_1.csv',index=False) 64 | labeled_extra2.to_csv(target_path + 'labeled_extra_2.csv',index=False) 65 | labeled_extra3.to_csv(target_path + 'labeled_extra_3.csv',index=False) 66 | labeled_extra4.to_csv(target_path + 'labeled_extra_4.csv',index=False) 67 | 68 | else: 69 | preds = [np.expand_dims(np.mean(np.concatenate(pred,axis=1),axis=1),axis=1) for pred in preds] 70 | if num_of_models == 2 and 'roberta' in model_dirs[0] and 'deberta' in model_dirs[1]: 71 | pred = preds[0] * 0.33 + preds[1] * 0.67 72 | else: 73 | cat = np.concatenate(preds,axis=1) 74 | pred = np.mean(cat,axis=1) 75 | data['target'] = pred 76 | data.to_csv(target_path,index=False) 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Danielhuxc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLRP-solution 2 | ### requirements: 3 | numpy==1.20.2 \ 4 | pandas==1.2.4 \ 5 | transformers==4.5.1 \ 6 | torch==1.9.0+cu111 \ 7 | sklearn==0.0 \ 8 | tqdm==4.60.0 9 | 10 | 24GB VRAM 11 | 12 | ### prep: 13 | download pretrained roberta-large and deberta-large from: \ 14 | https://huggingface.co/roberta-large \ 15 | https://huggingface.co/microsoft/deberta-large \ 16 | and save them in \ 17 | ./pretrained/roberta-large \ 18 | ./pretrained/deberta-large 19 |
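one way to fetch the checkpoints is a small helper script (a sketch, not part of this repo; it only assumes the standard `transformers` `from_pretrained`/`save_pretrained` calls and the folder layout above):

```python
# download both checkpoints once and store them where the training scripts
# expect to find them with local_files_only=True
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

# roberta-large: save the MaskedLM variant so 1.roberta_pretrain.py also gets the LM head weights
AutoTokenizer.from_pretrained("roberta-large").save_pretrained("./pretrained/roberta-large")
AutoModelForMaskedLM.from_pretrained("roberta-large").save_pretrained("./pretrained/roberta-large")

# deberta-large: the fine-tuning model (components/model.py) only loads the base encoder
AutoTokenizer.from_pretrained("microsoft/deberta-large").save_pretrained("./pretrained/deberta-large")
AutoModel.from_pretrained("microsoft/deberta-large").save_pretrained("./pretrained/deberta-large")
```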

20 | download \ 21 | Children's Book Test from: \ 22 | https://research.fb.com/downloads/babi/ \ 23 | Simple Wiki Dump from: \ 24 | https://github.com/LGDoor/Dump-of-Simple-English-Wiki \ 25 | and save it as follows \ 26 | ./extra_data/cbt_test.txt \ 27 | ./extra_data/cbt_train.txt \ 28 | ./extra_data/cbt_valid.txt \ 29 | ./extra_data/simple_english_wiki.txt 30 | 31 | CLRP training data goes to \ 32 | ./data/train.csv \ 33 | ./data/test.csv 34 | 35 | ### train from scratch: 36 | ./run_train.sh \ 37 | takes about 30 hours 38 | 39 | ### predict: 40 | python 4.predict.py ./{path_to_source_file}.csv ./{path_to_save}.csv 3 0 ./models/roberta_2/ ./models/deberta_1/ ./models/deberta_2/ \ 41 | make sure the column name is 'excerpt' in source csv file 42 | 43 | ### solution writeup: 44 | https://www.kaggle.com/c/commonlitreadabilityprize/discussion/258095 45 | -------------------------------------------------------------------------------- /components/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | 4 | class MLMDataset(torch.utils.data.Dataset): 5 | def __init__(self, is_train, texts, tokenizer): 6 | self.is_train = is_train 7 | self.tokenizer = tokenizer 8 | if self.is_train: 9 | self.data = texts 10 | else: 11 | self.data = texts 12 | ### only use portion of data 13 | length = int(len(self.data)/1) 14 | self.data = self.data[:length] 15 | ### 16 | 17 | def __getitem__(self, idx): 18 | item = self.tokenizer(self.data[idx], padding='max_length', is_split_into_words = False,truncation=True, return_tensors="pt") 19 | 20 | item['labels'] = item['input_ids'].clone() 21 | 22 | probability_matrix = torch.full(item['labels'].shape, 0.15) 23 | special_tokens_mask = [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in item['labels'].tolist()] 24 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 25 | masked_indices = torch.bernoulli(probability_matrix).bool() 26 | item['labels'][~masked_indices] = -100 27 | 28 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 29 | indices_replaced = torch.bernoulli(torch.full(item['labels'].shape, 0.8)).bool() & masked_indices 30 | item['input_ids'][indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) 31 | 32 | # 10% of the time, we replace masked input tokens with random word 33 | indices_random = torch.bernoulli(torch.full(item['labels'].shape, 0.5)).bool() & masked_indices & ~indices_replaced 34 | random_words = torch.randint(len(self.tokenizer), item['labels'].shape, dtype=torch.long) 35 | item['input_ids'][indices_random] = random_words[indices_random] 36 | 37 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 38 | item['input_ids'] = item['input_ids'][0] 39 | item['attention_mask'] = item['attention_mask'][0] 40 | item['labels'] = item['labels'][0] 41 | return item 42 | 43 | def __len__(self): 44 | return len(self.data) 45 | 46 | class CLRPDataset_finetune(torch.utils.data.Dataset): 47 | def __init__(self, is_train, fold, train_data, tokenizer): 48 | self.is_train = is_train 49 | self.tokenizer = tokenizer 50 | 51 | if is_train: 52 | df = train_data.query(f"kfold != {fold}")[['excerpt','target']] 53 | else: 54 | df = train_data.query(f"kfold == {fold}")[['excerpt','target']] 55 | self.excerpt = df['excerpt'].to_numpy() 56 | self.target = df['target'].to_numpy() 57 | 58 | def __getitem__(self, idx): 
59 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt', 60 | max_length=256, 61 | padding='max_length',truncation=True) 62 | 63 | item = {} 64 | item['input_ids'] = tokenized['input_ids'][0] 65 | item['attention_mask'] = tokenized['attention_mask'][0] 66 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32) 67 | 68 | return item 69 | 70 | def __len__(self): 71 | return len(self.target) 72 | 73 | class CLRPDataset_pred(torch.utils.data.Dataset): 74 | def __init__(self,df,tokenizer): 75 | self.excerpt = df['excerpt'].to_numpy() 76 | self.tokenizer = tokenizer 77 | 78 | def __getitem__(self,idx): 79 | encode = self.tokenizer(self.excerpt[idx],return_tensors='pt', 80 | max_length=256, 81 | padding='max_length',truncation=True) 82 | encoded = {'input_ids':encode['input_ids'][0], 83 | 'attention_mask':encode['attention_mask'][0] 84 | } 85 | 86 | return encoded 87 | 88 | def __len__(self): 89 | return len(self.excerpt) 90 | 91 | class CLRPDataset_pseudo(torch.utils.data.Dataset): 92 | def __init__(self, is_train, label_path, train_data, tokenizer): 93 | self.tokenizer = tokenizer 94 | if is_train: 95 | df1 = pd.read_csv(label_path+"labeled_extra_0.csv") 96 | df2 = pd.read_csv(label_path+"labeled_extra_1.csv") 97 | df3 = pd.read_csv(label_path+"labeled_extra_2.csv") 98 | df4 = pd.read_csv(label_path+"labeled_extra_3.csv") 99 | df5 = pd.read_csv(label_path+"labeled_extra_4.csv") 100 | self.excerpt = df1['excerpt'].to_numpy() 101 | self.target = (df1['target'] + df2['target'] + df3['target'] + df4['target'] + df5['target']).to_numpy()/5 102 | else: 103 | self.excerpt = train_data['excerpt'].to_numpy() 104 | self.target = train_data['target'].to_numpy() 105 | 106 | def __getitem__(self, idx): 107 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt', 108 | max_length=256, 109 | padding='max_length',truncation=True) 110 | 111 | item = {} 112 | item['input_ids'] = tokenized['input_ids'][0] 113 | item['attention_mask'] = tokenized['attention_mask'][0] 114 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32) 115 | 116 | return item 117 | 118 | def __len__(self): 119 | return len(self.target) 120 | 121 | #reads 5 fold labeled data and mix 3x training data in 122 | class CLRPDataset_pseudo_5fold(torch.utils.data.Dataset): 123 | def __init__(self, is_train, fold, train_data, tokenizer, label_path): 124 | self.tokenizer = tokenizer 125 | if is_train: 126 | df = pd.read_csv(label_path+f"labeled_extra_{fold}.csv") 127 | tr = train_data.query(f"kfold != {fold}")[['excerpt','target']] 128 | df = pd.concat([df,tr,tr,tr], ignore_index=True) 129 | df = df.sample(frac=1).reset_index(drop=True) 130 | else: 131 | df = train_data.query(f"kfold == {fold}")[['excerpt','target']] 132 | self.excerpt = df['excerpt'].to_numpy() 133 | self.target = df['target'].to_numpy() 134 | ### 135 | 136 | def __getitem__(self, idx): 137 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt', 138 | max_length=256, 139 | padding='max_length',truncation=True) 140 | 141 | item = {} 142 | item['input_ids'] = tokenized['input_ids'][0] 143 | item['attention_mask'] = tokenized['attention_mask'][0] 144 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32) 145 | 146 | return item 147 | 148 | def __len__(self): 149 | return len(self.target) 150 | -------------------------------------------------------------------------------- /components/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn as nn 3 | from transformers import AutoModel, AutoConfig 4 | from components.util import init_params 5 | 6 | class Custom_bert(nn.Module): 7 | def __init__(self,model_dir): 8 | super().__init__() 9 | 10 | #load base model 11 | config = AutoConfig.from_pretrained(model_dir) 12 | config.update({"output_hidden_states":True, 13 | "hidden_dropout_prob": 0.0, 14 | "layer_norm_eps": 1e-7}) 15 | 16 | self.base = AutoModel.from_pretrained(model_dir, config=config) 17 | 18 | dim = self.base.encoder.layer[0].output.dense.bias.shape[0] 19 | 20 | self.dropout = nn.Dropout(p=0.2) 21 | self.high_dropout = nn.Dropout(p=0.5) 22 | 23 | #weights for weighted layer average 24 | n_weights = 24 25 | weights_init = torch.zeros(n_weights).float() 26 | weights_init.data[:-1] = -3 27 | self.layer_weights = torch.nn.Parameter(weights_init) 28 | 29 | #attention head 30 | self.attention = nn.Sequential( 31 | nn.Linear(1024, 1024), 32 | nn.Tanh(), 33 | nn.Linear(1024, 1), 34 | nn.Softmax(dim=1) 35 | ) 36 | self.cls = nn.Sequential( 37 | nn.Linear(dim,1) 38 | ) 39 | init_params([self.cls,self.attention]) 40 | 41 | def reini_head(self): 42 | init_params([self.cls,self.attention]) 43 | return 44 | 45 | def forward(self, input_ids, attention_mask): 46 | base_output = self.base(input_ids=input_ids, 47 | attention_mask=attention_mask) 48 | 49 | #weighted average of all encoder outputs 50 | cls_outputs = torch.stack( 51 | [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0 52 | ) 53 | cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0) 54 | 55 | #multisample dropout 56 | logits = torch.mean( 57 | torch.stack( 58 | [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)], 59 | dim=0, 60 | ), 61 | dim=0, 62 | ) 63 | return self.cls(logits) 64 | -------------------------------------------------------------------------------- /components/optimizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AdamW 2 | import torch 3 | 4 | def get_optimizer(model,config): 5 | # divide encoder layers into 3 groups and assign different lr 6 | # head lr is set separately 7 | layers = len(model.base.encoder.layer) 8 | no_decay = ["bias", "LayerNorm.weight"] 9 | high_lr_head = ["layer_weights"] 10 | ### not in high_lr_head 11 | params_lst = [{'params':[p for n, p in model.named_parameters() 12 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters()) 13 | and not any(nd in n for nd in no_decay) 14 | and not any(nd in n for nd in high_lr_head)], 15 | 'lr': config['head_lr'], 16 | 'weight_decay': config['weight_decay'] 17 | }] 18 | params_lst.append({'params':[p for n, p in model.named_parameters() 19 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters()) 20 | and any(nd in n for nd in no_decay) 21 | and not any(nd in n for nd in high_lr_head)], 22 | 'lr': config['head_lr'], 23 | 'weight_decay': 0.0 24 | }) 25 | ### 26 | ### in high_lr_head 27 | params_lst.append({'params':[p for n, p in model.named_parameters() 28 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters()) 29 | and not any(nd in n for nd in no_decay) 30 | and any(lw in n for lw in high_lr_head)], 31 | 'lr': config['weight_lr'], 32 | 'weight_decay': config['weight_decay'] 33 | }) 34 | params_lst.append({'params':[p for n, p in model.named_parameters() 35 | if not any(en in n for en,ep in 
model.base.encoder.layer.named_parameters()) 36 | and any(nd in n for nd in no_decay) 37 | and any(lw in n for lw in high_lr_head)], 38 | 'lr': config['weight_lr'], 39 | 'weight_decay': 0.0 40 | }) 41 | ### 42 | parts = 3 43 | for i,j in zip(range(layers-1,-1,-int(layers/parts)),range(0,layers,int(layers/parts))): 44 | for k in range(int(layers/parts)): 45 | param_dict1 = {'params': [p for n, p in model.base.encoder.layer[i-k].named_parameters() 46 | if not any(nd in n for nd in no_decay)], 47 | 'weight_decay': config['weight_decay'], 48 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 49 | } 50 | param_dict2 = {'params': [p for n, p in model.base.encoder.layer[i-k].named_parameters() 51 | if any(nd in n for nd in no_decay)], 52 | 'weight_decay': 0.0, 53 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 54 | } 55 | params_lst.append(param_dict1) 56 | params_lst.append(param_dict2) 57 | 58 | optimizer = AdamW(params_lst, betas = config['betas']) 59 | 60 | return optimizer 61 | 62 | def get_optimizer_robertaMLM(model,config): 63 | layers = len(model.roberta.encoder.layer) 64 | no_decay = ["bias", "LayerNorm.weight"] 65 | high_lr_head = ["layer_weights"] 66 | ### not in high_lr_head 67 | params_lst = [{'params':[p for n, p in model.named_parameters() 68 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 69 | and not any(nd in n for nd in no_decay) 70 | and not any(nd in n for nd in high_lr_head)], 71 | 'lr': config['head_lr'], 72 | 'weight_decay': config['weight_decay'] 73 | }] 74 | params_lst.append({'params':[p for n, p in model.named_parameters() 75 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 76 | and any(nd in n for nd in no_decay) 77 | and not any(nd in n for nd in high_lr_head)], 78 | 'lr': config['head_lr'], 79 | 'weight_decay': 0.0 80 | }) 81 | ### 82 | ### in high_lr_head 83 | params_lst.append({'params':[p for n, p in model.named_parameters() 84 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 85 | and not any(nd in n for nd in no_decay) 86 | and any(lw in n for lw in high_lr_head)], 87 | 'lr': config['base_lr'], 88 | 'weight_decay': config['weight_decay'] 89 | }) 90 | params_lst.append({'params':[p for n, p in model.named_parameters() 91 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 92 | and any(nd in n for nd in no_decay) 93 | and any(lw in n for lw in high_lr_head)], 94 | 'lr': config['base_lr'], 95 | 'weight_decay': 0.0 96 | }) 97 | ### 98 | parts = 3 99 | for i,j in zip(range(layers-1,-1,-int(layers/parts)),range(0,layers,int(layers/parts))): 100 | for k in range(int(layers/parts)): 101 | param_dict1 = {'params': [p for n, p in model.roberta.encoder.layer[i-k].named_parameters() 102 | if not any(nd in n for nd in no_decay)], 103 | 'weight_decay': config['weight_decay'], 104 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 105 | } 106 | param_dict2 = {'params': [p for n, p in model.roberta.encoder.layer[i-k].named_parameters() 107 | if any(nd in n for nd in no_decay)], 108 | 'weight_decay': 0.0, 109 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 110 | } 111 | params_lst.append(param_dict1) 112 | params_lst.append(param_dict2) 113 | 114 | optimizer = AdamW(params_lst, betas = config['betas']) 115 | 116 | return optimizer 117 | 118 | def get_scheduler(optimizer, total_train_steps, config): 119 | #two schedules: 120 | #1. custom is similar to linear decay with warmup 121 | #2. 
3stage is simply halving every 1/3 steps 122 | def lr_lambda_1(step): 123 | total_steps = total_train_steps 124 | w = int(config['warm_up_ratio']*total_steps) 125 | d1 = int(config['decline_1']*total_steps) 126 | d2 = int(config['decline_2']*total_steps) 127 | d3 = int(config['decline_3']*total_steps) 128 | d4 = int(config['decline_4']*total_steps) 129 | min_vs_base_ratio = config['min_lr']/config['base_lr'] 130 | low_vs_base_ratio = config['low_lr']/config['base_lr'] 131 | if step <= w: 132 | return step/w 133 | elif step <= d1: 134 | return 1 135 | elif step <= d3: 136 | return max(min_vs_base_ratio,min_vs_base_ratio+(1-min_vs_base_ratio)*(d2-step)/(d2-d1)) 137 | else: 138 | return max(low_vs_base_ratio,low_vs_base_ratio+(min_vs_base_ratio-low_vs_base_ratio)*(d4-step)/(d4-d3)) 139 | def lr_lambda_2(step): 140 | if step <= total_train_steps * (1/3): 141 | return 1 142 | if step <= total_train_steps * (2/3): 143 | return 0.5 144 | if step <= total_train_steps * (3/3): 145 | return 0.25 146 | if config['lr_type'] == 'custom': 147 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda_1) 148 | elif config['lr_type'] == '3stage': 149 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda_2) 150 | -------------------------------------------------------------------------------- /components/predict.py: -------------------------------------------------------------------------------- 1 | from components.dataset import CLRPDataset_pred 2 | from components.model import Custom_bert 3 | import numpy as np 4 | import torch 5 | import gc 6 | gc.enable() 7 | from tqdm import tqdm 8 | from transformers import AutoTokenizer 9 | device = "cuda:0" 10 | 11 | def run_fold(fold_num,model_path,data): 12 | if 'roberta' in model_path: 13 | model_dir = './pretrained/roberta-large/' 14 | model_name = f"roberta_large_{fold_num}.pt" 15 | elif 'deberta' in model_path: 16 | model_dir = './pretrained/deberta-large/' 17 | model_name = f"deberta_large_{fold_num}.pt" 18 | 19 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 20 | model = Custom_bert(model_dir).to(device) 21 | _ = model.eval() 22 | model.load_state_dict(torch.load(model_path+model_name)) 23 | 24 | test_ds = CLRPDataset_pred(data,tokenizer) 25 | test_dl = torch.utils.data.DataLoader(test_ds, 26 | batch_size = 192, 27 | shuffle=False, 28 | pin_memory=True) 29 | 30 | pred = [] 31 | with torch.no_grad(): 32 | for batch in tqdm(test_dl): 33 | input_ids = batch['input_ids'].to(device) 34 | attention_mask = batch['attention_mask'].to(device) 35 | output = model(input_ids, attention_mask) 36 | pred.extend(output.detach().cpu().numpy()) 37 | 38 | del model, test_dl, test_ds 39 | gc.collect() 40 | torch.cuda.empty_cache() 41 | 42 | return np.array(pred) 43 | 44 | def get_single_model(pth,data): 45 | pred0 = run_fold(0,pth,data) 46 | pred1 = run_fold(1,pth,data) 47 | pred2 = run_fold(2,pth,data) 48 | pred3 = run_fold(3,pth,data) 49 | pred4 = run_fold(4,pth,data) 50 | 51 | return [pred0,pred1,pred2,pred3,pred4] 52 | -------------------------------------------------------------------------------- /components/train.py: -------------------------------------------------------------------------------- 1 | from components.dataset import CLRPDataset_finetune, CLRPDataset_pseudo, CLRPDataset_pseudo_5fold 2 | from components.util import seed_everything, create_folds, generate_config 3 | from components.model import Custom_bert 4 | from components.optimizer import get_optimizer, get_scheduler 5 | import pandas as 
pd 6 | import torch 7 | import torch.nn as nn 8 | from transformers import AutoTokenizer 9 | from tqdm import tqdm 10 | import numpy as np 11 | import os 12 | import gc 13 | gc.enable() 14 | 15 | def run_fold_ft(fold,config,train_data,tokenizer,t_bar): 16 | device = "cuda:0" 17 | #prep train/val datasets 18 | train_dataset = CLRPDataset_finetune(True, fold,train_data,tokenizer) 19 | val_dataset = CLRPDataset_finetune(False, fold,train_data,tokenizer) 20 | 21 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True) 22 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) 23 | 24 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps']) 25 | val_step = 1 26 | min_valid_loss = np.inf 27 | 28 | #load model 29 | model = Custom_bert(config['model_dir']).to(device) 30 | _ = model.eval() 31 | 32 | model.load_state_dict(torch.load(config['pretrained_path']), strict=False) 33 | 34 | #get optimizer and scheduler 35 | optimizer = get_optimizer(model,config) 36 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config) 37 | 38 | step = 0 39 | min_step = 0 40 | last_save_step = 0 41 | last_save_index = 0 42 | 43 | #seed_everything(seed=config['seed_'] + fold) 44 | 45 | optimizer.zero_grad() 46 | for epoch in range(config['num_epoch']): 47 | model.train() 48 | count = 0 49 | total_loss = 0 50 | for batch in train_loader: 51 | input_ids = batch['input_ids'].to(device) 52 | attention_mask = batch['attention_mask'].to(device) 53 | target = batch['target'].to(device) 54 | 55 | outputs = model(input_ids, attention_mask) 56 | 57 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target) 58 | 59 | loss = cls_loss / config['accumulation_steps'] 60 | 61 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps'] 62 | 63 | loss.backward() 64 | 65 | if (count+1) % config['accumulation_steps'] == 0: 66 | optimizer.step() 67 | lr_scheduler.step() 68 | optimizer.zero_grad() 69 | count = 0 70 | total_loss = 0 71 | else: 72 | count+=1 73 | 74 | #only save in radius of certain step 75 | if step >= (config['save_center']-config['save_radius']) and step <= (config['save_center']+config['save_radius']): 76 | val_step = 1 77 | do_val = True 78 | if config['only_val_in_radius']: 79 | if step < (config['save_center']-config['save_radius']) or step > (config['save_center']+config['save_radius']): 80 | do_val = False 81 | 82 | if ((step+1) % val_step == 0 and count == 0) and do_val: 83 | model.eval() 84 | l_val = nn.MSELoss(reduction='sum') 85 | with torch.no_grad(): 86 | total_loss_val = 0 87 | for batch in val_loader: 88 | input_ids = batch['input_ids'].to(device) 89 | attention_mask = batch['attention_mask'].to(device) 90 | outputs = model(input_ids, attention_mask) 91 | 92 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device)) 93 | 94 | val_loss = cls_loss_val 95 | 96 | total_loss_val+=val_loss.item() 97 | total_loss_val/=len(val_dataset) 98 | total_loss_val = total_loss_val**0.5 99 | 100 | if min_valid_loss > total_loss_val and step >= (config['save_center']-config['save_radius']) and step <= (config['save_center']+config['save_radius']): 101 | #saves model with lower loss 102 | min_step = step 103 | min_valid_loss = total_loss_val 104 | #print("min loss updated to ",min_valid_loss," at step ",min_step) 105 | if not os.path.isdir('./models'): 106 | os.mkdir('./models') 107 | if not 
os.path.isdir(config['save_path']): 108 | os.mkdir(config['save_path']) 109 | if 'roberta' in config['model_dir']: 110 | torch.save(model.state_dict(), config['save_path']+f'roberta_large_{fold}.pt') 111 | else: 112 | torch.save(model.state_dict(), config['save_path']+f'deberta_large_{fold}.pt') 113 | model.train() 114 | step+=1 115 | t_bar.update(1) 116 | del model,train_dataset,train_loader,val_dataset,val_loader 117 | gc.collect() 118 | torch.cuda.empty_cache() 119 | return min_valid_loss, min_step 120 | 121 | def train_ft(config): 122 | seed_everything(config['seed_']) 123 | 124 | train_data = pd.read_csv("./data/train.csv") 125 | train_data = create_folds(train_data, num_splits=5) 126 | model_dir = config['model_dir'] 127 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 128 | 129 | t_bar = tqdm(total=((2834*0.8//config['batch_size'])+1)*config['num_epoch']*config['n_folds']) 130 | train_losses = [] 131 | for i in range(config['n_folds']): 132 | loss, m_step = run_fold_ft(i,config,train_data,tokenizer,t_bar) 133 | train_losses.append(loss) 134 | return train_losses 135 | 136 | def train_pseudo(config, label_path): 137 | device = "cuda:0" 138 | seed_everything(config['seed_']) 139 | train_data = pd.read_csv("./data/train.csv") 140 | train_data = create_folds(train_data, num_splits=5) 141 | 142 | model_dir = config['model_dir'] 143 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 144 | 145 | train_dataset = CLRPDataset_pseudo(True,label_path,train_data,tokenizer) 146 | t_bar = tqdm(total=((len(train_dataset)//config['batch_size'])+1)*config['num_epoch']) 147 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True) 148 | 149 | val_dataset = CLRPDataset_pseudo(False,label_path,train_data,tokenizer) 150 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) 151 | 152 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps']) 153 | val_step = 100*config['accumulation_steps'] 154 | min_valid_loss = np.inf 155 | 156 | model = Custom_bert(config['model_dir']).to(device) 157 | _ = model.eval() 158 | 159 | if config['pretrained_path'] not in [None,'None']: 160 | print(model.load_state_dict(torch.load(config['pretrained_path']), strict=False)) 161 | 162 | optimizer = get_optimizer(model,config) 163 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config) 164 | 165 | step = 0 166 | min_step = 0 167 | last_save_step = 0 168 | last_save_index = 0 169 | 170 | optimizer.zero_grad() 171 | for epoch in range(config['num_epoch']): 172 | model.train() 173 | count = 0 174 | total_loss = 0 175 | for batch in train_loader: 176 | input_ids = batch['input_ids'].to(device) 177 | attention_mask = batch['attention_mask'].to(device) 178 | target = batch['target'].to(device) 179 | outputs = model(input_ids, attention_mask) 180 | 181 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target) 182 | 183 | loss = cls_loss / config['accumulation_steps'] 184 | 185 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps'] 186 | loss.backward() 187 | 188 | if (count+1) % config['accumulation_steps'] == 0: 189 | optimizer.step() 190 | lr_scheduler.step() 191 | optimizer.zero_grad() 192 | count = 0 193 | total_loss = 0 194 | else: 195 | count+=1 196 | 197 | if ((step+1) % val_step == 0): 198 | l_val = 
nn.MSELoss(reduction='sum') 199 | with torch.no_grad(): 200 | model.eval() 201 | total_loss_val = 0 202 | for batch in val_loader: 203 | input_ids = batch['input_ids'].to(device) 204 | attention_mask = batch['attention_mask'].to(device) 205 | outputs = model(input_ids, attention_mask) 206 | 207 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device)) 208 | 209 | val_loss = cls_loss_val 210 | 211 | total_loss_val+=val_loss.item() 212 | total_loss_val/=len(val_dataset) 213 | total_loss_val = total_loss_val**0.5 214 | 215 | if min_valid_loss > total_loss_val: 216 | min_step = step 217 | min_valid_loss = total_loss_val 218 | #print("min loss updated to ",min_valid_loss," at step ",min_step) 219 | # Saving State Dict 220 | if not os.path.isdir(config['save_path']): 221 | os.mkdir(config['save_path']) 222 | torch.save(model.state_dict(), config['save_path'] + config['pseudo_save_name']) 223 | model.train() 224 | step+=1 225 | t_bar.update(1) 226 | del model,train_dataset,train_loader 227 | gc.collect() 228 | torch.cuda.empty_cache() 229 | 230 | return min_valid_loss 231 | 232 | def train_pseudo_5fold(config, label_path): 233 | device = "cuda:0" 234 | seed_everything(config['seed_']) 235 | 236 | train_data = pd.read_csv("./data/train.csv") 237 | train_data = create_folds(train_data, num_splits=5) 238 | model_dir = config['model_dir'] 239 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 240 | 241 | min_val_losses = [] 242 | for fold in range(config['n_folds']): 243 | train_dataset = CLRPDataset_pseudo_5fold(True,fold,train_data,tokenizer,label_path) 244 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True) 245 | 246 | val_dataset = CLRPDataset_pseudo_5fold(False,fold,train_data,tokenizer,label_path) 247 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) 248 | 249 | if fold == 0: 250 | t_bar = tqdm(total=((len(train_dataset)*5//config['batch_size'])+1)*config['num_epoch']) 251 | 252 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps']) 253 | val_step = 100*config['accumulation_steps'] 254 | min_valid_loss = np.inf 255 | 256 | model = Custom_bert(config['model_dir']).to(device) 257 | _ = model.eval() 258 | 259 | if config['pretrained_path'] not in [None,'None']: 260 | model.load_state_dict(torch.load(config['pretrained_path']), strict=False) 261 | 262 | optimizer = get_optimizer(model,config) 263 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config) 264 | 265 | step = 0 266 | min_step = 0 267 | last_save_step = 0 268 | last_save_index = 0 269 | 270 | optimizer.zero_grad() 271 | for epoch in range(config['num_epoch']): 272 | model.train() 273 | count = 0 274 | total_loss = 0 275 | for batch in train_loader: 276 | input_ids = batch['input_ids'].to(device) 277 | attention_mask = batch['attention_mask'].to(device) 278 | target = batch['target'].to(device) 279 | 280 | outputs = model(input_ids, attention_mask) 281 | 282 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target) 283 | 284 | loss = cls_loss / config['accumulation_steps'] 285 | 286 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps'] 287 | loss.backward() 288 | 289 | if (count+1) % config['accumulation_steps'] == 0: 290 | optimizer.step() 291 | lr_scheduler.step() 292 | optimizer.zero_grad() 293 | count = 0 294 | total_loss = 0 295 | else: 
296 | count+=1 297 | 298 | if ((step+1) % val_step == 0): 299 | model.eval() 300 | l_val = nn.MSELoss(reduction='sum') 301 | with torch.no_grad(): 302 | total_loss_val = 0 303 | for batch in val_loader: 304 | input_ids = batch['input_ids'].to(device) 305 | attention_mask = batch['attention_mask'].to(device) 306 | outputs = model(input_ids, attention_mask) 307 | 308 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device)) 309 | 310 | val_loss = cls_loss_val 311 | 312 | total_loss_val+=val_loss.item() 313 | total_loss_val/=len(val_dataset) 314 | total_loss_val = total_loss_val**0.5 315 | 316 | if min_valid_loss > total_loss_val and epoch > 0: 317 | min_step = step 318 | min_valid_loss = total_loss_val 319 | if not os.path.isdir('./models'): 320 | os.mkdir('./models') 321 | if not os.path.isdir(config['save_path']): 322 | os.mkdir(config['save_path']) 323 | if 'roberta' in config['model_dir']: 324 | torch.save(model.state_dict(), config['save_path']+f'roberta_large_{fold}.pt') 325 | else: 326 | torch.save(model.state_dict(), config['save_path']+f'deberta_large_{fold}.pt') 327 | model.train() 328 | step+=1 329 | t_bar.update(1) 330 | del model,train_dataset,train_loader,val_dataset,val_loader 331 | gc.collect() 332 | torch.cuda.empty_cache() 333 | min_val_losses.append(min_valid_loss) 334 | return min_val_losses 335 | -------------------------------------------------------------------------------- /components/util.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import os 4 | import torch 5 | import pandas as pd 6 | from sklearn.model_selection import KFold,StratifiedKFold 7 | 8 | def seed_everything(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["PYTHONHASHSEED"] = str(seed) 12 | 13 | torch.manual_seed(seed) 14 | torch.cuda.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | 17 | torch.backends.cudnn.deterministic = True 18 | 19 | def create_folds(data, num_splits): 20 | # we create a new column called kfold and fill it with -1 21 | data["kfold"] = -1 22 | 23 | # the next step is to randomize the rows of the data 24 | data = data.sample(frac=1).reset_index(drop=True) 25 | 26 | # calculate number of bins by Sturge's rule 27 | # I take the floor of the value, you can also 28 | # just round it 29 | num_bins = int(np.floor(1 + np.log2(len(data)))) 30 | 31 | # bin targets 32 | data.loc[:, "bins"] = pd.cut( 33 | data["target"], bins=num_bins, labels=False 34 | ) 35 | 36 | # initiate the kfold class from model_selection module 37 | kf = StratifiedKFold(n_splits=num_splits) 38 | 39 | # fill the new kfold column 40 | # note that, instead of targets, we use bins! 
41 | for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)): 42 | data.loc[v_, 'kfold'] = f 43 | 44 | # drop the bins column 45 | data = data.drop("bins", axis=1) 46 | 47 | # return dataframe with folds 48 | return data 49 | 50 | def init_params(module_lst): 51 | for module in module_lst: 52 | for param in module.parameters(): 53 | if param.dim() > 1: 54 | torch.nn.init.xavier_uniform_(param) 55 | return 56 | 57 | def generate_config(model_type,pretrained_path,save_path,lr_type,lr_setting): 58 | config = {'model_dir': './pretrained/roberta-large/', 59 | 'n_folds': 5, 60 | 'num_epoch': 3, 61 | 'weight_decay': 0.01, 62 | 'head_lr': 1e-4, 63 | 'weight_lr': 5e-2, 64 | 'base_lr': 7e-5, 65 | 'min_lr': 2e-5, 66 | 'low_lr': 1e-5, 67 | 'warm_up_ratio': 0.06, 68 | 'decline_1': 0.15, 69 | 'decline_2': 0.6, 70 | 'decline_3': 0.7, 71 | 'decline_4': 0.75, 72 | 'layerwise_decay_rate': 0.875**0.5, 73 | 'seed_': 88888888, 74 | 'reini_head':False, 75 | 'only_val_in_radius': True, 76 | 'save_center': 330, 77 | 'save_radius': 5, 78 | 'betas': (0.9, 0.999), 79 | } 80 | config['pretrained_path'] = pretrained_path 81 | config['save_path'] = save_path 82 | config['lr_type'] = lr_type 83 | if model_type == 'ro': 84 | config['model_dir'] = './pretrained/roberta-large/' 85 | config['batch_size'] = 16 86 | config['accumulation_steps'] = 1 87 | config['pseudo_save_name'] = 'roberta_large_single.pt' 88 | elif model_type == 'de': 89 | config['model_dir'] = './pretrained/deberta-large/' 90 | config['batch_size'] = 8 91 | config['accumulation_steps'] = 2 92 | config['save_center'] = 660 93 | config['save_radius'] = 10 94 | config['pseudo_save_name'] = 'deberta_large_single.pt' 95 | 96 | if lr_setting == '2': 97 | config['num_epoch'] = 2 98 | config['head_lr']= 1e-5 99 | config['weight_lr']= 5e-3 100 | config['base_lr']= 7e-6 101 | config['min_lr']= 2e-6 102 | config['low_lr']= 1e-6 103 | elif lr_setting == '3': 104 | config['head_lr']= 5e-5 105 | config['weight_lr']= 2e-3 106 | config['base_lr']= 3e-5 107 | config['min_lr']= 1e-5 108 | config['low_lr']= 5e-6 109 | 110 | return config 111 | -------------------------------------------------------------------------------- /run_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3.9 0.prepare_data.py 3 | python3.9 1.roberta_pretrain.py 4 | python3.9 2.finetune.py ro ./models/roberta_large_pretrain.pt ./models/roberta_1/ custom 1 5 | 6 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_1/ 1 1 ./models/roberta_1/ 7 | python3.9 3.pseudo_train.py de ./extra_data/pseudo_1/ ./models/deberta_1/ 1 8 | 9 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_2/ 2 1 ./models/roberta_1/ ./models/deberta_1/ 10 | python3.9 3.pseudo_train.py de ./extra_data/pseudo_2/ ./models/deberta_2/ 0 11 | python3.9 2.finetune.py de ./models/deberta_2/deberta_large_single.pt ./models/deberta_2/ 3stage 3 12 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_3/ 3 1 ./models/roberta_1/ ./models/deberta_1/ ./models/deberta_2/ 13 | python3.9 3.pseudo_train.py ro ./extra_data/pseudo_3/ ./models/roberta_2/ 0 14 | python3.9 2.finetune.py ro ./models/roberta_2/roberta_large_single.pt ./models/roberta_2/ 3stage 3 15 | --------------------------------------------------------------------------------