├── 0.prepare_data.py ├── 1.roberta_pretrain.py ├── 2.finetune.py ├── 3.pseudo_train.py ├── 4.predict.py ├── LICENSE ├── README.md ├── components ├── dataset.py ├── model.py ├── optimizer.py ├── predict.py ├── train.py └── util.py └── run_train.sh /0.prepare_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | def main(): 3 | with open('./extra_data/simple_english_wiki.txt') as f: 4 | contents = f.read() 5 | contents = contents.split('\n\n') 6 | for i in range(len(contents)): 7 | contents[i] = '\n'.join(contents[i].split('\n')[1:]) 8 | length = 1000 9 | all_data_split = [] 10 | for txt in contents: 11 | [all_data_split.append(txt[0+i:length+i]) for i in range(0, len(txt), length)] 12 | with open('./extra_data/cbt_valid.txt') as f: 13 | cbt_v = f.read() 14 | with open('./extra_data/cbt_test.txt') as f: 15 | cbt_te = f.read() 16 | with open('./extra_data/cbt_train.txt') as f: 17 | cbt_tr = f.read() 18 | cbt = cbt_v+cbt_te+cbt_tr 19 | cbt = cbt.replace('`',"'") 20 | cbt = cbt.replace("''",'"') 21 | _=[all_data_split.append(cbt[0+i:length+i]) for i in range(0, len(cbt), length)] 22 | df = pd.DataFrame() 23 | df['excerpt'] = all_data_split 24 | df.to_csv('./extra_data/extra_excerpt.csv',index=False) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /1.roberta_pretrain.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from transformers import AutoTokenizer, RobertaForMaskedLM, AutoConfig 3 | from transformers import Trainer, TrainingArguments 4 | from components.util import seed_everything 5 | from components.dataset import MLMDataset 6 | from components.optimizer import get_optimizer_robertaMLM, get_scheduler 7 | import torch 8 | import os 9 | import sys 10 | 11 | def main(): 12 | ### 13 | # MLM pretrain with training data 14 | ### 15 | device = "cuda:0" 16 | model_dir = './pretrained/roberta-large/' 17 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256, add_prefix_space=True) 18 | model = RobertaForMaskedLM.from_pretrained(model_dir, local_files_only=True).to(device) 19 | 20 | df = pd.read_csv('./data/train.csv')[['excerpt']] 21 | texts = df['excerpt'].tolist() 22 | df_val = pd.read_csv('./data/test.csv')[['excerpt']] 23 | test = df_val['excerpt'].tolist() 24 | texts = texts+test 25 | 26 | seed_everything(456982) 27 | 28 | train_dataset = MLMDataset(True,texts,tokenizer) 29 | val_dataset = MLMDataset(True,texts,tokenizer) 30 | 31 | config = { 32 | 'lr_type':'custom', 33 | 'base_lr':9e-5, 34 | 'head_lr':1.2e-4, 35 | 'min_lr':4e-5, 36 | 'low_lr':2e-5, 37 | 'n_epoch':5, 38 | 'bs':16, 39 | 'ga':1, 40 | 'lr_scheduler_mul_factor':2, 41 | 'weight_decay':0.01, 42 | 'warm_up_ratio':0.2, 43 | 'decline_1': 0.2, 44 | 'decline_2': 0.7, 45 | 'decline_3': 0.8, 46 | 'decline_4': 0.9, 47 | 'layerwise_decay_rate': 0.9**0.5, 48 | 'betas': (0.9,0.993), 49 | } 50 | 51 | train_len = len(train_dataset) 52 | total_train_steps = int(train_len * config['n_epoch'] / config['ga'] / config['bs']) 53 | optimizer = get_optimizer_robertaMLM(model,config) 54 | lr_scheduler = get_scheduler(optimizer, total_train_steps, config) 55 | 56 | training_args = TrainingArguments( 57 | output_dir='./', # output directory 58 | num_train_epochs=config['n_epoch'], # total number of training epochs 59 | overwrite_output_dir=True, 60 | per_device_train_batch_size=config['bs'], # batch 
size per device during training 61 | per_device_eval_batch_size=32, # batch size for evaluation 62 | weight_decay=0.01, # strength of weight decay 63 | logging_strategy='no', 64 | gradient_accumulation_steps = config['ga'], 65 | save_strategy = "no", 66 | evaluation_strategy= 'epoch', 67 | prediction_loss_only=True, 68 | learning_rate = config['base_lr'], 69 | ) 70 | 71 | trainer = Trainer( 72 | model=model, # the instantiated 🤗 Transformers model to be trained 73 | args=training_args, # training arguments, defined above 74 | train_dataset=train_dataset, # training dataset 75 | eval_dataset=val_dataset, # evaluation dataset 76 | optimizers = (optimizer, lr_scheduler) 77 | ) 78 | 79 | trainer.train() 80 | if not os.path.isdir('./models'): 81 | os.mkdir('./models') 82 | dict_ = model.state_dict() 83 | for key in list(dict_.keys()): 84 | dict_[key.replace('roberta.', 'base.')] = dict_.pop(key) 85 | torch.save(dict_, f'./models/roberta_large_pretrain.pt') 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /2.finetune.py: -------------------------------------------------------------------------------- 1 | #args: 2 | #1. type of model: 'ro' or 'de' 3 | #2. pretrained path 4 | #3. save path 5 | #4. lr type 'custom' or '3stage' 6 | #5. lr config type 1-3 7 | # 1.training from scratch (use lr type 'custom') 8 | # 2.pseudo pretrain (use lr type 'custom') 9 | # 3.pseudo finetune (use lr type '3stage') 10 | 11 | from components.train import train_ft 12 | from components.util import generate_config 13 | import sys 14 | import numpy as np 15 | 16 | def main(): 17 | ### 18 | # training using provided training data 19 | ### 20 | config = generate_config(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5]) 21 | losses = train_ft(config) 22 | print(np.mean(losses),'\n',losses) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /3.pseudo_train.py: -------------------------------------------------------------------------------- 1 | #args: 2 | #1. type of model: 'ro' or 'de' 3 | #2. label path 4 | #3. save path 5 | #4. mode: 0=mix 1=5fold 6 | 7 | from components.train import train_pseudo, train_pseudo_5fold 8 | from components.util import generate_config 9 | import sys 10 | import numpy as np 11 | 12 | def main(): 13 | ### 14 | # training using extra training data 15 | ### 16 | config = generate_config(sys.argv[1],'None',sys.argv[3],'custom','2') 17 | if sys.argv[4]=='0': 18 | min_valid_loss = train_pseudo(config,sys.argv[2]) 19 | print(min_valid_loss) 20 | else: 21 | min_valid_loss = train_pseudo_5fold(config,sys.argv[2]) 22 | print(min_valid_loss) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /4.predict.py: -------------------------------------------------------------------------------- 1 | #args: 2 | #1. source file 3 | #2. target file 4 | #3. num of models 5 | #4. model dir 6 | #5. mode: 0=label data 1=5fold labels 7 | #... 8 | 9 | import sys 10 | import numpy as np 11 | from components.predict import get_single_model 12 | import pandas as pd 13 | import os 14 | 15 | def main(): 16 | ### 17 | # generate prediction for 1. inference 2. 
5fold labels 18 | ### 19 | source_path = sys.argv[1] 20 | target_path = sys.argv[2] 21 | num_of_models = int(sys.argv[3]) 22 | model_dirs = [sys.argv[i+5] for i in range(num_of_models)] 23 | 24 | data = pd.read_csv(source_path) 25 | 26 | preds = [] 27 | for i in range(num_of_models): 28 | preds.append(get_single_model(model_dirs[i],data)) 29 | 30 | if sys.argv[4] == '1': 31 | #hard coded weight for when one of each of roberta and deberta is used to predict 32 | if num_of_models == 2 and 'roberta' in model_dirs[0] and 'deberta' in model_dirs[1]: 33 | preds_fold0 = [pred[0] for pred in preds] 34 | preds_fold1 = [pred[1] for pred in preds] 35 | preds_fold2 = [pred[2] for pred in preds] 36 | preds_fold3 = [pred[3] for pred in preds] 37 | preds_fold4 = [pred[4] for pred in preds] 38 | preds_0 = preds_fold0[0] * 0.33 + preds_fold0[1] * 0.67 39 | preds_1 = preds_fold1[0] * 0.33 + preds_fold1[1] * 0.67 40 | preds_2 = preds_fold2[0] * 0.33 + preds_fold2[1] * 0.67 41 | preds_3 = preds_fold3[0] * 0.33 + preds_fold3[1] * 0.67 42 | preds_4 = preds_fold4[0] * 0.33 + preds_fold4[1] * 0.67 43 | else: 44 | preds_0 = np.mean(np.concatenate([pred[0] for pred in preds],axis=1),axis=1) 45 | preds_1 = np.mean(np.concatenate([pred[1] for pred in preds],axis=1),axis=1) 46 | preds_2 = np.mean(np.concatenate([pred[2] for pred in preds],axis=1),axis=1) 47 | preds_3 = np.mean(np.concatenate([pred[3] for pred in preds],axis=1),axis=1) 48 | preds_4 = np.mean(np.concatenate([pred[4] for pred in preds],axis=1),axis=1) 49 | labeled_extra0 = data.copy() 50 | labeled_extra1 = data.copy() 51 | labeled_extra2 = data.copy() 52 | labeled_extra3 = data.copy() 53 | labeled_extra4 = data.copy() 54 | labeled_extra0['target'] = preds_0 55 | labeled_extra1['target'] = preds_1 56 | labeled_extra2['target'] = preds_2 57 | labeled_extra3['target'] = preds_3 58 | labeled_extra4['target'] = preds_4 59 | 60 | if not os.path.isdir(target_path): 61 | os.mkdir(target_path) 62 | labeled_extra0.to_csv(target_path + 'labeled_extra_0.csv',index=False) 63 | labeled_extra1.to_csv(target_path + 'labeled_extra_1.csv',index=False) 64 | labeled_extra2.to_csv(target_path + 'labeled_extra_2.csv',index=False) 65 | labeled_extra3.to_csv(target_path + 'labeled_extra_3.csv',index=False) 66 | labeled_extra4.to_csv(target_path + 'labeled_extra_4.csv',index=False) 67 | 68 | else: 69 | preds = [np.expand_dims(np.mean(np.concatenate(pred,axis=1),axis=1),axis=1) for pred in preds] 70 | if num_of_models == 2 and 'roberta' in model_dirs[0] and 'deberta' in model_dirs[1]: 71 | pred = preds[0] * 0.33 + preds[1] * 0.67 72 | else: 73 | cat = np.concatenate(preds,axis=1) 74 | pred = np.mean(cat,axis=1) 75 | data['target'] = pred 76 | data.to_csv(target_path,index=False) 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Danielhuxc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLRP-solution 2 | ### requirements: 3 | numpy==1.20.2 \ 4 | pandas==1.2.4 \ 5 | transformers==4.5.1 \ 6 | torch==1.9.0+cu111 \ 7 | sklearn==0.0 \ 8 | tqdm==4.60.0 9 | 10 | 24GB VRAM 11 | 12 | ### prep: 13 | download pretrained roberta-large and deberta-large from: \ 14 | https://huggingface.co/roberta-large \ 15 | https://huggingface.co/microsoft/deberta-large \ 16 | and save them in \ 17 | ./pretrained/roberta-large \ 18 | ./pretrained/deberta-large 19 |
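one way to fetch the checkpoints is a small helper script (a sketch, not part of this repo; it only assumes the standard `transformers` `from_pretrained`/`save_pretrained` calls and the folder layout above):

```python
# download both checkpoints once and store them where the training scripts
# expect to find them with local_files_only=True
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

# roberta-large: save the MaskedLM variant so 1.roberta_pretrain.py also gets the LM head weights
AutoTokenizer.from_pretrained("roberta-large").save_pretrained("./pretrained/roberta-large")
AutoModelForMaskedLM.from_pretrained("roberta-large").save_pretrained("./pretrained/roberta-large")

# deberta-large: the fine-tuning model (components/model.py) only loads the base encoder
AutoTokenizer.from_pretrained("microsoft/deberta-large").save_pretrained("./pretrained/deberta-large")
AutoModel.from_pretrained("microsoft/deberta-large").save_pretrained("./pretrained/deberta-large")
```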

20 | download \ 21 | Children's Book Test from: \ 22 | https://research.fb.com/downloads/babi/ \ 23 | Simple Wiki Dump from: \ 24 | https://github.com/LGDoor/Dump-of-Simple-English-Wiki \ 25 | and save it as follows \ 26 | ./extra_data/cbt_test.txt \ 27 | ./extra_data/cbt_train.txt \ 28 | ./extra_data/cbt_valid.txt \ 29 | ./extra_data/simple_english_wiki.txt 30 | 31 | CLRP training data goes to \ 32 | ./data/train.csv \ 33 | ./data/test.csv 34 | 35 | ### train from scratch: 36 | ./run_train.sh \ 37 | takes about 30 hours 38 | 39 | ### predict: 40 | python 4.predict.py ./{path_to_source_file}.csv ./{path_to_save}.csv 3 0 ./models/roberta_2/ ./models/deberta_1/ ./models/deberta_2/ \ 41 | make sure the column name is 'excerpt' in source csv file 42 | 43 | ### solution writeup: 44 | https://www.kaggle.com/c/commonlitreadabilityprize/discussion/258095 45 | -------------------------------------------------------------------------------- /components/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | 4 | class MLMDataset(torch.utils.data.Dataset): 5 | def __init__(self, is_train, texts, tokenizer): 6 | self.is_train = is_train 7 | self.tokenizer = tokenizer 8 | if self.is_train: 9 | self.data = texts 10 | else: 11 | self.data = texts 12 | ### only use portion of data 13 | length = int(len(self.data)/1) 14 | self.data = self.data[:length] 15 | ### 16 | 17 | def __getitem__(self, idx): 18 | item = self.tokenizer(self.data[idx], padding='max_length', is_split_into_words = False,truncation=True, return_tensors="pt") 19 | 20 | item['labels'] = item['input_ids'].clone() 21 | 22 | probability_matrix = torch.full(item['labels'].shape, 0.15) 23 | special_tokens_mask = [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in item['labels'].tolist()] 24 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) 25 | masked_indices = torch.bernoulli(probability_matrix).bool() 26 | item['labels'][~masked_indices] = -100 27 | 28 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 29 | indices_replaced = torch.bernoulli(torch.full(item['labels'].shape, 0.8)).bool() & masked_indices 30 | item['input_ids'][indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) 31 | 32 | # 10% of the time, we replace masked input tokens with random word 33 | indices_random = torch.bernoulli(torch.full(item['labels'].shape, 0.5)).bool() & masked_indices & ~indices_replaced 34 | random_words = torch.randint(len(self.tokenizer), item['labels'].shape, dtype=torch.long) 35 | item['input_ids'][indices_random] = random_words[indices_random] 36 | 37 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 38 | item['input_ids'] = item['input_ids'][0] 39 | item['attention_mask'] = item['attention_mask'][0] 40 | item['labels'] = item['labels'][0] 41 | return item 42 | 43 | def __len__(self): 44 | return len(self.data) 45 | 46 | class CLRPDataset_finetune(torch.utils.data.Dataset): 47 | def __init__(self, is_train, fold, train_data, tokenizer): 48 | self.is_train = is_train 49 | self.tokenizer = tokenizer 50 | 51 | if is_train: 52 | df = train_data.query(f"kfold != {fold}")[['excerpt','target']] 53 | else: 54 | df = train_data.query(f"kfold == {fold}")[['excerpt','target']] 55 | self.excerpt = df['excerpt'].to_numpy() 56 | self.target = df['target'].to_numpy() 57 | 58 | def __getitem__(self, idx): 
59 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt', 60 | max_length=256, 61 | padding='max_length',truncation=True) 62 | 63 | item = {} 64 | item['input_ids'] = tokenized['input_ids'][0] 65 | item['attention_mask'] = tokenized['attention_mask'][0] 66 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32) 67 | 68 | return item 69 | 70 | def __len__(self): 71 | return len(self.target) 72 | 73 | class CLRPDataset_pred(torch.utils.data.Dataset): 74 | def __init__(self,df,tokenizer): 75 | self.excerpt = df['excerpt'].to_numpy() 76 | self.tokenizer = tokenizer 77 | 78 | def __getitem__(self,idx): 79 | encode = self.tokenizer(self.excerpt[idx],return_tensors='pt', 80 | max_length=256, 81 | padding='max_length',truncation=True) 82 | encoded = {'input_ids':encode['input_ids'][0], 83 | 'attention_mask':encode['attention_mask'][0] 84 | } 85 | 86 | return encoded 87 | 88 | def __len__(self): 89 | return len(self.excerpt) 90 | 91 | class CLRPDataset_pseudo(torch.utils.data.Dataset): 92 | def __init__(self, is_train, label_path, train_data, tokenizer): 93 | self.tokenizer = tokenizer 94 | if is_train: 95 | df1 = pd.read_csv(label_path+"labeled_extra_0.csv") 96 | df2 = pd.read_csv(label_path+"labeled_extra_1.csv") 97 | df3 = pd.read_csv(label_path+"labeled_extra_2.csv") 98 | df4 = pd.read_csv(label_path+"labeled_extra_3.csv") 99 | df5 = pd.read_csv(label_path+"labeled_extra_4.csv") 100 | self.excerpt = df1['excerpt'].to_numpy() 101 | self.target = (df1['target'] + df2['target'] + df3['target'] + df4['target'] + df5['target']).to_numpy()/5 102 | else: 103 | self.excerpt = train_data['excerpt'].to_numpy() 104 | self.target = train_data['target'].to_numpy() 105 | 106 | def __getitem__(self, idx): 107 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt', 108 | max_length=256, 109 | padding='max_length',truncation=True) 110 | 111 | item = {} 112 | item['input_ids'] = tokenized['input_ids'][0] 113 | item['attention_mask'] = tokenized['attention_mask'][0] 114 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32) 115 | 116 | return item 117 | 118 | def __len__(self): 119 | return len(self.target) 120 | 121 | #reads 5 fold labeled data and mix 3x training data in 122 | class CLRPDataset_pseudo_5fold(torch.utils.data.Dataset): 123 | def __init__(self, is_train, fold, train_data, tokenizer, label_path): 124 | self.tokenizer = tokenizer 125 | if is_train: 126 | df = pd.read_csv(label_path+f"labeled_extra_{fold}.csv") 127 | tr = train_data.query(f"kfold != {fold}")[['excerpt','target']] 128 | df = pd.concat([df,tr,tr,tr], ignore_index=True) 129 | df = df.sample(frac=1).reset_index(drop=True) 130 | else: 131 | df = train_data.query(f"kfold == {fold}")[['excerpt','target']] 132 | self.excerpt = df['excerpt'].to_numpy() 133 | self.target = df['target'].to_numpy() 134 | ### 135 | 136 | def __getitem__(self, idx): 137 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt', 138 | max_length=256, 139 | padding='max_length',truncation=True) 140 | 141 | item = {} 142 | item['input_ids'] = tokenized['input_ids'][0] 143 | item['attention_mask'] = tokenized['attention_mask'][0] 144 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32) 145 | 146 | return item 147 | 148 | def __len__(self): 149 | return len(self.target) 150 | -------------------------------------------------------------------------------- /components/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
torch.nn as nn 3 | from transformers import AutoModel, AutoConfig 4 | from components.util import init_params 5 | 6 | class Custom_bert(nn.Module): 7 | def __init__(self,model_dir): 8 | super().__init__() 9 | 10 | #load base model 11 | config = AutoConfig.from_pretrained(model_dir) 12 | config.update({"output_hidden_states":True, 13 | "hidden_dropout_prob": 0.0, 14 | "layer_norm_eps": 1e-7}) 15 | 16 | self.base = AutoModel.from_pretrained(model_dir, config=config) 17 | 18 | dim = self.base.encoder.layer[0].output.dense.bias.shape[0] 19 | 20 | self.dropout = nn.Dropout(p=0.2) 21 | self.high_dropout = nn.Dropout(p=0.5) 22 | 23 | #weights for weighted layer average 24 | n_weights = 24 25 | weights_init = torch.zeros(n_weights).float() 26 | weights_init.data[:-1] = -3 27 | self.layer_weights = torch.nn.Parameter(weights_init) 28 | 29 | #attention head 30 | self.attention = nn.Sequential( 31 | nn.Linear(1024, 1024), 32 | nn.Tanh(), 33 | nn.Linear(1024, 1), 34 | nn.Softmax(dim=1) 35 | ) 36 | self.cls = nn.Sequential( 37 | nn.Linear(dim,1) 38 | ) 39 | init_params([self.cls,self.attention]) 40 | 41 | def reini_head(self): 42 | init_params([self.cls,self.attention]) 43 | return 44 | 45 | def forward(self, input_ids, attention_mask): 46 | base_output = self.base(input_ids=input_ids, 47 | attention_mask=attention_mask) 48 | 49 | #weighted average of all encoder outputs 50 | cls_outputs = torch.stack( 51 | [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0 52 | ) 53 | cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0) 54 | 55 | #multisample dropout 56 | logits = torch.mean( 57 | torch.stack( 58 | [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)], 59 | dim=0, 60 | ), 61 | dim=0, 62 | ) 63 | return self.cls(logits) 64 | -------------------------------------------------------------------------------- /components/optimizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AdamW 2 | import torch 3 | 4 | def get_optimizer(model,config): 5 | # divide encoder layers into 3 groups and assign different lr 6 | # head lr is set separately 7 | layers = len(model.base.encoder.layer) 8 | no_decay = ["bias", "LayerNorm.weight"] 9 | high_lr_head = ["layer_weights"] 10 | ### not in high_lr_head 11 | params_lst = [{'params':[p for n, p in model.named_parameters() 12 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters()) 13 | and not any(nd in n for nd in no_decay) 14 | and not any(nd in n for nd in high_lr_head)], 15 | 'lr': config['head_lr'], 16 | 'weight_decay': config['weight_decay'] 17 | }] 18 | params_lst.append({'params':[p for n, p in model.named_parameters() 19 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters()) 20 | and any(nd in n for nd in no_decay) 21 | and not any(nd in n for nd in high_lr_head)], 22 | 'lr': config['head_lr'], 23 | 'weight_decay': 0.0 24 | }) 25 | ### 26 | ### in high_lr_head 27 | params_lst.append({'params':[p for n, p in model.named_parameters() 28 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters()) 29 | and not any(nd in n for nd in no_decay) 30 | and any(lw in n for lw in high_lr_head)], 31 | 'lr': config['weight_lr'], 32 | 'weight_decay': config['weight_decay'] 33 | }) 34 | params_lst.append({'params':[p for n, p in model.named_parameters() 35 | if not any(en in n for en,ep in 
model.base.encoder.layer.named_parameters()) 36 | and any(nd in n for nd in no_decay) 37 | and any(lw in n for lw in high_lr_head)], 38 | 'lr': config['weight_lr'], 39 | 'weight_decay': 0.0 40 | }) 41 | ### 42 | parts = 3 43 | for i,j in zip(range(layers-1,-1,-int(layers/parts)),range(0,layers,int(layers/parts))): 44 | for k in range(int(layers/parts)): 45 | param_dict1 = {'params': [p for n, p in model.base.encoder.layer[i-k].named_parameters() 46 | if not any(nd in n for nd in no_decay)], 47 | 'weight_decay': config['weight_decay'], 48 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 49 | } 50 | param_dict2 = {'params': [p for n, p in model.base.encoder.layer[i-k].named_parameters() 51 | if any(nd in n for nd in no_decay)], 52 | 'weight_decay': 0.0, 53 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 54 | } 55 | params_lst.append(param_dict1) 56 | params_lst.append(param_dict2) 57 | 58 | optimizer = AdamW(params_lst, betas = config['betas']) 59 | 60 | return optimizer 61 | 62 | def get_optimizer_robertaMLM(model,config): 63 | layers = len(model.roberta.encoder.layer) 64 | no_decay = ["bias", "LayerNorm.weight"] 65 | high_lr_head = ["layer_weights"] 66 | ### not in high_lr_head 67 | params_lst = [{'params':[p for n, p in model.named_parameters() 68 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 69 | and not any(nd in n for nd in no_decay) 70 | and not any(nd in n for nd in high_lr_head)], 71 | 'lr': config['head_lr'], 72 | 'weight_decay': config['weight_decay'] 73 | }] 74 | params_lst.append({'params':[p for n, p in model.named_parameters() 75 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 76 | and any(nd in n for nd in no_decay) 77 | and not any(nd in n for nd in high_lr_head)], 78 | 'lr': config['head_lr'], 79 | 'weight_decay': 0.0 80 | }) 81 | ### 82 | ### in high_lr_head 83 | params_lst.append({'params':[p for n, p in model.named_parameters() 84 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 85 | and not any(nd in n for nd in no_decay) 86 | and any(lw in n for lw in high_lr_head)], 87 | 'lr': config['base_lr'], 88 | 'weight_decay': config['weight_decay'] 89 | }) 90 | params_lst.append({'params':[p for n, p in model.named_parameters() 91 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters()) 92 | and any(nd in n for nd in no_decay) 93 | and any(lw in n for lw in high_lr_head)], 94 | 'lr': config['base_lr'], 95 | 'weight_decay': 0.0 96 | }) 97 | ### 98 | parts = 3 99 | for i,j in zip(range(layers-1,-1,-int(layers/parts)),range(0,layers,int(layers/parts))): 100 | for k in range(int(layers/parts)): 101 | param_dict1 = {'params': [p for n, p in model.roberta.encoder.layer[i-k].named_parameters() 102 | if not any(nd in n for nd in no_decay)], 103 | 'weight_decay': config['weight_decay'], 104 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 105 | } 106 | param_dict2 = {'params': [p for n, p in model.roberta.encoder.layer[i-k].named_parameters() 107 | if any(nd in n for nd in no_decay)], 108 | 'weight_decay': 0.0, 109 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr'] 110 | } 111 | params_lst.append(param_dict1) 112 | params_lst.append(param_dict2) 113 | 114 | optimizer = AdamW(params_lst, betas = config['betas']) 115 | 116 | return optimizer 117 | 118 | def get_scheduler(optimizer, total_train_steps, config): 119 | #two schedules: 120 | #1. custom is similar to linear decay with warmup 121 | #2. 
3stage is simply halving every 1/3 steps 122 | def lr_lambda_1(step): 123 | total_steps = total_train_steps 124 | w = int(config['warm_up_ratio']*total_steps) 125 | d1 = int(config['decline_1']*total_steps) 126 | d2 = int(config['decline_2']*total_steps) 127 | d3 = int(config['decline_3']*total_steps) 128 | d4 = int(config['decline_4']*total_steps) 129 | min_vs_base_ratio = config['min_lr']/config['base_lr'] 130 | low_vs_base_ratio = config['low_lr']/config['base_lr'] 131 | if step <= w: 132 | return step/w 133 | elif step <= d1: 134 | return 1 135 | elif step <= d3: 136 | return max(min_vs_base_ratio,min_vs_base_ratio+(1-min_vs_base_ratio)*(d2-step)/(d2-d1)) 137 | else: 138 | return max(low_vs_base_ratio,low_vs_base_ratio+(min_vs_base_ratio-low_vs_base_ratio)*(d4-step)/(d4-d3)) 139 | def lr_lambda_2(step): 140 | if step <= total_train_steps * (1/3): 141 | return 1 142 | if step <= total_train_steps * (2/3): 143 | return 0.5 144 | if step <= total_train_steps * (3/3): 145 | return 0.25 146 | if config['lr_type'] == 'custom': 147 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda_1) 148 | elif config['lr_type'] == '3stage': 149 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda_2) 150 | -------------------------------------------------------------------------------- /components/predict.py: -------------------------------------------------------------------------------- 1 | from components.dataset import CLRPDataset_pred 2 | from components.model import Custom_bert 3 | import numpy as np 4 | import torch 5 | import gc 6 | gc.enable() 7 | from tqdm import tqdm 8 | from transformers import AutoTokenizer 9 | device = "cuda:0" 10 | 11 | def run_fold(fold_num,model_path,data): 12 | if 'roberta' in model_path: 13 | model_dir = './pretrained/roberta-large/' 14 | model_name = f"roberta_large_{fold_num}.pt" 15 | elif 'deberta' in model_path: 16 | model_dir = './pretrained/deberta-large/' 17 | model_name = f"deberta_large_{fold_num}.pt" 18 | 19 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 20 | model = Custom_bert(model_dir).to(device) 21 | _ = model.eval() 22 | model.load_state_dict(torch.load(model_path+model_name)) 23 | 24 | test_ds = CLRPDataset_pred(data,tokenizer) 25 | test_dl = torch.utils.data.DataLoader(test_ds, 26 | batch_size = 192, 27 | shuffle=False, 28 | pin_memory=True) 29 | 30 | pred = [] 31 | with torch.no_grad(): 32 | for batch in tqdm(test_dl): 33 | input_ids = batch['input_ids'].to(device) 34 | attention_mask = batch['attention_mask'].to(device) 35 | output = model(input_ids, attention_mask) 36 | pred.extend(output.detach().cpu().numpy()) 37 | 38 | del model, test_dl, test_ds 39 | gc.collect() 40 | torch.cuda.empty_cache() 41 | 42 | return np.array(pred) 43 | 44 | def get_single_model(pth,data): 45 | pred0 = run_fold(0,pth,data) 46 | pred1 = run_fold(1,pth,data) 47 | pred2 = run_fold(2,pth,data) 48 | pred3 = run_fold(3,pth,data) 49 | pred4 = run_fold(4,pth,data) 50 | 51 | return [pred0,pred1,pred2,pred3,pred4] 52 | -------------------------------------------------------------------------------- /components/train.py: -------------------------------------------------------------------------------- 1 | from components.dataset import CLRPDataset_finetune, CLRPDataset_pseudo, CLRPDataset_pseudo_5fold 2 | from components.util import seed_everything, create_folds, generate_config 3 | from components.model import Custom_bert 4 | from components.optimizer import get_optimizer, get_scheduler 5 | import pandas as 
pd 6 | import torch 7 | import torch.nn as nn 8 | from transformers import AutoTokenizer 9 | from tqdm import tqdm 10 | import numpy as np 11 | import os 12 | import gc 13 | gc.enable() 14 | 15 | def run_fold_ft(fold,config,train_data,tokenizer,t_bar): 16 | device = "cuda:0" 17 | #prep train/val datasets 18 | train_dataset = CLRPDataset_finetune(True, fold,train_data,tokenizer) 19 | val_dataset = CLRPDataset_finetune(False, fold,train_data,tokenizer) 20 | 21 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True) 22 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) 23 | 24 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps']) 25 | val_step = 1 26 | min_valid_loss = np.inf 27 | 28 | #load model 29 | model = Custom_bert(config['model_dir']).to(device) 30 | _ = model.eval() 31 | 32 | model.load_state_dict(torch.load(config['pretrained_path']), strict=False) 33 | 34 | #get optimizer and scheduler 35 | optimizer = get_optimizer(model,config) 36 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config) 37 | 38 | step = 0 39 | min_step = 0 40 | last_save_step = 0 41 | last_save_index = 0 42 | 43 | #seed_everything(seed=config['seed_'] + fold) 44 | 45 | optimizer.zero_grad() 46 | for epoch in range(config['num_epoch']): 47 | model.train() 48 | count = 0 49 | total_loss = 0 50 | for batch in train_loader: 51 | input_ids = batch['input_ids'].to(device) 52 | attention_mask = batch['attention_mask'].to(device) 53 | target = batch['target'].to(device) 54 | 55 | outputs = model(input_ids, attention_mask) 56 | 57 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target) 58 | 59 | loss = cls_loss / config['accumulation_steps'] 60 | 61 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps'] 62 | 63 | loss.backward() 64 | 65 | if (count+1) % config['accumulation_steps'] == 0: 66 | optimizer.step() 67 | lr_scheduler.step() 68 | optimizer.zero_grad() 69 | count = 0 70 | total_loss = 0 71 | else: 72 | count+=1 73 | 74 | #only save in radius of certain step 75 | if step >= (config['save_center']-config['save_radius']) and step <= (config['save_center']+config['save_radius']): 76 | val_step = 1 77 | do_val = True 78 | if config['only_val_in_radius']: 79 | if step < (config['save_center']-config['save_radius']) or step > (config['save_center']+config['save_radius']): 80 | do_val = False 81 | 82 | if ((step+1) % val_step == 0 and count == 0) and do_val: 83 | model.eval() 84 | l_val = nn.MSELoss(reduction='sum') 85 | with torch.no_grad(): 86 | total_loss_val = 0 87 | for batch in val_loader: 88 | input_ids = batch['input_ids'].to(device) 89 | attention_mask = batch['attention_mask'].to(device) 90 | outputs = model(input_ids, attention_mask) 91 | 92 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device)) 93 | 94 | val_loss = cls_loss_val 95 | 96 | total_loss_val+=val_loss.item() 97 | total_loss_val/=len(val_dataset) 98 | total_loss_val = total_loss_val**0.5 99 | 100 | if min_valid_loss > total_loss_val and step >= (config['save_center']-config['save_radius']) and step <= (config['save_center']+config['save_radius']): 101 | #saves model with lower loss 102 | min_step = step 103 | min_valid_loss = total_loss_val 104 | #print("min loss updated to ",min_valid_loss," at step ",min_step) 105 | if not os.path.isdir('./models'): 106 | os.mkdir('./models') 107 | if not 
os.path.isdir(config['save_path']): 108 | os.mkdir(config['save_path']) 109 | if 'roberta' in config['model_dir']: 110 | torch.save(model.state_dict(), config['save_path']+f'roberta_large_{fold}.pt') 111 | else: 112 | torch.save(model.state_dict(), config['save_path']+f'deberta_large_{fold}.pt') 113 | model.train() 114 | step+=1 115 | t_bar.update(1) 116 | del model,train_dataset,train_loader,val_dataset,val_loader 117 | gc.collect() 118 | torch.cuda.empty_cache() 119 | return min_valid_loss, min_step 120 | 121 | def train_ft(config): 122 | seed_everything(config['seed_']) 123 | 124 | train_data = pd.read_csv("./data/train.csv") 125 | train_data = create_folds(train_data, num_splits=5) 126 | model_dir = config['model_dir'] 127 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 128 | 129 | t_bar = tqdm(total=((2834*0.8//config['batch_size'])+1)*config['num_epoch']*config['n_folds']) 130 | train_losses = [] 131 | for i in range(config['n_folds']): 132 | loss, m_step = run_fold_ft(i,config,train_data,tokenizer,t_bar) 133 | train_losses.append(loss) 134 | return train_losses 135 | 136 | def train_pseudo(config, label_path): 137 | device = "cuda:0" 138 | seed_everything(config['seed_']) 139 | train_data = pd.read_csv("./data/train.csv") 140 | train_data = create_folds(train_data, num_splits=5) 141 | 142 | model_dir = config['model_dir'] 143 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 144 | 145 | train_dataset = CLRPDataset_pseudo(True,label_path,train_data,tokenizer) 146 | t_bar = tqdm(total=((len(train_dataset)//config['batch_size'])+1)*config['num_epoch']) 147 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True) 148 | 149 | val_dataset = CLRPDataset_pseudo(False,label_path,train_data,tokenizer) 150 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) 151 | 152 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps']) 153 | val_step = 100*config['accumulation_steps'] 154 | min_valid_loss = np.inf 155 | 156 | model = Custom_bert(config['model_dir']).to(device) 157 | _ = model.eval() 158 | 159 | if config['pretrained_path'] not in [None,'None']: 160 | print(model.load_state_dict(torch.load(config['pretrained_path']), strict=False)) 161 | 162 | optimizer = get_optimizer(model,config) 163 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config) 164 | 165 | step = 0 166 | min_step = 0 167 | last_save_step = 0 168 | last_save_index = 0 169 | 170 | optimizer.zero_grad() 171 | for epoch in range(config['num_epoch']): 172 | model.train() 173 | count = 0 174 | total_loss = 0 175 | for batch in train_loader: 176 | input_ids = batch['input_ids'].to(device) 177 | attention_mask = batch['attention_mask'].to(device) 178 | target = batch['target'].to(device) 179 | outputs = model(input_ids, attention_mask) 180 | 181 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target) 182 | 183 | loss = cls_loss / config['accumulation_steps'] 184 | 185 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps'] 186 | loss.backward() 187 | 188 | if (count+1) % config['accumulation_steps'] == 0: 189 | optimizer.step() 190 | lr_scheduler.step() 191 | optimizer.zero_grad() 192 | count = 0 193 | total_loss = 0 194 | else: 195 | count+=1 196 | 197 | if ((step+1) % val_step == 0): 198 | l_val = 
nn.MSELoss(reduction='sum') 199 | with torch.no_grad(): 200 | model.eval() 201 | total_loss_val = 0 202 | for batch in val_loader: 203 | input_ids = batch['input_ids'].to(device) 204 | attention_mask = batch['attention_mask'].to(device) 205 | outputs = model(input_ids, attention_mask) 206 | 207 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device)) 208 | 209 | val_loss = cls_loss_val 210 | 211 | total_loss_val+=val_loss.item() 212 | total_loss_val/=len(val_dataset) 213 | total_loss_val = total_loss_val**0.5 214 | 215 | if min_valid_loss > total_loss_val: 216 | min_step = step 217 | min_valid_loss = total_loss_val 218 | #print("min loss updated to ",min_valid_loss," at step ",min_step) 219 | # Saving State Dict 220 | if not os.path.isdir(config['save_path']): 221 | os.mkdir(config['save_path']) 222 | torch.save(model.state_dict(), config['save_path'] + config['pseudo_save_name']) 223 | model.train() 224 | step+=1 225 | t_bar.update(1) 226 | del model,train_dataset,train_loader 227 | gc.collect() 228 | torch.cuda.empty_cache() 229 | 230 | return min_valid_loss 231 | 232 | def train_pseudo_5fold(config, label_path): 233 | device = "cuda:0" 234 | seed_everything(config['seed_']) 235 | 236 | train_data = pd.read_csv("./data/train.csv") 237 | train_data = create_folds(train_data, num_splits=5) 238 | model_dir = config['model_dir'] 239 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256) 240 | 241 | min_val_losses = [] 242 | for fold in range(config['n_folds']): 243 | train_dataset = CLRPDataset_pseudo_5fold(True,fold,train_data,tokenizer,label_path) 244 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True) 245 | 246 | val_dataset = CLRPDataset_pseudo_5fold(False,fold,train_data,tokenizer,label_path) 247 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) 248 | 249 | if fold == 0: 250 | t_bar = tqdm(total=((len(train_dataset)*5//config['batch_size'])+1)*config['num_epoch']) 251 | 252 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps']) 253 | val_step = 100*config['accumulation_steps'] 254 | min_valid_loss = np.inf 255 | 256 | model = Custom_bert(config['model_dir']).to(device) 257 | _ = model.eval() 258 | 259 | if config['pretrained_path'] not in [None,'None']: 260 | model.load_state_dict(torch.load(config['pretrained_path']), strict=False) 261 | 262 | optimizer = get_optimizer(model,config) 263 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config) 264 | 265 | step = 0 266 | min_step = 0 267 | last_save_step = 0 268 | last_save_index = 0 269 | 270 | optimizer.zero_grad() 271 | for epoch in range(config['num_epoch']): 272 | model.train() 273 | count = 0 274 | total_loss = 0 275 | for batch in train_loader: 276 | input_ids = batch['input_ids'].to(device) 277 | attention_mask = batch['attention_mask'].to(device) 278 | target = batch['target'].to(device) 279 | 280 | outputs = model(input_ids, attention_mask) 281 | 282 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target) 283 | 284 | loss = cls_loss / config['accumulation_steps'] 285 | 286 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps'] 287 | loss.backward() 288 | 289 | if (count+1) % config['accumulation_steps'] == 0: 290 | optimizer.step() 291 | lr_scheduler.step() 292 | optimizer.zero_grad() 293 | count = 0 294 | total_loss = 0 295 | else: 
296 | count+=1 297 | 298 | if ((step+1) % val_step == 0): 299 | model.eval() 300 | l_val = nn.MSELoss(reduction='sum') 301 | with torch.no_grad(): 302 | total_loss_val = 0 303 | for batch in val_loader: 304 | input_ids = batch['input_ids'].to(device) 305 | attention_mask = batch['attention_mask'].to(device) 306 | outputs = model(input_ids, attention_mask) 307 | 308 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device)) 309 | 310 | val_loss = cls_loss_val 311 | 312 | total_loss_val+=val_loss.item() 313 | total_loss_val/=len(val_dataset) 314 | total_loss_val = total_loss_val**0.5 315 | 316 | if min_valid_loss > total_loss_val and epoch > 0: 317 | min_step = step 318 | min_valid_loss = total_loss_val 319 | if not os.path.isdir('./models'): 320 | os.mkdir('./models') 321 | if not os.path.isdir(config['save_path']): 322 | os.mkdir(config['save_path']) 323 | if 'roberta' in config['model_dir']: 324 | torch.save(model.state_dict(), config['save_path']+f'roberta_large_{fold}.pt') 325 | else: 326 | torch.save(model.state_dict(), config['save_path']+f'deberta_large_{fold}.pt') 327 | model.train() 328 | step+=1 329 | t_bar.update(1) 330 | del model,train_dataset,train_loader,val_dataset,val_loader 331 | gc.collect() 332 | torch.cuda.empty_cache() 333 | min_val_losses.append(min_valid_loss) 334 | return min_val_losses 335 | -------------------------------------------------------------------------------- /components/util.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import os 4 | import torch 5 | import pandas as pd 6 | from sklearn.model_selection import KFold,StratifiedKFold 7 | 8 | def seed_everything(seed): 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | os.environ["PYTHONHASHSEED"] = str(seed) 12 | 13 | torch.manual_seed(seed) 14 | torch.cuda.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | 17 | torch.backends.cudnn.deterministic = True 18 | 19 | def create_folds(data, num_splits): 20 | # we create a new column called kfold and fill it with -1 21 | data["kfold"] = -1 22 | 23 | # the next step is to randomize the rows of the data 24 | data = data.sample(frac=1).reset_index(drop=True) 25 | 26 | # calculate number of bins by Sturge's rule 27 | # I take the floor of the value, you can also 28 | # just round it 29 | num_bins = int(np.floor(1 + np.log2(len(data)))) 30 | 31 | # bin targets 32 | data.loc[:, "bins"] = pd.cut( 33 | data["target"], bins=num_bins, labels=False 34 | ) 35 | 36 | # initiate the kfold class from model_selection module 37 | kf = StratifiedKFold(n_splits=num_splits) 38 | 39 | # fill the new kfold column 40 | # note that, instead of targets, we use bins! 
41 | for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)): 42 | data.loc[v_, 'kfold'] = f 43 | 44 | # drop the bins column 45 | data = data.drop("bins", axis=1) 46 | 47 | # return dataframe with folds 48 | return data 49 | 50 | def init_params(module_lst): 51 | for module in module_lst: 52 | for param in module.parameters(): 53 | if param.dim() > 1: 54 | torch.nn.init.xavier_uniform_(param) 55 | return 56 | 57 | def generate_config(model_type,pretrained_path,save_path,lr_type,lr_setting): 58 | config = {'model_dir': './pretrained/roberta-large/', 59 | 'n_folds': 5, 60 | 'num_epoch': 3, 61 | 'weight_decay': 0.01, 62 | 'head_lr': 1e-4, 63 | 'weight_lr': 5e-2, 64 | 'base_lr': 7e-5, 65 | 'min_lr': 2e-5, 66 | 'low_lr': 1e-5, 67 | 'warm_up_ratio': 0.06, 68 | 'decline_1': 0.15, 69 | 'decline_2': 0.6, 70 | 'decline_3': 0.7, 71 | 'decline_4': 0.75, 72 | 'layerwise_decay_rate': 0.875**0.5, 73 | 'seed_': 88888888, 74 | 'reini_head':False, 75 | 'only_val_in_radius': True, 76 | 'save_center': 330, 77 | 'save_radius': 5, 78 | 'betas': (0.9, 0.999), 79 | } 80 | config['pretrained_path'] = pretrained_path 81 | config['save_path'] = save_path 82 | config['lr_type'] = lr_type 83 | if model_type == 'ro': 84 | config['model_dir'] = './pretrained/roberta-large/' 85 | config['batch_size'] = 16 86 | config['accumulation_steps'] = 1 87 | config['pseudo_save_name'] = 'roberta_large_single.pt' 88 | elif model_type == 'de': 89 | config['model_dir'] = './pretrained/deberta-large/' 90 | config['batch_size'] = 8 91 | config['accumulation_steps'] = 2 92 | config['save_center'] = 660 93 | config['save_radius'] = 10 94 | config['pseudo_save_name'] = 'deberta_large_single.pt' 95 | 96 | if lr_setting == '2': 97 | config['num_epoch'] = 2 98 | config['head_lr']= 1e-5 99 | config['weight_lr']= 5e-3 100 | config['base_lr']= 7e-6 101 | config['min_lr']= 2e-6 102 | config['low_lr']= 1e-6 103 | elif lr_setting == '3': 104 | config['head_lr']= 5e-5 105 | config['weight_lr']= 2e-3 106 | config['base_lr']= 3e-5 107 | config['min_lr']= 1e-5 108 | config['low_lr']= 5e-6 109 | 110 | return config 111 | -------------------------------------------------------------------------------- /run_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3.9 0.prepare_data.py 3 | python3.9 1.roberta_pretrain.py 4 | python3.9 2.finetune.py ro ./models/roberta_large_pretrain.pt ./models/roberta_1/ custom 1 5 | 6 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_1/ 1 1 ./models/roberta_1/ 7 | python3.9 3.pseudo_train.py de ./extra_data/pseudo_1/ ./models/deberta_1/ 1 8 | 9 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_2/ 2 1 ./models/roberta_1/ ./models/deberta_1/ 10 | python3.9 3.pseudo_train.py de ./extra_data/pseudo_2/ ./models/deberta_2/ 0 11 | python3.9 2.finetune.py de ./models/deberta_2/deberta_large_single.pt ./models/deberta_2/ 3stage 3 12 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_3/ 3 1 ./models/roberta_1/ ./models/deberta_1/ ./models/deberta_2/ 13 | python3.9 3.pseudo_train.py ro ./extra_data/pseudo_3/ ./models/roberta_2/ 0 14 | python3.9 2.finetune.py ro ./models/roberta_2/roberta_large_single.pt ./models/roberta_2/ 3stage 3 15 | --------------------------------------------------------------------------------