├── 0.prepare_data.py
├── 1.roberta_pretrain.py
├── 2.finetune.py
├── 3.pseudo_train.py
├── 4.predict.py
├── LICENSE
├── README.md
├── components
│   ├── dataset.py
│   ├── model.py
│   ├── optimizer.py
│   ├── predict.py
│   ├── train.py
│   └── util.py
└── run_train.sh
/0.prepare_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | def main():
3 | with open('./extra_data/simple_english_wiki.txt') as f:
4 | contents = f.read()
5 | contents = contents.split('\n\n')
6 | for i in range(len(contents)):
7 | contents[i] = '\n'.join(contents[i].split('\n')[1:])
8 | length = 1000
9 | all_data_split = []
10 | for txt in contents:
11 | all_data_split.extend(txt[i:i+length] for i in range(0, len(txt), length))
12 | with open('./extra_data/cbt_valid.txt') as f:
13 | cbt_v = f.read()
14 | with open('./extra_data/cbt_test.txt') as f:
15 | cbt_te = f.read()
16 | with open('./extra_data/cbt_train.txt') as f:
17 | cbt_tr = f.read()
18 | cbt = cbt_v+cbt_te+cbt_tr
19 | cbt = cbt.replace('`',"'")
20 | cbt = cbt.replace("''",'"')
21 | all_data_split.extend(cbt[i:i+length] for i in range(0, len(cbt), length))
22 | df = pd.DataFrame()
23 | df['excerpt'] = all_data_split
24 | df.to_csv('./extra_data/extra_excerpt.csv',index=False)
25 |
26 | if __name__ == "__main__":
27 | main()
28 |
--------------------------------------------------------------------------------
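0.prepare_data.py above slices each cleaned text into consecutive 1000-character excerpts. A minimal sketch of the same slicing on a toy string (the helper name chunk_text is ours, not part of the repo):

def chunk_text(txt, length=1000):
    # same slicing as in 0.prepare_data.py: consecutive, non-overlapping windows
    return [txt[i:i + length] for i in range(0, len(txt), length)]

print([len(c) for c in chunk_text("a" * 2500)])  # [1000, 1000, 500]
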
/1.roberta_pretrain.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from transformers import AutoTokenizer, RobertaForMaskedLM, AutoConfig
3 | from transformers import Trainer, TrainingArguments
4 | from components.util import seed_everything
5 | from components.dataset import MLMDataset
6 | from components.optimizer import get_optimizer_robertaMLM, get_scheduler
7 | import torch
8 | import os
9 | import sys
10 |
11 | def main():
12 | ###
13 | # MLM pretrain with training data
14 | ###
15 | device = "cuda:0"
16 | model_dir = './pretrained/roberta-large/'
17 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256, add_prefix_space=True)
18 | model = RobertaForMaskedLM.from_pretrained(model_dir, local_files_only=True).to(device)
19 |
20 | df = pd.read_csv('./data/train.csv')[['excerpt']]
21 | texts = df['excerpt'].tolist()
22 | df_val = pd.read_csv('./data/test.csv')[['excerpt']]
23 | test = df_val['excerpt'].tolist()
24 | texts = texts+test
25 |
26 | seed_everything(456982)
27 |
28 | train_dataset = MLMDataset(True,texts,tokenizer)
29 | val_dataset = MLMDataset(True,texts,tokenizer)
30 |
31 | config = {
32 | 'lr_type':'custom',
33 | 'base_lr':9e-5,
34 | 'head_lr':1.2e-4,
35 | 'min_lr':4e-5,
36 | 'low_lr':2e-5,
37 | 'n_epoch':5,
38 | 'bs':16,
39 | 'ga':1,
40 | 'lr_scheduler_mul_factor':2,
41 | 'weight_decay':0.01,
42 | 'warm_up_ratio':0.2,
43 | 'decline_1': 0.2,
44 | 'decline_2': 0.7,
45 | 'decline_3': 0.8,
46 | 'decline_4': 0.9,
47 | 'layerwise_decay_rate': 0.9**0.5,
48 | 'betas': (0.9,0.993),
49 | }
50 |
51 | train_len = len(train_dataset)
52 | total_train_steps = int(train_len * config['n_epoch'] / config['ga'] / config['bs'])
53 | optimizer = get_optimizer_robertaMLM(model,config)
54 | lr_scheduler = get_scheduler(optimizer, total_train_steps, config)
55 |
56 | training_args = TrainingArguments(
57 | output_dir='./', # output directory
58 | num_train_epochs=config['n_epoch'], # total number of training epochs
59 | overwrite_output_dir=True,
60 | per_device_train_batch_size=config['bs'], # batch size per device during training
61 | per_device_eval_batch_size=32, # batch size for evaluation
62 | weight_decay=0.01, # strength of weight decay
63 | logging_strategy='no',
64 | gradient_accumulation_steps = config['ga'],
65 | save_strategy = "no",
66 | evaluation_strategy= 'epoch',
67 | prediction_loss_only=True,
68 | learning_rate = config['base_lr'],
69 | )
70 |
71 | trainer = Trainer(
72 | model=model, # the instantiated 🤗 Transformers model to be trained
73 | args=training_args, # training arguments, defined above
74 | train_dataset=train_dataset, # training dataset
75 | eval_dataset=val_dataset, # evaluation dataset
76 | optimizers = (optimizer, lr_scheduler)
77 | )
78 |
79 | trainer.train()
80 | if not os.path.isdir('./models'):
81 | os.mkdir('./models')
82 | dict_ = model.state_dict()
83 | for key in list(dict_.keys()):
84 | dict_[key.replace('roberta.', 'base.')] = dict_.pop(key)
85 | torch.save(dict_, './models/roberta_large_pretrain.pt')
86 |
87 | if __name__ == "__main__":
88 | main()
89 |
--------------------------------------------------------------------------------
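The key renaming at the end of 1.roberta_pretrain.py (roberta. -> base.) exists so the MLM-pretrained encoder can later be loaded into Custom_bert, whose backbone attribute is named base; components/train.py does this with strict=False. A minimal sketch of that hand-off, assuming the checkpoint path produced by the script above:

import torch
from components.model import Custom_bert

model = Custom_bert('./pretrained/roberta-large/')
# strict=False: the checkpoint's lm_head.* keys are ignored and the freshly
# initialized regression head keeps its own weights
result = model.load_state_dict(torch.load('./models/roberta_large_pretrain.pt'), strict=False)
print(result.missing_keys, result.unexpected_keys)
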
/2.finetune.py:
--------------------------------------------------------------------------------
1 | #args:
2 | #1. type of model: 'ro' or 'de'
3 | #2. pretrained path
4 | #3. save path
5 | #4. lr type 'custom' or '3stage'
6 | #5. lr config type 1-3
7 | # 1.training from scratch (use lr type 'custom')
8 | # 2.pseudo pretrain (use lr type 'custom')
9 | # 3.pseudo finetune (use lr type '3stage')
10 |
11 | from components.train import train_ft
12 | from components.util import generate_config
13 | import sys
14 | import numpy as np
15 |
16 | def main():
17 | ###
18 | # training using provided training data
19 | ###
20 | config = generate_config(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
21 | losses = train_ft(config)
22 | print(np.mean(losses),'\n',losses)
23 |
24 | if __name__ == "__main__":
25 | main()
26 |
--------------------------------------------------------------------------------
/3.pseudo_train.py:
--------------------------------------------------------------------------------
1 | #args:
2 | #1. type of model: 'ro' or 'de'
3 | #2. label path
4 | #3. save path
5 | #4. mode: 0=mix 1=5fold
6 |
7 | from components.train import train_pseudo, train_pseudo_5fold
8 | from components.util import generate_config
9 | import sys
10 | import numpy as np
11 |
12 | def main():
13 | ###
14 | # training using extra training data
15 | ###
16 | config = generate_config(sys.argv[1],'None',sys.argv[3],'custom','2')
17 | if sys.argv[4]=='0':
18 | min_valid_loss = train_pseudo(config,sys.argv[2])
19 | print(min_valid_loss)
20 | else:
21 | min_valid_loss = train_pseudo_5fold(config,sys.argv[2])
22 | print(min_valid_loss)
23 |
24 | if __name__ == "__main__":
25 | main()
26 |
--------------------------------------------------------------------------------
/4.predict.py:
--------------------------------------------------------------------------------
1 | #args:
2 | #1. source file
3 | #2. target file
4 | #3. num of models
5 | #4. mode: 0=single prediction csv 1=5-fold pseudo-label csvs
6 | #5. first model dir
7 | #... further model dirs, one per model
8 |
9 | import sys
10 | import numpy as np
11 | from components.predict import get_single_model
12 | import pandas as pd
13 | import os
14 |
15 | def main():
16 | ###
17 | # generate predictions: a single csv for inference (mode 0) or 5-fold pseudo labels (mode 1)
18 | ###
19 | source_path = sys.argv[1]
20 | target_path = sys.argv[2]
21 | num_of_models = int(sys.argv[3])
22 | model_dirs = [sys.argv[i+5] for i in range(num_of_models)]
23 |
24 | data = pd.read_csv(source_path)
25 |
26 | preds = []
27 | for i in range(num_of_models):
28 | preds.append(get_single_model(model_dirs[i],data))
29 |
30 | if sys.argv[4] == '1':
31 | #hard-coded blending weights for the case of exactly one roberta and one deberta model
32 | if num_of_models == 2 and 'roberta' in model_dirs[0] and 'deberta' in model_dirs[1]:
33 | preds_fold0 = [pred[0] for pred in preds]
34 | preds_fold1 = [pred[1] for pred in preds]
35 | preds_fold2 = [pred[2] for pred in preds]
36 | preds_fold3 = [pred[3] for pred in preds]
37 | preds_fold4 = [pred[4] for pred in preds]
38 | preds_0 = preds_fold0[0] * 0.33 + preds_fold0[1] * 0.67
39 | preds_1 = preds_fold1[0] * 0.33 + preds_fold1[1] * 0.67
40 | preds_2 = preds_fold2[0] * 0.33 + preds_fold2[1] * 0.67
41 | preds_3 = preds_fold3[0] * 0.33 + preds_fold3[1] * 0.67
42 | preds_4 = preds_fold4[0] * 0.33 + preds_fold4[1] * 0.67
43 | else:
44 | preds_0 = np.mean(np.concatenate([pred[0] for pred in preds],axis=1),axis=1)
45 | preds_1 = np.mean(np.concatenate([pred[1] for pred in preds],axis=1),axis=1)
46 | preds_2 = np.mean(np.concatenate([pred[2] for pred in preds],axis=1),axis=1)
47 | preds_3 = np.mean(np.concatenate([pred[3] for pred in preds],axis=1),axis=1)
48 | preds_4 = np.mean(np.concatenate([pred[4] for pred in preds],axis=1),axis=1)
49 | labeled_extra0 = data.copy()
50 | labeled_extra1 = data.copy()
51 | labeled_extra2 = data.copy()
52 | labeled_extra3 = data.copy()
53 | labeled_extra4 = data.copy()
54 | labeled_extra0['target'] = preds_0
55 | labeled_extra1['target'] = preds_1
56 | labeled_extra2['target'] = preds_2
57 | labeled_extra3['target'] = preds_3
58 | labeled_extra4['target'] = preds_4
59 |
60 | if not os.path.isdir(target_path):
61 | os.mkdir(target_path)
62 | labeled_extra0.to_csv(target_path + 'labeled_extra_0.csv',index=False)
63 | labeled_extra1.to_csv(target_path + 'labeled_extra_1.csv',index=False)
64 | labeled_extra2.to_csv(target_path + 'labeled_extra_2.csv',index=False)
65 | labeled_extra3.to_csv(target_path + 'labeled_extra_3.csv',index=False)
66 | labeled_extra4.to_csv(target_path + 'labeled_extra_4.csv',index=False)
67 |
68 | else:
69 | preds = [np.expand_dims(np.mean(np.concatenate(pred,axis=1),axis=1),axis=1) for pred in preds]
70 | if num_of_models == 2 and 'roberta' in model_dirs[0] and 'deberta' in model_dirs[1]:
71 | pred = preds[0] * 0.33 + preds[1] * 0.67
72 | else:
73 | cat = np.concatenate(preds,axis=1)
74 | pred = np.mean(cat,axis=1)
75 | data['target'] = pred
76 | data.to_csv(target_path,index=False)
77 |
78 | if __name__ == "__main__":
79 | main()
80 |
--------------------------------------------------------------------------------
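For reference, get_single_model (components/predict.py) returns a list of five per-fold prediction arrays of shape (n, 1), and the script above reduces them differently per mode. A small sketch of the mode-0 reduction on dummy arrays; the 0.33/0.67 weights are the hard-coded roberta/deberta pair weights from the script:

import numpy as np

# stand-in for two models' outputs: 5 folds of (n, 1) predictions each
n = 4
preds = [[np.random.randn(n, 1) for _ in range(5)] for _ in range(2)]

# mode 0 (inference): average the 5 folds of each model, then blend the models
per_model = [np.expand_dims(np.mean(np.concatenate(p, axis=1), axis=1), axis=1) for p in preds]
blended = per_model[0] * 0.33 + per_model[1] * 0.67
print(blended.shape)  # (4, 1)
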
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Danielhuxc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CLRP-solution
2 | ### requirements:
3 | numpy==1.20.2 \
4 | pandas==1.2.4 \
5 | transformers==4.5.1 \
6 | torch==1.9.0+cu111 \
7 | sklearn==0.0 \
8 | tqdm==4.60.0
9 |
10 | training requires a GPU with 24GB of VRAM
11 |
12 | ### prep:
13 | download pretrained roberta-large and deberta-large from: \
14 | https://huggingface.co/roberta-large \
15 | https://huggingface.co/microsoft/deberta-large \
16 | and save them in \
17 | ./pretrained/roberta-large \
18 | ./pretrained/deberta-large
19 |
20 | download \
21 | Children's Book Test from: \
22 | https://research.fb.com/downloads/babi/ \
23 | Simple Wiki Dump from: \
24 | https://github.com/LGDoor/Dump-of-Simple-English-Wiki \
25 | and save them as follows \
26 | ./extra_data/cbt_test.txt \
27 | ./extra_data/cbt_train.txt \
28 | ./extra_data/cbt_valid.txt \
29 | ./extra_data/simple_english_wiki.txt
30 |
31 | CLRP competition data goes to \
32 | ./data/train.csv \
33 | ./data/test.csv
34 |
35 | ### train from scratch:
36 | ./run_train.sh \
37 | takes about 30 hours
38 |
39 | ### predict:
40 | python 4.predict.py ./{path_to_source_file}.csv ./{path_to_save}.csv 3 0 ./models/roberta_2/ ./models/deberta_1/ ./models/deberta_2/ \
41 | make sure the text column in the source csv file is named 'excerpt'
42 |
43 | ### solution writeup:
44 | https://www.kaggle.com/c/commonlitreadabilityprize/discussion/258095
45 |
--------------------------------------------------------------------------------
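As the README notes, 4.predict.py only needs an 'excerpt' column in the source csv. A minimal sketch of preparing such a file (my_source.csv is just an example name):

import pandas as pd

pd.DataFrame({'excerpt': [
    "The quick brown fox jumps over the lazy dog.",
    "This is a second passage to score.",
]}).to_csv('./my_source.csv', index=False)
# then: python 4.predict.py ./my_source.csv ./my_preds.csv 3 0 ./models/roberta_2/ ./models/deberta_1/ ./models/deberta_2/
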
/components/dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pandas as pd
3 |
4 | class MLMDataset(torch.utils.data.Dataset):
5 | def __init__(self, is_train, texts, tokenizer):
6 | self.is_train = is_train
7 | self.tokenizer = tokenizer
8 | if self.is_train:
9 | self.data = texts
10 | else:
11 | self.data = texts
12 | ### optionally keep only a portion of the data (divisor of 1 keeps everything)
13 | length = int(len(self.data)/1)
14 | self.data = self.data[:length]
15 | ###
16 |
17 | def __getitem__(self, idx):
18 | item = self.tokenizer(self.data[idx], padding='max_length', is_split_into_words = False,truncation=True, return_tensors="pt")
19 |
20 | item['labels'] = item['input_ids'].clone()
21 |
22 | probability_matrix = torch.full(item['labels'].shape, 0.15)
23 | special_tokens_mask = [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in item['labels'].tolist()]
24 | probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
25 | masked_indices = torch.bernoulli(probability_matrix).bool()
26 | item['labels'][~masked_indices] = -100
27 |
28 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
29 | indices_replaced = torch.bernoulli(torch.full(item['labels'].shape, 0.8)).bool() & masked_indices
30 | item['input_ids'][indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
31 |
32 | # 10% of the time, we replace masked input tokens with random word
33 | indices_random = torch.bernoulli(torch.full(item['labels'].shape, 0.5)).bool() & masked_indices & ~indices_replaced
34 | random_words = torch.randint(len(self.tokenizer), item['labels'].shape, dtype=torch.long)
35 | item['input_ids'][indices_random] = random_words[indices_random]
36 |
37 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged
38 | item['input_ids'] = item['input_ids'][0]
39 | item['attention_mask'] = item['attention_mask'][0]
40 | item['labels'] = item['labels'][0]
41 | return item
42 |
43 | def __len__(self):
44 | return len(self.data)
45 |
46 | class CLRPDataset_finetune(torch.utils.data.Dataset):
47 | def __init__(self, is_train, fold, train_data, tokenizer):
48 | self.is_train = is_train
49 | self.tokenizer = tokenizer
50 |
51 | if is_train:
52 | df = train_data.query(f"kfold != {fold}")[['excerpt','target']]
53 | else:
54 | df = train_data.query(f"kfold == {fold}")[['excerpt','target']]
55 | self.excerpt = df['excerpt'].to_numpy()
56 | self.target = df['target'].to_numpy()
57 |
58 | def __getitem__(self, idx):
59 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt',
60 | max_length=256,
61 | padding='max_length',truncation=True)
62 |
63 | item = {}
64 | item['input_ids'] = tokenized['input_ids'][0]
65 | item['attention_mask'] = tokenized['attention_mask'][0]
66 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32)
67 |
68 | return item
69 |
70 | def __len__(self):
71 | return len(self.target)
72 |
73 | class CLRPDataset_pred(torch.utils.data.Dataset):
74 | def __init__(self,df,tokenizer):
75 | self.excerpt = df['excerpt'].to_numpy()
76 | self.tokenizer = tokenizer
77 |
78 | def __getitem__(self,idx):
79 | encode = self.tokenizer(self.excerpt[idx],return_tensors='pt',
80 | max_length=256,
81 | padding='max_length',truncation=True)
82 | encoded = {'input_ids':encode['input_ids'][0],
83 | 'attention_mask':encode['attention_mask'][0]
84 | }
85 |
86 | return encoded
87 |
88 | def __len__(self):
89 | return len(self.excerpt)
90 |
91 | class CLRPDataset_pseudo(torch.utils.data.Dataset):
92 | def __init__(self, is_train, label_path, train_data, tokenizer):
93 | self.tokenizer = tokenizer
94 | if is_train:
95 | df1 = pd.read_csv(label_path+"labeled_extra_0.csv")
96 | df2 = pd.read_csv(label_path+"labeled_extra_1.csv")
97 | df3 = pd.read_csv(label_path+"labeled_extra_2.csv")
98 | df4 = pd.read_csv(label_path+"labeled_extra_3.csv")
99 | df5 = pd.read_csv(label_path+"labeled_extra_4.csv")
100 | self.excerpt = df1['excerpt'].to_numpy()
101 | self.target = (df1['target'] + df2['target'] + df3['target'] + df4['target'] + df5['target']).to_numpy()/5
102 | else:
103 | self.excerpt = train_data['excerpt'].to_numpy()
104 | self.target = train_data['target'].to_numpy()
105 |
106 | def __getitem__(self, idx):
107 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt',
108 | max_length=256,
109 | padding='max_length',truncation=True)
110 |
111 | item = {}
112 | item['input_ids'] = tokenized['input_ids'][0]
113 | item['attention_mask'] = tokenized['attention_mask'][0]
114 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32)
115 |
116 | return item
117 |
118 | def __len__(self):
119 | return len(self.target)
120 |
121 | #reads the 5-fold pseudo-labeled data and mixes in 3x the original training data
122 | class CLRPDataset_pseudo_5fold(torch.utils.data.Dataset):
123 | def __init__(self, is_train, fold, train_data, tokenizer, label_path):
124 | self.tokenizer = tokenizer
125 | if is_train:
126 | df = pd.read_csv(label_path+f"labeled_extra_{fold}.csv")
127 | tr = train_data.query(f"kfold != {fold}")[['excerpt','target']]
128 | df = pd.concat([df,tr,tr,tr], ignore_index=True)
129 | df = df.sample(frac=1).reset_index(drop=True)
130 | else:
131 | df = train_data.query(f"kfold == {fold}")[['excerpt','target']]
132 | self.excerpt = df['excerpt'].to_numpy()
133 | self.target = df['target'].to_numpy()
134 | ###
135 |
136 | def __getitem__(self, idx):
137 | tokenized = self.tokenizer(self.excerpt[idx],return_tensors='pt',
138 | max_length=256,
139 | padding='max_length',truncation=True)
140 |
141 | item = {}
142 | item['input_ids'] = tokenized['input_ids'][0]
143 | item['attention_mask'] = tokenized['attention_mask'][0]
144 | item['target'] = torch.tensor(self.target[idx]).type(torch.float32)
145 |
146 | return item
147 |
148 | def __len__(self):
149 | return len(self.target)
150 |
--------------------------------------------------------------------------------
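MLMDataset above selects 15% of the tokens and applies the standard BERT/RoBERTa corruption split; the 0.8 and 0.5 constants work out to the familiar 80/10/10 proportions. A quick check of that arithmetic:

p_select = 0.15                   # tokens chosen for the MLM objective
p_mask   = 0.8                    # of those: replaced by the mask token
p_random = (1 - p_mask) * 0.5     # of those: replaced by a random token
p_keep   = 1 - p_mask - p_random  # of those: left unchanged
print(round(p_mask, 2), round(p_random, 2), round(p_keep, 2))  # 0.8 0.1 0.1
print(round(p_select * p_random, 3))                           # 0.015 of all tokens become random words
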
/components/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from transformers import AutoModel, AutoConfig
4 | from components.util import init_params
5 |
6 | class Custom_bert(nn.Module):
7 | def __init__(self,model_dir):
8 | super().__init__()
9 |
10 | #load base model
11 | config = AutoConfig.from_pretrained(model_dir)
12 | config.update({"output_hidden_states":True,
13 | "hidden_dropout_prob": 0.0,
14 | "layer_norm_eps": 1e-7})
15 |
16 | self.base = AutoModel.from_pretrained(model_dir, config=config)
17 |
18 | dim = self.base.encoder.layer[0].output.dense.bias.shape[0]
19 |
20 | self.dropout = nn.Dropout(p=0.2)
21 | self.high_dropout = nn.Dropout(p=0.5)
22 |
23 | #weights for weighted layer average
24 | n_weights = 24
25 | weights_init = torch.zeros(n_weights).float()
26 | weights_init.data[:-1] = -3
27 | self.layer_weights = torch.nn.Parameter(weights_init)
28 |
29 | #attention head
30 | self.attention = nn.Sequential(
31 | nn.Linear(1024, 1024),
32 | nn.Tanh(),
33 | nn.Linear(1024, 1),
34 | nn.Softmax(dim=1)
35 | )
36 | self.cls = nn.Sequential(
37 | nn.Linear(dim,1)
38 | )
39 | init_params([self.cls,self.attention])
40 |
41 | def reini_head(self):
42 | init_params([self.cls,self.attention])
43 | return
44 |
45 | def forward(self, input_ids, attention_mask):
46 | base_output = self.base(input_ids=input_ids,
47 | attention_mask=attention_mask)
48 |
49 | #weighted average of all encoder outputs
50 | cls_outputs = torch.stack(
51 | [self.dropout(layer) for layer in base_output['hidden_states'][-24:]], dim=0
52 | )
53 | cls_output = (torch.softmax(self.layer_weights, dim=0).unsqueeze(1).unsqueeze(1).unsqueeze(1) * cls_outputs).sum(0)
54 |
55 | #multisample dropout
56 | logits = torch.mean(
57 | torch.stack(
58 | [torch.sum(self.attention(self.high_dropout(cls_output)) * cls_output, dim=1) for _ in range(5)],
59 | dim=0,
60 | ),
61 | dim=0,
62 | )
63 | return self.cls(logits)
64 |
--------------------------------------------------------------------------------
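Custom_bert combines a learned softmax-weighted average over the last 24 hidden states with an attention-pooling head and multisample dropout. A minimal sketch of the pooling math on dummy tensors; the random scores stand in for the learned attention module:

import torch

hidden_states = [torch.randn(2, 8, 1024) for _ in range(24)]   # 24 layers, batch 2, seq 8, dim 1024
layer_weights = torch.zeros(24)
layer_weights[:-1] = -3                                        # same init as Custom_bert: last layer dominates

stacked = torch.stack(hidden_states, dim=0)                    # (24, 2, 8, 1024)
w = torch.softmax(layer_weights, dim=0).view(24, 1, 1, 1)
weighted = (w * stacked).sum(0)                                # (2, 8, 1024)

attn = torch.softmax(torch.randn(2, 8, 1), dim=1)              # stand-in for the attention head
pooled = (attn * weighted).sum(dim=1)                          # (2, 1024), fed to the final linear layer
print(weighted.shape, pooled.shape)
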
/components/optimizer.py:
--------------------------------------------------------------------------------
1 | from transformers import AdamW
2 | import torch
3 |
4 | def get_optimizer(model,config):
5 | # divide encoder layers into 3 groups and assign different lr
6 | # head lr is set separately
7 | layers = len(model.base.encoder.layer)
8 | no_decay = ["bias", "LayerNorm.weight"]
9 | high_lr_head = ["layer_weights"]
10 | ### not in high_lr_head
11 | params_lst = [{'params':[p for n, p in model.named_parameters()
12 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters())
13 | and not any(nd in n for nd in no_decay)
14 | and not any(nd in n for nd in high_lr_head)],
15 | 'lr': config['head_lr'],
16 | 'weight_decay': config['weight_decay']
17 | }]
18 | params_lst.append({'params':[p for n, p in model.named_parameters()
19 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters())
20 | and any(nd in n for nd in no_decay)
21 | and not any(nd in n for nd in high_lr_head)],
22 | 'lr': config['head_lr'],
23 | 'weight_decay': 0.0
24 | })
25 | ###
26 | ### in high_lr_head
27 | params_lst.append({'params':[p for n, p in model.named_parameters()
28 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters())
29 | and not any(nd in n for nd in no_decay)
30 | and any(lw in n for lw in high_lr_head)],
31 | 'lr': config['weight_lr'],
32 | 'weight_decay': config['weight_decay']
33 | })
34 | params_lst.append({'params':[p for n, p in model.named_parameters()
35 | if not any(en in n for en,ep in model.base.encoder.layer.named_parameters())
36 | and any(nd in n for nd in no_decay)
37 | and any(lw in n for lw in high_lr_head)],
38 | 'lr': config['weight_lr'],
39 | 'weight_decay': 0.0
40 | })
41 | ###
42 | parts = 3
43 | for i,j in zip(range(layers-1,-1,-int(layers/parts)),range(0,layers,int(layers/parts))):
44 | for k in range(int(layers/parts)):
45 | param_dict1 = {'params': [p for n, p in model.base.encoder.layer[i-k].named_parameters()
46 | if not any(nd in n for nd in no_decay)],
47 | 'weight_decay': config['weight_decay'],
48 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr']
49 | }
50 | param_dict2 = {'params': [p for n, p in model.base.encoder.layer[i-k].named_parameters()
51 | if any(nd in n for nd in no_decay)],
52 | 'weight_decay': 0.0,
53 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr']
54 | }
55 | params_lst.append(param_dict1)
56 | params_lst.append(param_dict2)
57 |
58 | optimizer = AdamW(params_lst, betas = config['betas'])
59 |
60 | return optimizer
61 |
62 | def get_optimizer_robertaMLM(model,config):
63 | layers = len(model.roberta.encoder.layer)
64 | no_decay = ["bias", "LayerNorm.weight"]
65 | high_lr_head = ["layer_weights"]
66 | ### not in high_lr_head
67 | params_lst = [{'params':[p for n, p in model.named_parameters()
68 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters())
69 | and not any(nd in n for nd in no_decay)
70 | and not any(nd in n for nd in high_lr_head)],
71 | 'lr': config['head_lr'],
72 | 'weight_decay': config['weight_decay']
73 | }]
74 | params_lst.append({'params':[p for n, p in model.named_parameters()
75 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters())
76 | and any(nd in n for nd in no_decay)
77 | and not any(nd in n for nd in high_lr_head)],
78 | 'lr': config['head_lr'],
79 | 'weight_decay': 0.0
80 | })
81 | ###
82 | ### in high_lr_head
83 | params_lst.append({'params':[p for n, p in model.named_parameters()
84 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters())
85 | and not any(nd in n for nd in no_decay)
86 | and any(lw in n for lw in high_lr_head)],
87 | 'lr': config['base_lr'],
88 | 'weight_decay': config['weight_decay']
89 | })
90 | params_lst.append({'params':[p for n, p in model.named_parameters()
91 | if not any(en in n for en,ep in model.roberta.encoder.layer.named_parameters())
92 | and any(nd in n for nd in no_decay)
93 | and any(lw in n for lw in high_lr_head)],
94 | 'lr': config['base_lr'],
95 | 'weight_decay': 0.0
96 | })
97 | ###
98 | parts = 3
99 | for i,j in zip(range(layers-1,-1,-int(layers/parts)),range(0,layers,int(layers/parts))):
100 | for k in range(int(layers/parts)):
101 | param_dict1 = {'params': [p for n, p in model.roberta.encoder.layer[i-k].named_parameters()
102 | if not any(nd in n for nd in no_decay)],
103 | 'weight_decay': config['weight_decay'],
104 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr']
105 | }
106 | param_dict2 = {'params': [p for n, p in model.roberta.encoder.layer[i-k].named_parameters()
107 | if any(nd in n for nd in no_decay)],
108 | 'weight_decay': 0.0,
109 | 'lr':pow(config['layerwise_decay_rate'],j)*config['base_lr']
110 | }
111 | params_lst.append(param_dict1)
112 | params_lst.append(param_dict2)
113 |
114 | optimizer = AdamW(params_lst, betas = config['betas'])
115 |
116 | return optimizer
117 |
118 | def get_scheduler(optimizer, total_train_steps, config):
119 | #two schedules:
120 | #1. 'custom': warmup, then a piecewise linear decline from base_lr to min_lr and finally to low_lr
121 | #2. '3stage': the lr is halved after each third of the total steps (1x -> 0.5x -> 0.25x)
122 | def lr_lambda_1(step):
123 | total_steps = total_train_steps
124 | w = int(config['warm_up_ratio']*total_steps)
125 | d1 = int(config['decline_1']*total_steps)
126 | d2 = int(config['decline_2']*total_steps)
127 | d3 = int(config['decline_3']*total_steps)
128 | d4 = int(config['decline_4']*total_steps)
129 | min_vs_base_ratio = config['min_lr']/config['base_lr']
130 | low_vs_base_ratio = config['low_lr']/config['base_lr']
131 | if step <= w:
132 | return step/w
133 | elif step <= d1:
134 | return 1
135 | elif step <= d3:
136 | return max(min_vs_base_ratio,min_vs_base_ratio+(1-min_vs_base_ratio)*(d2-step)/(d2-d1))
137 | else:
138 | return max(low_vs_base_ratio,low_vs_base_ratio+(min_vs_base_ratio-low_vs_base_ratio)*(d4-step)/(d4-d3))
139 | def lr_lambda_2(step):
140 | if step <= total_train_steps * (1/3):
141 | return 1
142 | if step <= total_train_steps * (2/3):
143 | return 0.5
144 | #0.25 for the final third and for any extra steps caused by rounding
145 | return 0.25
146 | if config['lr_type'] == 'custom':
147 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda_1)
148 | elif config['lr_type'] == '3stage':
149 | return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda_2)
150 |
--------------------------------------------------------------------------------
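To see the shape of the 'custom' schedule, here is lr_lambda_1 evaluated at a few steps with the default fine-tuning settings from generate_config in components/util.py (warm_up_ratio 0.06, decline_1..4 = 0.15/0.6/0.7/0.75, base/min/low lr = 7e-5/2e-5/1e-5); 1000 total steps is just an illustrative number:

# breakpoints for 1000 total steps: warmup end and the four decline points
w, d1, d2, d3, d4 = 60, 150, 600, 700, 750
min_r, low_r = 2e-5 / 7e-5, 1e-5 / 7e-5

def lr_lambda_1(step):
    if step <= w:
        return step / w
    elif step <= d1:
        return 1
    elif step <= d3:
        return max(min_r, min_r + (1 - min_r) * (d2 - step) / (d2 - d1))
    else:
        return max(low_r, low_r + (min_r - low_r) * (d4 - step) / (d4 - d3))

for s in (0, 60, 150, 400, 600, 700, 750, 1000):
    print(s, round(lr_lambda_1(s), 3))   # ramps to 1.0, holds, declines to ~0.29, then to ~0.14
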
/components/predict.py:
--------------------------------------------------------------------------------
1 | from components.dataset import CLRPDataset_pred
2 | from components.model import Custom_bert
3 | import numpy as np
4 | import torch
5 | import gc
6 | gc.enable()
7 | from tqdm import tqdm
8 | from transformers import AutoTokenizer
9 | device = "cuda:0"
10 |
11 | def run_fold(fold_num,model_path,data):
12 | if 'roberta' in model_path:
13 | model_dir = './pretrained/roberta-large/'
14 | model_name = f"roberta_large_{fold_num}.pt"
15 | elif 'deberta' in model_path:
16 | model_dir = './pretrained/deberta-large/'
17 | model_name = f"deberta_large_{fold_num}.pt"
18 |
19 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256)
20 | model = Custom_bert(model_dir).to(device)
21 | _ = model.eval()
22 | model.load_state_dict(torch.load(model_path+model_name))
23 |
24 | test_ds = CLRPDataset_pred(data,tokenizer)
25 | test_dl = torch.utils.data.DataLoader(test_ds,
26 | batch_size = 192,
27 | shuffle=False,
28 | pin_memory=True)
29 |
30 | pred = []
31 | with torch.no_grad():
32 | for batch in tqdm(test_dl):
33 | input_ids = batch['input_ids'].to(device)
34 | attention_mask = batch['attention_mask'].to(device)
35 | output = model(input_ids, attention_mask)
36 | pred.extend(output.detach().cpu().numpy())
37 |
38 | del model, test_dl, test_ds
39 | gc.collect()
40 | torch.cuda.empty_cache()
41 |
42 | return np.array(pred)
43 |
44 | def get_single_model(pth,data):
45 | pred0 = run_fold(0,pth,data)
46 | pred1 = run_fold(1,pth,data)
47 | pred2 = run_fold(2,pth,data)
48 | pred3 = run_fold(3,pth,data)
49 | pred4 = run_fold(4,pth,data)
50 |
51 | return [pred0,pred1,pred2,pred3,pred4]
52 |
--------------------------------------------------------------------------------
/components/train.py:
--------------------------------------------------------------------------------
1 | from components.dataset import CLRPDataset_finetune, CLRPDataset_pseudo, CLRPDataset_pseudo_5fold
2 | from components.util import seed_everything, create_folds, generate_config
3 | from components.model import Custom_bert
4 | from components.optimizer import get_optimizer, get_scheduler
5 | import pandas as pd
6 | import torch
7 | import torch.nn as nn
8 | from transformers import AutoTokenizer
9 | from tqdm import tqdm
10 | import numpy as np
11 | import os
12 | import gc
13 | gc.enable()
14 |
15 | def run_fold_ft(fold,config,train_data,tokenizer,t_bar):
16 | device = "cuda:0"
17 | #prep train/val datasets
18 | train_dataset = CLRPDataset_finetune(True, fold,train_data,tokenizer)
19 | val_dataset = CLRPDataset_finetune(False, fold,train_data,tokenizer)
20 |
21 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
22 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True)
23 |
24 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps'])
25 | val_step = 1
26 | min_valid_loss = np.inf
27 |
28 | #load model
29 | model = Custom_bert(config['model_dir']).to(device)
30 | _ = model.eval()
31 |
32 | model.load_state_dict(torch.load(config['pretrained_path']), strict=False)
33 |
34 | #get optimizer and scheduler
35 | optimizer = get_optimizer(model,config)
36 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config)
37 |
38 | step = 0
39 | min_step = 0
40 | last_save_step = 0
41 | last_save_index = 0
42 |
43 | #seed_everything(seed=config['seed_'] + fold)
44 |
45 | optimizer.zero_grad()
46 | for epoch in range(config['num_epoch']):
47 | model.train()
48 | count = 0
49 | total_loss = 0
50 | for batch in train_loader:
51 | input_ids = batch['input_ids'].to(device)
52 | attention_mask = batch['attention_mask'].to(device)
53 | target = batch['target'].to(device)
54 |
55 | outputs = model(input_ids, attention_mask)
56 |
57 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target)
58 |
59 | loss = cls_loss / config['accumulation_steps']
60 |
61 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps']
62 |
63 | loss.backward()
64 |
65 | if (count+1) % config['accumulation_steps'] == 0:
66 | optimizer.step()
67 | lr_scheduler.step()
68 | optimizer.zero_grad()
69 | count = 0
70 | total_loss = 0
71 | else:
72 | count+=1
73 |
74 | #only validate/save within a radius of steps around save_center
75 | if step >= (config['save_center']-config['save_radius']) and step <= (config['save_center']+config['save_radius']):
76 | val_step = 1
77 | do_val = True
78 | if config['only_val_in_radius']:
79 | if step < (config['save_center']-config['save_radius']) or step > (config['save_center']+config['save_radius']):
80 | do_val = False
81 |
82 | if ((step+1) % val_step == 0 and count == 0) and do_val:
83 | model.eval()
84 | l_val = nn.MSELoss(reduction='sum')
85 | with torch.no_grad():
86 | total_loss_val = 0
87 | for batch in val_loader:
88 | input_ids = batch['input_ids'].to(device)
89 | attention_mask = batch['attention_mask'].to(device)
90 | outputs = model(input_ids, attention_mask)
91 |
92 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device))
93 |
94 | val_loss = cls_loss_val
95 |
96 | total_loss_val+=val_loss.item()
97 | total_loss_val/=len(val_dataset)
98 | total_loss_val = total_loss_val**0.5
99 |
100 | if min_valid_loss > total_loss_val and step >= (config['save_center']-config['save_radius']) and step <= (config['save_center']+config['save_radius']):
101 | #saves model with lower loss
102 | min_step = step
103 | min_valid_loss = total_loss_val
104 | #print("min loss updated to ",min_valid_loss," at step ",min_step)
105 | if not os.path.isdir('./models'):
106 | os.mkdir('./models')
107 | if not os.path.isdir(config['save_path']):
108 | os.mkdir(config['save_path'])
109 | if 'roberta' in config['model_dir']:
110 | torch.save(model.state_dict(), config['save_path']+f'roberta_large_{fold}.pt')
111 | else:
112 | torch.save(model.state_dict(), config['save_path']+f'deberta_large_{fold}.pt')
113 | model.train()
114 | step+=1
115 | t_bar.update(1)
116 | del model,train_dataset,train_loader,val_dataset,val_loader
117 | gc.collect()
118 | torch.cuda.empty_cache()
119 | return min_valid_loss, min_step
120 |
121 | def train_ft(config):
122 | seed_everything(config['seed_'])
123 |
124 | train_data = pd.read_csv("./data/train.csv")
125 | train_data = create_folds(train_data, num_splits=5)
126 | model_dir = config['model_dir']
127 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256)
128 |
129 | t_bar = tqdm(total=((2834*0.8//config['batch_size'])+1)*config['num_epoch']*config['n_folds'])
130 | train_losses = []
131 | for i in range(config['n_folds']):
132 | loss, m_step = run_fold_ft(i,config,train_data,tokenizer,t_bar)
133 | train_losses.append(loss)
134 | return train_losses
135 |
136 | def train_pseudo(config, label_path):
137 | device = "cuda:0"
138 | seed_everything(config['seed_'])
139 | train_data = pd.read_csv("./data/train.csv")
140 | train_data = create_folds(train_data, num_splits=5)
141 |
142 | model_dir = config['model_dir']
143 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256)
144 |
145 | train_dataset = CLRPDataset_pseudo(True,label_path,train_data,tokenizer)
146 | t_bar = tqdm(total=((len(train_dataset)//config['batch_size'])+1)*config['num_epoch'])
147 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
148 |
149 | val_dataset = CLRPDataset_pseudo(False,label_path,train_data,tokenizer)
150 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True)
151 |
152 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps'])
153 | val_step = 100*config['accumulation_steps']
154 | min_valid_loss = np.inf
155 |
156 | model = Custom_bert(config['model_dir']).to(device)
157 | _ = model.eval()
158 |
159 | if config['pretrained_path'] not in [None,'None']:
160 | print(model.load_state_dict(torch.load(config['pretrained_path']), strict=False))
161 |
162 | optimizer = get_optimizer(model,config)
163 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config)
164 |
165 | step = 0
166 | min_step = 0
167 | last_save_step = 0
168 | last_save_index = 0
169 |
170 | optimizer.zero_grad()
171 | for epoch in range(config['num_epoch']):
172 | model.train()
173 | count = 0
174 | total_loss = 0
175 | for batch in train_loader:
176 | input_ids = batch['input_ids'].to(device)
177 | attention_mask = batch['attention_mask'].to(device)
178 | target = batch['target'].to(device)
179 | outputs = model(input_ids, attention_mask)
180 |
181 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target)
182 |
183 | loss = cls_loss / config['accumulation_steps']
184 |
185 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps']
186 | loss.backward()
187 |
188 | if (count+1) % config['accumulation_steps'] == 0:
189 | optimizer.step()
190 | lr_scheduler.step()
191 | optimizer.zero_grad()
192 | count = 0
193 | total_loss = 0
194 | else:
195 | count+=1
196 |
197 | if ((step+1) % val_step == 0):
198 | l_val = nn.MSELoss(reduction='sum')
199 | with torch.no_grad():
200 | model.eval()
201 | total_loss_val = 0
202 | for batch in val_loader:
203 | input_ids = batch['input_ids'].to(device)
204 | attention_mask = batch['attention_mask'].to(device)
205 | outputs = model(input_ids, attention_mask)
206 |
207 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device))
208 |
209 | val_loss = cls_loss_val
210 |
211 | total_loss_val+=val_loss.item()
212 | total_loss_val/=len(val_dataset)
213 | total_loss_val = total_loss_val**0.5
214 |
215 | if min_valid_loss > total_loss_val:
216 | min_step = step
217 | min_valid_loss = total_loss_val
218 | #print("min loss updated to ",min_valid_loss," at step ",min_step)
219 | # Saving State Dict
220 | if not os.path.isdir(config['save_path']):
221 | os.mkdir(config['save_path'])
222 | torch.save(model.state_dict(), config['save_path'] + config['pseudo_save_name'])
223 | model.train()
224 | step+=1
225 | t_bar.update(1)
226 | del model,train_dataset,train_loader
227 | gc.collect()
228 | torch.cuda.empty_cache()
229 |
230 | return min_valid_loss
231 |
232 | def train_pseudo_5fold(config, label_path):
233 | device = "cuda:0"
234 | seed_everything(config['seed_'])
235 |
236 | train_data = pd.read_csv("./data/train.csv")
237 | train_data = create_folds(train_data, num_splits=5)
238 | model_dir = config['model_dir']
239 | tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True, model_max_length=256)
240 |
241 | min_val_losses = []
242 | for fold in range(config['n_folds']):
243 | train_dataset = CLRPDataset_pseudo_5fold(True,fold,train_data,tokenizer,label_path)
244 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
245 |
246 | val_dataset = CLRPDataset_pseudo_5fold(False,fold,train_data,tokenizer,label_path)
247 | val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True)
248 |
249 | if fold == 0:
250 | t_bar = tqdm(total=((len(train_dataset)*5//config['batch_size'])+1)*config['num_epoch'])
251 |
252 | total_train_steps = int(len(train_loader) * config['num_epoch'] / config['accumulation_steps'])
253 | val_step = 100*config['accumulation_steps']
254 | min_valid_loss = np.inf
255 |
256 | model = Custom_bert(config['model_dir']).to(device)
257 | _ = model.eval()
258 |
259 | if config['pretrained_path'] not in [None,'None']:
260 | model.load_state_dict(torch.load(config['pretrained_path']), strict=False)
261 |
262 | optimizer = get_optimizer(model,config)
263 | lr_scheduler = get_scheduler(optimizer,total_train_steps,config)
264 |
265 | step = 0
266 | min_step = 0
267 | last_save_step = 0
268 | last_save_index = 0
269 |
270 | optimizer.zero_grad()
271 | for epoch in range(config['num_epoch']):
272 | model.train()
273 | count = 0
274 | total_loss = 0
275 | for batch in train_loader:
276 | input_ids = batch['input_ids'].to(device)
277 | attention_mask = batch['attention_mask'].to(device)
278 | target = batch['target'].to(device)
279 |
280 | outputs = model(input_ids, attention_mask)
281 |
282 | cls_loss = nn.MSELoss()(torch.squeeze(outputs,1),target)
283 |
284 | loss = cls_loss / config['accumulation_steps']
285 |
286 | total_loss+=torch.pow(nn.MSELoss()(torch.squeeze(outputs,1),target),0.5).item() / config['accumulation_steps']
287 | loss.backward()
288 |
289 | if (count+1) % config['accumulation_steps'] == 0:
290 | optimizer.step()
291 | lr_scheduler.step()
292 | optimizer.zero_grad()
293 | count = 0
294 | total_loss = 0
295 | else:
296 | count+=1
297 |
298 | if ((step+1) % val_step == 0):
299 | model.eval()
300 | l_val = nn.MSELoss(reduction='sum')
301 | with torch.no_grad():
302 | total_loss_val = 0
303 | for batch in val_loader:
304 | input_ids = batch['input_ids'].to(device)
305 | attention_mask = batch['attention_mask'].to(device)
306 | outputs = model(input_ids, attention_mask)
307 |
308 | cls_loss_val = l_val(torch.squeeze(outputs),batch['target'].to(device))
309 |
310 | val_loss = cls_loss_val
311 |
312 | total_loss_val+=val_loss.item()
313 | total_loss_val/=len(val_dataset)
314 | total_loss_val = total_loss_val**0.5
315 |
316 | if min_valid_loss > total_loss_val and epoch > 0:
317 | min_step = step
318 | min_valid_loss = total_loss_val
319 | if not os.path.isdir('./models'):
320 | os.mkdir('./models')
321 | if not os.path.isdir(config['save_path']):
322 | os.mkdir(config['save_path'])
323 | if 'roberta' in config['model_dir']:
324 | torch.save(model.state_dict(), config['save_path']+f'roberta_large_{fold}.pt')
325 | else:
326 | torch.save(model.state_dict(), config['save_path']+f'deberta_large_{fold}.pt')
327 | model.train()
328 | step+=1
329 | t_bar.update(1)
330 | del model,train_dataset,train_loader,val_dataset,val_loader
331 | gc.collect()
332 | torch.cuda.empty_cache()
333 | min_val_losses.append(min_valid_loss)
334 | return min_val_losses
335 |
--------------------------------------------------------------------------------
/components/util.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import os
4 | import torch
5 | import pandas as pd
6 | from sklearn.model_selection import KFold,StratifiedKFold
7 |
8 | def seed_everything(seed):
9 | random.seed(seed)
10 | np.random.seed(seed)
11 | os.environ["PYTHONHASHSEED"] = str(seed)
12 |
13 | torch.manual_seed(seed)
14 | torch.cuda.manual_seed(seed)
15 | torch.cuda.manual_seed_all(seed)
16 |
17 | torch.backends.cudnn.deterministic = True
18 |
19 | def create_folds(data, num_splits):
20 | # we create a new column called kfold and fill it with -1
21 | data["kfold"] = -1
22 |
23 | # the next step is to randomize the rows of the data
24 | data = data.sample(frac=1).reset_index(drop=True)
25 |
26 | # calculate number of bins by Sturges' rule
27 | # I take the floor of the value, you can also
28 | # just round it
29 | num_bins = int(np.floor(1 + np.log2(len(data))))
30 |
31 | # bin targets
32 | data.loc[:, "bins"] = pd.cut(
33 | data["target"], bins=num_bins, labels=False
34 | )
35 |
36 | # initiate the kfold class from model_selection module
37 | kf = StratifiedKFold(n_splits=num_splits)
38 |
39 | # fill the new kfold column
40 | # note that, instead of targets, we use bins!
41 | for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
42 | data.loc[v_, 'kfold'] = f
43 |
44 | # drop the bins column
45 | data = data.drop("bins", axis=1)
46 |
47 | # return dataframe with folds
48 | return data
49 |
50 | def init_params(module_lst):
51 | for module in module_lst:
52 | for param in module.parameters():
53 | if param.dim() > 1:
54 | torch.nn.init.xavier_uniform_(param)
55 | return
56 |
57 | def generate_config(model_type,pretrained_path,save_path,lr_type,lr_setting):
58 | config = {'model_dir': './pretrained/roberta-large/',
59 | 'n_folds': 5,
60 | 'num_epoch': 3,
61 | 'weight_decay': 0.01,
62 | 'head_lr': 1e-4,
63 | 'weight_lr': 5e-2,
64 | 'base_lr': 7e-5,
65 | 'min_lr': 2e-5,
66 | 'low_lr': 1e-5,
67 | 'warm_up_ratio': 0.06,
68 | 'decline_1': 0.15,
69 | 'decline_2': 0.6,
70 | 'decline_3': 0.7,
71 | 'decline_4': 0.75,
72 | 'layerwise_decay_rate': 0.875**0.5,
73 | 'seed_': 88888888,
74 | 'reini_head':False,
75 | 'only_val_in_radius': True,
76 | 'save_center': 330,
77 | 'save_radius': 5,
78 | 'betas': (0.9, 0.999),
79 | }
80 | config['pretrained_path'] = pretrained_path
81 | config['save_path'] = save_path
82 | config['lr_type'] = lr_type
83 | if model_type == 'ro':
84 | config['model_dir'] = './pretrained/roberta-large/'
85 | config['batch_size'] = 16
86 | config['accumulation_steps'] = 1
87 | config['pseudo_save_name'] = 'roberta_large_single.pt'
88 | elif model_type == 'de':
89 | config['model_dir'] = './pretrained/deberta-large/'
90 | config['batch_size'] = 8
91 | config['accumulation_steps'] = 2
92 | config['save_center'] = 660
93 | config['save_radius'] = 10
94 | config['pseudo_save_name'] = 'deberta_large_single.pt'
95 |
96 | if lr_setting == '2':
97 | config['num_epoch'] = 2
98 | config['head_lr']= 1e-5
99 | config['weight_lr']= 5e-3
100 | config['base_lr']= 7e-6
101 | config['min_lr']= 2e-6
102 | config['low_lr']= 1e-6
103 | elif lr_setting == '3':
104 | config['head_lr']= 5e-5
105 | config['weight_lr']= 2e-3
106 | config['base_lr']= 3e-5
107 | config['min_lr']= 1e-5
108 | config['low_lr']= 5e-6
109 |
110 | return config
111 |
--------------------------------------------------------------------------------
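create_folds above bins the regression targets with Sturges' rule before stratifying. A quick check of the bin count for the CLRP training set (2834 excerpts, the size hard-coded in components/train.py):

import numpy as np

num_bins = int(np.floor(1 + np.log2(2834)))
print(num_bins)  # 12 -> targets are cut into 12 bins, then StratifiedKFold splits on them
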
/run_train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | python3.9 0.prepare_data.py
3 | python3.9 1.roberta_pretrain.py
4 | python3.9 2.finetune.py ro ./models/roberta_large_pretrain.pt ./models/roberta_1/ custom 1
5 |
6 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_1/ 1 1 ./models/roberta_1/
7 | python3.9 3.pseudo_train.py de ./extra_data/pseudo_1/ ./models/deberta_1/ 1
8 |
9 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_2/ 2 1 ./models/roberta_1/ ./models/deberta_1/
10 | python3.9 3.pseudo_train.py de ./extra_data/pseudo_2/ ./models/deberta_2/ 0
11 | python3.9 2.finetune.py de ./models/deberta_2/deberta_large_single.pt ./models/deberta_2/ 3stage 3
12 | python3.9 4.predict.py ./extra_data/extra_excerpt.csv ./extra_data/pseudo_3/ 3 1 ./models/roberta_1/ ./models/deberta_1/ ./models/deberta_2/
13 | python3.9 3.pseudo_train.py ro ./extra_data/pseudo_3/ ./models/roberta_2/ 0
14 | python3.9 2.finetune.py ro ./models/roberta_2/roberta_large_single.pt ./models/roberta_2/ 3stage 3
15 |
--------------------------------------------------------------------------------