├── __pycache__
│   └── model.cpython-37.pyc
├── model.py
├── convert_data.py
├── README.md
├── pred.py
└── train_pytorch.py

/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bino282/bert4news/HEAD/__pycache__/model.cpython-37.pyc

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel
from torch.nn import CrossEntropyLoss, MSELoss


class BertClassification(BertPreTrainedModel):
    def __init__(self, config):
        super(BertClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The head sees the concatenated [CLS] vectors of the last four encoder
        # layers, hence 4 * hidden_size input features.
        self.classifier = nn.Linear(4 * config.hidden_size, self.config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        # pooled_output = outputs[1]

        # Concatenate the [CLS] representation of the last four hidden layers.
        # outputs[2] holds the per-layer hidden states, so the model must be
        # built with output_hidden_states=True.
        pooled_output = torch.cat((outputs[2][-1][:, 0, ...],
                                   outputs[2][-2][:, 0, ...],
                                   outputs[2][-3][:, 0, ...],
                                   outputs[2][-4][:, 0, ...]), -1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attentions if they are here

        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

--------------------------------------------------------------------------------
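Note: the classification head above indexes outputs[2], i.e. the per-layer hidden states, so it only works when the backbone is configured to return them. A minimal instantiation sketch, not part of model.py itself; it assumes the pretrained weights are available under the Hugging Face model id used in the README below and a transformers version compatible with the tuple-style outputs used in forward():

from transformers import BertConfig
from model import BertClassification

# output_hidden_states=True is required because BertClassification.forward() reads outputs[2]
config = BertConfig.from_pretrained("NlpHUST/vibert4news-base-cased",
                                    num_labels=2,
                                    output_hidden_states=True)
model = BertClassification.from_pretrained("NlpHUST/vibert4news-base-cased", config=config)
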
f.write("\n".join(data)) 39 | 40 | ### Cleaning test file 41 | 42 | test = open("./raw/test.crash","r",encoding="utf-8").readlines() 43 | id_locations = [] 44 | for idx, line in tqdm(enumerate(test)): 45 | line = line.strip() 46 | if line.startswith("test_"): 47 | id_locations.append(idx) 48 | data = [] 49 | 50 | for i, id_loc in tqdm(enumerate(id_locations)): 51 | if i >= len(id_locations) - 1: 52 | end = len(test) 53 | else: 54 | end = id_locations[i + 1] 55 | line_id = test[id_loc].strip() 56 | text = re.sub('\s+', ' ', ' '.join(test[id_loc + 1:end])).strip()[1:-1].strip() 57 | data.append(f"{line_id}\t{text}") 58 | 59 | with open("./data/test.csv", "w",encoding="utf-8") as f: 60 | f.write("id\ttext\n") 61 | f.write("\n".join(data)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BERT for Vietnamese is trained on more 20 GB news dataset 2 | 3 | Apply for task sentiment analysis on using [AIViVN's comments dataset](https://www.aivivn.com/contests/6) 4 | 5 | The model achieved 0.90268 on the public leaderboard, (winner's score is 0.90087) 6 | Bert4news is used for a toolkit Vietnames(segmentation and Named Entity Recognition) at ViNLPtoolkit(https://github.com/bino282/ViNLP) 7 | 8 | ***************New Mar 11 , 2020 *************** 9 | 10 | **[BERT](https://github.com/google-research/bert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). 11 | 12 | We use word sentencepiece, use basic bert tokenization and same config with bert base with lowercase = False. 13 | 14 | You can download trained model: 15 | - [tensorflow](https://drive.google.com/drive/folders/1d_MFVi32YRZTBHDNQahAGyqlzetS8XLU?usp=sharing). 16 | - [pytorch](https://drive.google.com/drive/folders/1-vAaQTdaLwMk2rEyTXOsTZAfZ7MTlyIQ?usp=sharing). 17 | 18 | Use with huggingface/transformers 19 | ``` bash 20 | import torch 21 | from transformers import AutoTokenizer,AutoModel 22 | tokenizer= AutoTokenizer.from_pretrained("NlpHUST/vibert4news-base-cased") 23 | bert_model = AutoModel.from_pretrained("NlpHUST/vibert4news-base-cased") 24 | 25 | line = "Tôi là sinh viên trường Bách Khoa Hà Nội ." 26 | input_id = tokenizer.encode(line,add_special_tokens = True) 27 | att_mask = [int(token_id > 0) for token_id in input_id] 28 | input_ids = torch.tensor([input_id]) 29 | att_masks = torch.tensor([att_mask]) 30 | with torch.no_grad(): 31 | features = bert_model(input_ids,att_masks) 32 | 33 | print(features) 34 | 35 | ``` 36 | 37 | Run training with base config 38 | 39 | ``` bash 40 | 41 | python train_pytorch.py \ 42 | --model_path=bert4news.pytorch \ 43 | --max_len=200 \ 44 | --batch_size=16 \ 45 | --epochs=6 \ 46 | --lr=2e-5 47 | 48 | ``` 49 | 50 | ### Contact information 51 | For personal communication related to this project, please contact Nha Nguyen Van (nha282@gmail.com). 
Run training with the base config:

```bash
python train_pytorch.py \
    --model_path=bert4news.pytorch \
    --max_len=200 \
    --batch_size=16 \
    --epochs=6 \
    --lr=2e-5
```
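Each training epoch saves a checkpoint under `./model_save`. Prediction on the test set can then be run with `pred.py`, which loads every checkpoint found under `--model_path`, averages their softmax outputs, and writes `submission.csv`. A sketch of the invocation with the script's default arguments (note that the tokenizer path inside `pred.py` is hard-coded and may need to point at the downloaded pretrained model):

```bash
python pred.py \
    --model_path=./model_save \
    --max_len=200 \
    --batch_size=16
```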
### Contact information
For personal communication related to this project, please contact Nha Nguyen Van (nha282@gmail.com).

--------------------------------------------------------------------------------
/pred.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='./model_save')
parser.add_argument('--max_len', default=200, type=int)
parser.add_argument('--batch_size', default=16, type=int)
args = parser.parse_args()

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Path of the pretrained bert4news model used for the tokenizer (hard-coded;
# adjust it to wherever the pretrained model was downloaded).
MODEL_PATH = '../local/bert_vi/bert4news.pytorch'

# Load the dataset into a pandas dataframe.
df = pd.read_csv("./data/test.csv", sep="\t")

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists
id_test = df.id.values.tolist()
sentences = df.text.values

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=False)

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))


# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # Guard against empty or non-string texts: fall back to an empty string.
    try:
        if len(sent) == 0:
            sent = ''
            print(sent)
    except:
        sent = ''
        print(sent)
    encoded_sent = tokenizer.encode(
        sent,                     # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    )

    input_ids.append(encoded_sent)

from keras.preprocessing.sequence import pad_sequences

# Pad our input tokens
MAX_LEN = args.max_len
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids, dtype=torch.long)
prediction_masks = torch.tensor(attention_masks, dtype=torch.long)

# Set the batch size.
batch_size = args.batch_size

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Load models: every checkpoint directory found under --model_path is loaded
# and its softmax outputs are averaged (a simple ensemble over checkpoints).
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch.nn.functional as F
import os
MODEL_PATH = args.model_path

list_model = os.listdir(MODEL_PATH)
list_predictions = []
for model_name in list_model:
    model = BertForSequenceClassification.from_pretrained(
        os.path.join(MODEL_PATH, model_name),
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False
    )
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    # Tracking variables
    predictions = []
    # Predict
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]
        logits = F.softmax(logits, dim=1)
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    flat_predictions = [item for sublist in predictions for item in sublist]
    list_predictions.append(flat_predictions)

list_predictions = np.asarray(list_predictions)

# Average the class probabilities over all loaded checkpoints.
list_predictions = np.mean(list_predictions, axis=0)

fw = open("submission.csv", "w", encoding="utf-8")
fw.write("id,label")
fw.write("\n")
flat_predictions = np.argmax(list_predictions, axis=1).flatten().tolist()
for i in range(len(id_test)):
    fw.write(",".join([id_test[i], str(flat_predictions[i])]))
    fw.write('\n')
fw.close()

--------------------------------------------------------------------------------
/train_pytorch.py:
--------------------------------------------------------------------------------
from transformers import BertTokenizer
import pandas as pd
import torch
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='bert4news.pytorch')
parser.add_argument('--max_len', default=200, type=int)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--epochs', default=6, type=int)
parser.add_argument('--lr', default=2e-5, type=float)
args = parser.parse_args()

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

MODEL_PATH = args.model_path
MAX_LEN = args.max_len
batch_size = args.batch_size
epochs = args.epochs
lr = args.lr


df = pd.read_csv("./data/all.csv", sep="\t")
sentences = df.text.values
labels = df.label.values

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=False)
print(' Original: ', sentences[0])
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
input_ids = []

# For every sentence...
for sent in sentences:
    encoded_sent = tokenizer.encode(
        sent,                     # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        # return_tensors='pt',    # Return pytorch tensors.
    )
    input_ids.append(encoded_sent)
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

from keras.preprocessing.sequence import pad_sequences

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")
print(input_ids)

print('\nDone.')

# Create attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:

    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                                                    random_state=42, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                                       random_state=42, test_size=0.1)

# Convert all inputs and labels into torch tensors, the required datatype
# for our model.
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
validation_inputs = torch.tensor(validation_inputs, dtype=torch.long)

train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)

train_masks = torch.tensor(train_masks, dtype=torch.long)
validation_masks = torch.tensor(validation_masks, dtype=torch.long)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

from transformers import BertForSequenceClassification, AdamW, BertConfig
from model import BertClassification  # custom 4-layer [CLS] head (not used below; the stock head is used instead)

model = BertForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=True,
)

# Tell pytorch to run this model on the GPU.
if torch.cuda.is_available():
    model.cuda()

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

optimizer = AdamW(model.parameters(),
                  lr=lr,
                  eps=1e-8
                  )

from transformers import get_linear_schedule_with_warmup

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

import numpy as np
from sklearn.metrics import f1_score

# Function to calculate the F1 score of our predictions vs labels
# (sklearn's f1_score expects (y_true, y_pred)).
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

import random

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        inputs_embeds=None,
                        labels=b_labels)

        loss = outputs[0]

        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the F1 score for this batch of validation sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total score.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final F1 score for this validation run.
    print("  F1 score: {0:.2f}".format(eval_accuracy / nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    import os

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()

    output_dir = os.path.join('./model_save', 'checkpoint-{}-{}'.format(lr, eval_accuracy / nb_eval_steps))

    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

print("")
print("Training complete!")

--------------------------------------------------------------------------------