├── __pycache__
│   └── model.cpython-37.pyc
├── model.py
├── convert_data.py
├── README.md
├── pred.py
└── train_pytorch.py

/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bino282/bert4news/HEAD/__pycache__/model.cpython-37.pyc

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel
from torch.nn import CrossEntropyLoss, MSELoss


class BertClassification(BertPreTrainedModel):
    def __init__(self, config):
        super(BertClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The head sees the concatenated [CLS] vectors of the last four encoder
        # layers, hence 4 * hidden_size input features.
        self.classifier = nn.Linear(4 * config.hidden_size, self.config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        # pooled_output = outputs[1]

        # Concatenate the [CLS] representation of the last four hidden layers.
        # outputs[2] holds the per-layer hidden states, so the model must be
        # built with output_hidden_states=True.
        pooled_output = torch.cat((outputs[2][-1][:, 0, ...],
                                   outputs[2][-2][:, 0, ...],
                                   outputs[2][-3][:, 0, ...],
                                   outputs[2][-4][:, 0, ...]), -1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attentions if they are here

        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

--------------------------------------------------------------------------------
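Note: the classification head above indexes outputs[2], i.e. the per-layer hidden states, so it only works when the backbone is configured to return them. A minimal instantiation sketch, not part of model.py itself; it assumes the pretrained weights are available under the Hugging Face model id used in the README below and a transformers version compatible with the tuple-style outputs used in forward():

from transformers import BertConfig
from model import BertClassification

# output_hidden_states=True is required because BertClassification.forward() reads outputs[2]
config = BertConfig.from_pretrained("NlpHUST/vibert4news-base-cased",
                                    num_labels=2,
                                    output_hidden_states=True)
model = BertClassification.from_pretrained("NlpHUST/vibert4news-base-cased", config=config)
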
f.write("\n".join(data)) 39 | 40 | ### Cleaning test file 41 | 42 | test = open("./raw/test.crash","r",encoding="utf-8").readlines() 43 | id_locations = [] 44 | for idx, line in tqdm(enumerate(test)): 45 | line = line.strip() 46 | if line.startswith("test_"): 47 | id_locations.append(idx) 48 | data = [] 49 | 50 | for i, id_loc in tqdm(enumerate(id_locations)): 51 | if i >= len(id_locations) - 1: 52 | end = len(test) 53 | else: 54 | end = id_locations[i + 1] 55 | line_id = test[id_loc].strip() 56 | text = re.sub('\s+', ' ', ' '.join(test[id_loc + 1:end])).strip()[1:-1].strip() 57 | data.append(f"{line_id}\t{text}") 58 | 59 | with open("./data/test.csv", "w",encoding="utf-8") as f: 60 | f.write("id\ttext\n") 61 | f.write("\n".join(data)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BERT for Vietnamese is trained on more 20 GB news dataset 2 | 3 | Apply for task sentiment analysis on using [AIViVN's comments dataset](https://www.aivivn.com/contests/6) 4 | 5 | The model achieved 0.90268 on the public leaderboard, (winner's score is 0.90087) 6 | Bert4news is used for a toolkit Vietnames(segmentation and Named Entity Recognition) at ViNLPtoolkit(https://github.com/bino282/ViNLP) 7 | 8 | ***************New Mar 11 , 2020 *************** 9 | 10 | **[BERT](https://github.com/google-research/bert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). 11 | 12 | We use word sentencepiece, use basic bert tokenization and same config with bert base with lowercase = False. 13 | 14 | You can download trained model: 15 | - [tensorflow](https://drive.google.com/drive/folders/1d_MFVi32YRZTBHDNQahAGyqlzetS8XLU?usp=sharing). 16 | - [pytorch](https://drive.google.com/drive/folders/1-vAaQTdaLwMk2rEyTXOsTZAfZ7MTlyIQ?usp=sharing). 17 | 18 | Use with huggingface/transformers 19 | ``` bash 20 | import torch 21 | from transformers import AutoTokenizer,AutoModel 22 | tokenizer= AutoTokenizer.from_pretrained("NlpHUST/vibert4news-base-cased") 23 | bert_model = AutoModel.from_pretrained("NlpHUST/vibert4news-base-cased") 24 | 25 | line = "Tôi là sinh viên trường Bách Khoa Hà Nội ." 26 | input_id = tokenizer.encode(line,add_special_tokens = True) 27 | att_mask = [int(token_id > 0) for token_id in input_id] 28 | input_ids = torch.tensor([input_id]) 29 | att_masks = torch.tensor([att_mask]) 30 | with torch.no_grad(): 31 | features = bert_model(input_ids,att_masks) 32 | 33 | print(features) 34 | 35 | ``` 36 | 37 | Run training with base config 38 | 39 | ``` bash 40 | 41 | python train_pytorch.py \ 42 | --model_path=bert4news.pytorch \ 43 | --max_len=200 \ 44 | --batch_size=16 \ 45 | --epochs=6 \ 46 | --lr=2e-5 47 | 48 | ``` 49 | 50 | ### Contact information 51 | For personal communication related to this project, please contact Nha Nguyen Van (nha282@gmail.com). 
Run training with the base config:

```bash
python train_pytorch.py \
    --model_path=bert4news.pytorch \
    --max_len=200 \
    --batch_size=16 \
    --epochs=6 \
    --lr=2e-5
```
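Each training epoch saves a checkpoint under `./model_save`. Prediction on the test set can then be run with `pred.py`, which loads every checkpoint found under `--model_path`, averages their softmax outputs, and writes `submission.csv`. A sketch of the invocation with the script's default arguments (note that the tokenizer path inside `pred.py` is hard-coded and may need to point at the downloaded pretrained model):

```bash
python pred.py \
    --model_path=./model_save \
    --max_len=200 \
    --batch_size=16
```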
### Contact information
For personal communication related to this project, please contact Nha Nguyen Van (nha282@gmail.com).

--------------------------------------------------------------------------------
/pred.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='./model_save')
parser.add_argument('--max_len', default=200, type=int)
parser.add_argument('--batch_size', default=16, type=int)
args = parser.parse_args()

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Path of the pretrained bert4news model used for the tokenizer (hard-coded;
# adjust it to wherever the pretrained model was downloaded).
MODEL_PATH = '../local/bert_vi/bert4news.pytorch'

# Load the dataset into a pandas dataframe.
df = pd.read_csv("./data/test.csv", sep="\t")

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists
id_test = df.id.values.tolist()
sentences = df.text.values

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=False)

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))


# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # Guard against empty or non-string texts: fall back to an empty string.
    try:
        if len(sent) == 0:
            sent = ''
            print(sent)
    except:
        sent = ''
        print(sent)
    encoded_sent = tokenizer.encode(
        sent,                     # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    )

    input_ids.append(encoded_sent)

from keras.preprocessing.sequence import pad_sequences

# Pad our input tokens
MAX_LEN = args.max_len
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids, dtype=torch.long)
prediction_masks = torch.tensor(attention_masks, dtype=torch.long)

# Set the batch size.
batch_size = args.batch_size

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Load models: every checkpoint directory found under --model_path is loaded
# and its softmax outputs are averaged (a simple ensemble over checkpoints).
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch.nn.functional as F
import os
MODEL_PATH = args.model_path

list_model = os.listdir(MODEL_PATH)
list_predictions = []
for model_name in list_model:
    model = BertForSequenceClassification.from_pretrained(
        os.path.join(MODEL_PATH, model_name),
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False
    )
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    # Tracking variables
    predictions = []
    # Predict
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]
        logits = F.softmax(logits, dim=1)
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    flat_predictions = [item for sublist in predictions for item in sublist]
    list_predictions.append(flat_predictions)

list_predictions = np.asarray(list_predictions)

# Average the class probabilities over all loaded checkpoints.
list_predictions = np.mean(list_predictions, axis=0)

fw = open("submission.csv", "w", encoding="utf-8")
fw.write("id,label")
fw.write("\n")
flat_predictions = np.argmax(list_predictions, axis=1).flatten().tolist()
for i in range(len(id_test)):
    fw.write(",".join([id_test[i], str(flat_predictions[i])]))
    fw.write('\n')
fw.close()

--------------------------------------------------------------------------------
/train_pytorch.py:
--------------------------------------------------------------------------------
from transformers import BertTokenizer
import pandas as pd
import torch
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='bert4news.pytorch')
parser.add_argument('--max_len', default=200, type=int)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--epochs', default=6, type=int)
parser.add_argument('--lr', default=2e-5, type=float)
args = parser.parse_args()

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

MODEL_PATH = args.model_path
MAX_LEN = args.max_len
batch_size = args.batch_size
epochs = args.epochs
lr = args.lr


df = pd.read_csv("./data/all.csv", sep="\t")
sentences = df.text.values
labels = df.label.values

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=False)
print(' Original: ', sentences[0])
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
input_ids = []

# For every sentence...
for sent in sentences:
    encoded_sent = tokenizer.encode(
        sent,                     # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        # return_tensors='pt',    # Return pytorch tensors.
    )
    input_ids.append(encoded_sent)
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

from keras.preprocessing.sequence import pad_sequences

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")
print(input_ids)

print('\nDone.')

# Create attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:

    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                                                    random_state=42, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                                       random_state=42, test_size=0.1)

# Convert all inputs and labels into torch tensors, the required datatype
# for our model.
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
validation_inputs = torch.tensor(validation_inputs, dtype=torch.long)

train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)

train_masks = torch.tensor(train_masks, dtype=torch.long)
validation_masks = torch.tensor(validation_masks, dtype=torch.long)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

from transformers import BertForSequenceClassification, AdamW, BertConfig
from model import BertClassification  # custom 4-layer [CLS] head (not used below; the stock head is used instead)

model = BertForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=True,
)

# Tell pytorch to run this model on the GPU.
if torch.cuda.is_available():
    model.cuda()

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

optimizer = AdamW(model.parameters(),
                  lr=lr,
                  eps=1e-8
                  )

from transformers import get_linear_schedule_with_warmup

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

import numpy as np
from sklearn.metrics import f1_score

# Function to calculate the F1 score of our predictions vs labels
# (sklearn's f1_score expects (y_true, y_pred)).
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

import random

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        inputs_embeds=None,
                        labels=b_labels)

        loss = outputs[0]

        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the F1 score for this batch of validation sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the total score.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final F1 score for this validation run.
    print("  F1 score: {0:.2f}".format(eval_accuracy / nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    import os

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()

    output_dir = os.path.join('./model_save', 'checkpoint-{}-{}'.format(lr, eval_accuracy / nb_eval_steps))

    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

print("")
print("Training complete!")

--------------------------------------------------------------------------------