├── figures ├── workflow.png ├── QueryRewriter.png └── CodeSelector-Workflow.png ├── QueryRewriter ├── train │ ├── utils.py │ ├── prepare_data.py │ ├── ParaphraseDataset.py │ ├── train.py │ └── T5FineTuner.py └── eval │ ├── QueryRewriter_model.py │ └── evaluation.py ├── CodeSelector ├── eval │ ├── utils.py │ ├── Bert_MLP.py │ ├── CodeSelector_model.py │ └── evaluation.py └── train │ ├── utils.py │ ├── Bert_MLP.py │ ├── data_prepare.py │ └── train.py └── README.md /figures/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Que2Code/HEAD/figures/workflow.png -------------------------------------------------------------------------------- /figures/QueryRewriter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Que2Code/HEAD/figures/QueryRewriter.png -------------------------------------------------------------------------------- /figures/CodeSelector-Workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Que2Code/HEAD/figures/CodeSelector-Workflow.png -------------------------------------------------------------------------------- /QueryRewriter/train/utils.py: -------------------------------------------------------------------------------- 1 | # import packages 2 | import argparse 3 | import glob 4 | import os 5 | import json 6 | import time 7 | import logging 8 | import random 9 | import re 10 | from itertools import chain 11 | from string import punctuation 12 | 13 | import nltk 14 | nltk.download('punkt') 15 | from nltk.tokenize import sent_tokenize 16 | 17 | import pandas as pd 18 | import numpy as np 19 | import torch 20 | from torch.utils.data import Dataset, DataLoader 21 | import pytorch_lightning as pl 22 | 23 | from transformers import ( 24 | AdamW, 25 | T5ForConditionalGeneration, 26 | T5Tokenizer, 27 | get_linear_schedule_with_warmup 28 | ) 29 | 30 | # set a seed 31 | def set_seed(seed): 32 | random.seed(seed) 33 | np.random.seed(seed) 34 | torch.manual_seed(seed) 35 | set_seed(42) 36 | 37 | 38 | -------------------------------------------------------------------------------- /CodeSelector/eval/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import time 6 | import datetime 7 | import random 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.model_selection import cross_val_score 12 | import torch 13 | import transformers as ppb 14 | import warnings 15 | from transformers import BertTokenizer 16 | from transformers import BertModel 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | from transformers import AutoTokenizer 20 | from sklearn.model_selection import train_test_split 21 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 22 | from transformers import get_linear_schedule_with_warmup 23 | from transformers import AdamW, BertConfig 24 | warnings.filterwarnings('ignore') 25 | 26 | def flat_accuracy(preds, labels): 27 | pred_flat = preds.flatten() 28 | labels_flat = labels.flatten() 29 | return np.sum(pred_flat == labels_flat) / len(labels_flat) 30 | 31 | def format_time(elapsed): 32 | ''' 33 | Takes a time in seconds 
and returns a string hh:mm:ss 34 | ''' 35 | # round to the nearest second 36 | elapsed_rounded = int(round(elapsed)) 37 | # format as hh:mm:ss 38 | return str(datetime.timedelta(seconds=elapsed_rounded)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /CodeSelector/train/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import time 6 | import datetime 7 | import random 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.model_selection import cross_val_score 12 | import torch 13 | import transformers as ppb 14 | import warnings 15 | from transformers import BertTokenizer 16 | from transformers import BertModel 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | from transformers import AutoTokenizer 20 | from sklearn.model_selection import train_test_split 21 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 22 | from transformers import get_linear_schedule_with_warmup 23 | from transformers import AdamW, BertConfig 24 | warnings.filterwarnings('ignore') 25 | 26 | def flat_accuracy(preds, labels): 27 | pred_flat = preds.flatten() 28 | labels_flat = labels.flatten() 29 | return np.sum(pred_flat == labels_flat) / len(labels_flat) 30 | 31 | def format_time(elapsed): 32 | ''' 33 | Takes a time in seconds and returns a string hh:mm:ss 34 | ''' 35 | # round to the nearest second 36 | elapsed_rounded = int(round(elapsed)) 37 | # format as hh:mm:ss 38 | return str(datetime.timedelta(seconds=elapsed_rounded)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /QueryRewriter/train/prepare_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | 4 | train_question1 = [] 5 | train_question2 = [] 6 | 7 | # Make train data 8 | with open('./duplicate_questions.train', 'r') as fin: 9 | for line in fin: 10 | src_qid, src_qtitle, tgt_qid, tgt_qtitle = line.strip().split('\t') 11 | duplicate_question = src_qtitle.lower() 12 | master_question = tgt_qtitle.lower() 13 | train_question1.append(duplicate_question) 14 | train_question2.append(master_question) 15 | # break 16 | 17 | assert len(train_question1) == len(train_question2) 18 | train_df = pd.DataFrame({'question1':train_question1, 'question2':train_question2}) 19 | train_df.to_csv('./so_train.csv', index=False, sep="\t") 20 | 21 | 22 | val_question1 = [] 23 | val_question2 = [] 24 | 25 | # Make test data 26 | with open('./duplicate_questions.val', 'r') as fin: 27 | for line in fin: 28 | src_qid, src_qtitle, tgt_qid, tgt_qtitle = line.strip().split('\t') 29 | duplicate_question = src_qtitle.lower() 30 | master_question = tgt_qtitle.lower() 31 | val_question1.append(duplicate_question) 32 | val_question2.append(master_question) 33 | # break 34 | 35 | assert len(val_question1) == len(val_question2) 36 | val_df = pd.DataFrame({'question1':val_question1, 'question2':val_question2}) 37 | val_df.to_csv('./so_val.csv', index=False, sep="\t") 38 | 39 | 40 | -------------------------------------------------------------------------------- /QueryRewriter/train/ParaphraseDataset.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 
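# ParaphraseDataset (below) pairs each duplicate question ("question1") with
# its master question ("question2") from the TSV file written by
# prepare_data.py, prepends the T5 task prefix "paraphrase: " to the source,
# and tokenizes both sides into fixed-length tensors for seq2seq fine-tuning.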
3 | class ParaphraseDataset(Dataset): 4 | def __init__(self, tokenizer, data_dir, type_path, max_len=512): 5 | self.path = os.path.join(data_dir, type_path + '.csv') 6 | 7 | self.source_column = "question1" 8 | self.target_column = "question2" 9 | self.data = pd.read_csv(self.path, sep="\t").astype(str) 10 | 11 | self.max_len = max_len 12 | self.tokenizer = tokenizer 13 | self.inputs = [] 14 | self.targets = [] 15 | 16 | self._build() 17 | 18 | def __len__(self): 19 | return len(self.inputs) 20 | 21 | def __getitem__(self, index): 22 | source_ids = self.inputs[index]["input_ids"].squeeze() 23 | target_ids = self.targets[index]["input_ids"].squeeze() 24 | 25 | src_mask = self.inputs[index]["attention_mask"].squeeze() # might need to squeeze 26 | target_mask = self.targets[index]["attention_mask"].squeeze() # might need to squeeze 27 | 28 | return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask} 29 | 30 | def _build(self): 31 | for idx in range(len(self.data)): 32 | input_, target = self.data.loc[idx, self.source_column], self.data.loc[idx, self.target_column] 33 | 34 | input_ = "paraphrase: "+ input_ + ' ' 35 | target = target + " " 36 | 37 | # tokenize inputs 38 | tokenized_inputs = self.tokenizer.batch_encode_plus( 39 | [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt", truncation='longest_first' 40 | ) 41 | # tokenize targets 42 | tokenized_targets = self.tokenizer.batch_encode_plus( 43 | [target], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt", truncation='longest_first' 44 | ) 45 | 46 | self.inputs.append(tokenized_inputs) 47 | self.targets.append(tokenized_targets) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /CodeSelector/eval/Bert_MLP.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | class Config(object): 4 | 5 | def __init__(self): 6 | self.model_name = 'bert' 7 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 8 | self.num_classes = 2 9 | self.bert_path = './Model' 10 | self.hidden_size = 768 11 | self.tokenizer = BertTokenizer.from_pretrained(self.bert_path) 12 | self.batch_size = 16 13 | self.num_epochs = 10 14 | 15 | 16 | class Model(nn.Module): 17 | 18 | def __init__(self, config): 19 | super(Model, self).__init__() 20 | self.bert = BertModel.from_pretrained(config.bert_path) 21 | for param in self.bert.parameters(): 22 | param.requires_grad = True 23 | self.fc0 = nn.Linear(2*config.hidden_size, 512) 24 | self.fc1 = nn.Linear(512, 128) 25 | self.fc2 = nn.Linear(128, config.num_classes) 26 | 27 | # def forward(self, input_ids, attention_mask, token_type_ids): 28 | def forward(self, qc0_pair, qc1_pair): 29 | 30 | qc0_input_ids, qc0_input_mask, qc0_input_types = qc0_pair[0], qc0_pair[1], qc0_pair[2] 31 | qc1_input_ids, qc1_input_mask, qc1_input_types = qc1_pair[0], qc1_pair[1], qc1_pair[2] 32 | ''' 33 | qc0_last_hidden_states = self.bert(input_ids = qc0_input_ids, \ 34 | attention_mask = qc0_input_mask, \ 35 | token_type_ids = qc0_input_types) 36 | 37 | qc1_last_hidden_states = self.bert(input_ids = qc1_input_ids, \ 38 | attention_mask = qc1_input_mask, \ 39 | token_type_ids = qc1_input_types) 40 | qc0_features = qc0_last_hidden_states[0][:,0,:] 41 | qc1_features = qc1_last_hidden_states[0][:,0,:] 42 | # print('qc0_features:', type(qc0_features), qc0_features.shape) 43 | # print('qc1_features:', type(qc1_features), 
qc1_features.shape) 44 | features = torch.cat((qc0_features, qc1_features), dim=1) 45 | # print("features:", type(features), features.shape) 46 | ''' 47 | _, qc0_pooled = self.bert(input_ids = qc0_input_ids, \ 48 | attention_mask = qc0_input_mask, \ 49 | token_type_ids = qc0_input_types) 50 | 51 | _, qc1_pooled = self.bert(input_ids = qc1_input_ids, \ 52 | attention_mask = qc1_input_mask, \ 53 | token_type_ids = qc1_input_types) 54 | 55 | features = torch.cat((qc0_pooled, qc1_pooled), dim=1) 56 | features = self.fc0(features) 57 | features = self.fc1(features) 58 | out = self.fc2(features) 59 | return out 60 | 61 | 62 | -------------------------------------------------------------------------------- /CodeSelector/train/Bert_MLP.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | class Config(object): 4 | 5 | def __init__(self): 6 | self.model_name = 'bert' 7 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 8 | self.num_classes = 2 9 | self.bert_path = './Model' 10 | self.hidden_size = 768 11 | self.tokenizer = BertTokenizer.from_pretrained(self.bert_path) 12 | self.batch_size = 16 13 | self.num_epochs = 1 14 | 15 | 16 | class Model(nn.Module): 17 | 18 | def __init__(self, config): 19 | super(Model, self).__init__() 20 | self.bert = BertModel.from_pretrained(config.bert_path) 21 | for param in self.bert.parameters(): 22 | param.requires_grad = True 23 | self.fc0 = nn.Linear(2*config.hidden_size, 512) 24 | self.fc1 = nn.Linear(512, 128) 25 | self.fc2 = nn.Linear(128, config.num_classes) 26 | 27 | # def forward(self, input_ids, attention_mask, token_type_ids): 28 | def forward(self, qc0_pair, qc1_pair): 29 | 30 | qc0_input_ids, qc0_input_mask, qc0_input_types = qc0_pair[0], qc0_pair[1], qc0_pair[2] 31 | qc1_input_ids, qc1_input_mask, qc1_input_types = qc1_pair[0], qc1_pair[1], qc1_pair[2] 32 | ''' 33 | qc0_last_hidden_states = self.bert(input_ids = qc0_input_ids, \ 34 | attention_mask = qc0_input_mask, \ 35 | token_type_ids = qc0_input_types) 36 | 37 | qc1_last_hidden_states = self.bert(input_ids = qc1_input_ids, \ 38 | attention_mask = qc1_input_mask, \ 39 | token_type_ids = qc1_input_types) 40 | qc0_features = qc0_last_hidden_states[0][:,0,:] 41 | qc1_features = qc1_last_hidden_states[0][:,0,:] 42 | # print('qc0_features:', type(qc0_features), qc0_features.shape) 43 | # print('qc1_features:', type(qc1_features), qc1_features.shape) 44 | features = torch.cat((qc0_features, qc1_features), dim=1) 45 | # print("features:", type(features), features.shape) 46 | ''' 47 | _, qc0_pooled = self.bert(input_ids = qc0_input_ids, \ 48 | attention_mask = qc0_input_mask, \ 49 | token_type_ids = qc0_input_types) 50 | 51 | _, qc1_pooled = self.bert(input_ids = qc1_input_ids, \ 52 | attention_mask = qc1_input_mask, \ 53 | token_type_ids = qc1_input_types) 54 | 55 | features = torch.cat((qc0_pooled, qc1_pooled), dim=1) 56 | features = self.fc0(features) 57 | features = self.fc1(features) 58 | out = self.fc2(features) 59 | return out 60 | 61 | 62 | -------------------------------------------------------------------------------- /QueryRewriter/train/train.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from T5FineTuner import T5FineTuner 3 | from T5FineTuner import LoggingCallback 4 | from ParaphraseDataset import ParaphraseDataset 5 | 6 | # set arguments 7 | args_dict = dict( 8 | data_dir="", # path for data files 9 | output_dir="", # path to save the 
checkpoints 10 | model_name_or_path='./Paraphrse_Pretrained/', 11 | tokenizer_name_or_path='./Paraphrse_Pretrained/', 12 | max_seq_length=256, 13 | learning_rate=3e-4, 14 | weight_decay=0.0, 15 | adam_epsilon=1e-8, 16 | warmup_steps=0, 17 | train_batch_size=1, 18 | eval_batch_size=1, 19 | num_train_epochs=2, 20 | gradient_accumulation_steps=16, 21 | n_gpu=1, 22 | early_stop_callback=False, 23 | fp_16=False, # if you want to enable 16-bit training then install apex and set this to true 24 | opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties 25 | max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default 26 | seed=42, 27 | ) 28 | 29 | tokenizer = T5Tokenizer.from_pretrained('./Paraphrse_Pretrained/') 30 | # dataset = ParaphraseDataset(tokenizer, 'data', 'dev', 256) 31 | 32 | train_path = "./data/so_train.csv" 33 | val_path = "./data/so_val.csv" 34 | 35 | data_train = pd.read_csv(train_path, sep="\t")#.astype(str) 36 | # print(data_train.head()) 37 | data_val = pd.read_csv(val_path, sep="\t") 38 | print(data_train.shape, data_val.shape) 39 | 40 | if not os.path.exists('t5_paraphrase'): 41 | os.makedirs('t5_paraphrase') 42 | 43 | args_dict.update({'data_dir': 'data', 'output_dir': 't5_paraphrase', 'num_train_epochs':10,'max_seq_length':256}) 44 | args = argparse.Namespace(**args_dict) 45 | print("args_dict:") 46 | print(args_dict) 47 | 48 | checkpoint_callback = pl.callbacks.ModelCheckpoint(\ 49 | filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5 50 | ) 51 | 52 | train_params = dict( 53 | accumulate_grad_batches=args.gradient_accumulation_steps, 54 | gpus=args.n_gpu, 55 | max_epochs=args.num_train_epochs, 56 | # early_stop_callback=False, 57 | precision= 16 if args.fp_16 else 32, 58 | amp_level=args.opt_level, 59 | gradient_clip_val=args.max_grad_norm, 60 | checkpoint_callback=checkpoint_callback, 61 | callbacks=[LoggingCallback()], 62 | ) 63 | 64 | # def get_dataset(tokenizer, type_path, args): 65 | # return ParaphraseDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path, max_len=args.max_seq_length) 66 | 67 | print ("Initialize model") 68 | model = T5FineTuner(args) 69 | 70 | trainer = pl.Trainer(**train_params) 71 | 72 | print (" Training model") 73 | trainer.fit(model) 74 | print ("training finished") 75 | 76 | print ("Saving model") 77 | model.model.save_pretrained('t5_paraphrase') 78 | 79 | print ("Model saved") 80 | -------------------------------------------------------------------------------- /CodeSelector/eval/CodeSelector_model.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from Bert_MLP import Model, Config 3 | 4 | class CodeSelector(object): 5 | 6 | def __init__(self): 7 | self.config = Config() 8 | self.model = self.load_model() 9 | self.tokenizer = self.load_tokenizer() 10 | pass 11 | 12 | def load_model(self): 13 | PATH = './model_save/epoch7/model.ckpt' 14 | model = Model(self.config).to(self.config.device) 15 | model.load_state_dict(torch.load(PATH)) 16 | model.eval() 17 | print('Model Loaded!') 18 | return model 19 | 20 | def load_tokenizer(self): 21 | tokenizer = AutoTokenizer.from_pretrained('./Model') 22 | print("Tokenizer Loaded!") 23 | return tokenizer 24 | pass 25 | 26 | def encode_qc(self, question, cs): 27 | encoded_qc = self.tokenizer(question, cs, padding=True, truncation=True, max_length=128, 
return_tensors='pt') 28 | return encoded_qc 29 | 30 | def get_score(self, question, cs): 31 | # encode qc_0 32 | encoded_qc0 = self.encode_qc(question, cs) 33 | qc0_input_ids = encoded_qc0['input_ids'] 34 | qc0_token_type_ids = encoded_qc0['token_type_ids'] 35 | qc0_attention_masks = encoded_qc0['attention_mask'] 36 | 37 | # encode qc_1 38 | encoded_qc1 = self.encode_qc('', '') 39 | qc1_input_ids = encoded_qc1['input_ids'] 40 | qc1_token_type_ids = encoded_qc1['token_type_ids'] 41 | qc1_attention_masks = encoded_qc1['attention_mask'] 42 | 43 | # to device 44 | b_qc0_input_ids = qc0_input_ids.to(self.config.device) 45 | b_qc0_input_mask = qc0_attention_masks.to(self.config.device) 46 | b_qc0_input_types = qc0_token_type_ids.to(self.config.device) 47 | b_qc1_input_ids = qc1_input_ids.to(self.config.device) 48 | b_qc1_input_mask = qc1_attention_masks.to(self.config.device) 49 | b_qc1_input_types = qc1_token_type_ids.to(self.config.device) 50 | with torch.no_grad(): 51 | qc0 = (b_qc0_input_ids, b_qc0_input_mask, b_qc0_input_types) 52 | qc1 = (b_qc1_input_ids, b_qc1_input_mask, b_qc1_input_types) 53 | outputs = self.model(qc0, qc1) 54 | 55 | score = outputs.data.cpu().numpy()[0][1] 56 | return score 57 | 58 | 59 | def get_candidate_scores(self, question, candidate_answers): 60 | candidate_scores = [] 61 | for cs in candidate_answers: 62 | score = self.get_score(question, cs) 63 | candidate_scores.append(score) 64 | return candidate_scores 65 | 66 | 67 | def main(): 68 | cs_model = CodeSelector() 69 | with open('../../BM25-IR/eval_data.pkl', 'rb') as handler: 70 | eval_data = pickle.load(handler) 71 | 72 | for k, v in eval_data.items(): 73 | question = v['qtitle'] 74 | candidate_answers = [] 75 | best_code = v['best_code'] 76 | similar_code1 = v['similar_code1'] 77 | similar_code2 = v['similar_code2'] 78 | similar_code3 = v['similar_code3'] 79 | similar_code4 = v['similar_code4'] 80 | candidate_answers.append(best_code) 81 | candidate_answers.append(similar_code1) 82 | candidate_answers.append(similar_code2) 83 | candidate_answers.append(similar_code3) 84 | candidate_answers.append(similar_code4) 85 | candidate_scores = cs_model.get_candidate_scores(question, candidate_answers) 86 | print(candidate_scores) 87 | break 88 | pass 89 | 90 | if __name__ == '__main__': 91 | main() 92 | 93 | -------------------------------------------------------------------------------- /QueryRewriter/eval/QueryRewriter_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | from transformers import T5ForConditionalGeneration,T5Tokenizer 5 | from transformers import T5Tokenizer, T5Model 6 | # from transformers import T5Tokenizer, T5EncoderModel 7 | 8 | def set_seed(seed): 9 | torch.manual_seed(seed) 10 | if torch.cuda.is_available(): 11 | torch.cuda.manual_seed_all(seed) 12 | 13 | set_seed(42) 14 | 15 | class QueryRewriter(object): 16 | 17 | def __init__(self): 18 | self.device = self.get_device() 19 | self.model = self.load_model() 20 | self.tokenizer = self.load_tokenizer() 21 | self.input_embeddings = self.load_input_embeddings() 22 | print( type(self.input_embeddings) ) 23 | pass 24 | 25 | def get_device(self): 26 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 | return device 28 | 29 | def load_model(self): 30 | model = T5ForConditionalGeneration.from_pretrained('./t5_paraphrase') 31 | # model = T5Model.from_pretrained('./t5_paraphrase') 32 | model = model.to(self.device) 33 | return model 
34 | 
35 | def load_tokenizer(self): 
36 | # tokenizer = T5Tokenizer.from_pretrained('t5-base') 
37 | tokenizer = T5Tokenizer.from_pretrained('./t5_paraphrase') 
38 | # tokenizer = T5Tokenizer.from_pretrained('./Paraphrse_Pretrained') 
39 | return tokenizer 
40 | 
41 | def load_input_embeddings(self): 
42 | embeddings = self.model.get_input_embeddings() 
43 | return embeddings 
44 | 
45 | def encode(self, query): 
46 | query = query.strip() 
47 | # text = "paraphrase: " + query + " " 
48 | text = query 
49 | input_ids = self.tokenizer.encode(text, return_tensors="pt").to(self.device) 
50 | 
51 | ################ Alternative Method ############################ 
52 | # print("input_ids:", input_ids) 
53 | outputs = self.input_embeddings(input_ids) 
54 | # print("outputs:", outputs.shape) 
55 | outputs = torch.squeeze(outputs) 
56 | output_vec = outputs.cpu().detach().numpy() 
57 | # print("output_vec:", output_vec.shape) 
58 | output_vec = np.mean(output_vec, axis=0) 
59 | # print("outputs:", type(outputs), len(outputs)) 
60 | return output_vec 
61 | 
62 | def paraphrase(self, query): 
63 | query = query.strip() 
64 | text = "paraphrase: " + query + " " 
65 | max_len = 256 
66 | encoding = self.tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt") 
67 | input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device) 
68 | beam_outputs = self.model.generate(\ 
69 | input_ids=input_ids, \ 
70 | attention_mask=attention_masks,\ 
71 | do_sample=True,\ 
72 | max_length=256, \ 
73 | top_k=120,\ 
74 | top_p=0.95,\ 
75 | early_stopping=True,\ 
76 | num_return_sequences=3\ 
77 | ) 
78 | final_outputs = [] 
79 | for beam_output in beam_outputs: 
80 | sent = self.tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True) 
81 | if sent.lower() != query.lower() and sent not in final_outputs: 
82 | final_outputs.append(sent) 
83 | return final_outputs 
84 | 
85 | 
86 | def main(): 
87 | query = "how to iterate the list in python" 
88 | qr_model = QueryRewriter() 
89 | query_vec = qr_model.encode(query) 
90 | print("query_vec:", type(query_vec), query_vec.shape) 
91 | 
92 | paraphrase_q = qr_model.paraphrase(query) 
93 | print(paraphrase_q) 
94 | 
95 | pass 
96 | 
97 | if __name__ == "__main__": 
98 | main() 
99 | 
100 | 
101 | 
-------------------------------------------------------------------------------- 
/CodeSelector/eval/evaluation.py: 
-------------------------------------------------------------------------------- 
1 | import pickle 
2 | import numpy as np 
3 | from model import Our_Model 
4 | from CodeSelector_model import CodeSelector 
5 | from scipy import spatial 
6 | 
7 | def hits_count(candidate_ranks, k): 
8 | ''' 
9 | candidate_ranks: 
10 | list of candidates' ranks; one rank per question; 
11 | length is a number of questions 
12 | rank is a number from 1 to len(candidates of the question) 
13 | e.g. [2, 3] means that first candidate has the rank 2, 
14 | second candidate has the rank 3 
15 | k: number of top-ranked elements (k in hits@k metric) 
16 | result: return Hits@k value for current ranking 
17 | ''' 
18 | count = 0 
19 | for rank in candidate_ranks: 
20 | if rank <= k: 
21 | count += 1 
22 | return count/(len(candidate_ranks)+1e-8) 
23 | 
24 | def dcg_score(candidate_ranks, k): 
25 | ''' 
26 | candidate_ranks: 
27 | list of candidates' ranks; one rank per question; 
28 | length is a number of questions 
29 | rank is a number from 1 to len(candidates of the question) 
30 | e.g.
[2, 3] means that first candidate has the rank 2, 31 | second candidate has the rank 3 32 | k: number of top-ranked elements (k in hits@k metric) 33 | 34 | result: return DCG@k value for current ranking 35 | ''' 36 | score = 0 37 | for rank in candidate_ranks: 38 | if rank <= k: 39 | score += 1/np.log2(1+rank) 40 | return score/(len(candidate_ranks)+1e-8) 41 | 42 | def rank_candidates(candidate_answers, candidate_scores): 43 | ''' 44 | question: a string 45 | candidate_answers: a list of strings 46 | result: a list of pairs (initial position in the list, question) 47 | ''' 48 | # question_vec = model.encode(question) 49 | # print("question_vec:", question_vec) 50 | # candidate_scores = [] 51 | # for answer in candidate_answers: 52 | # answer_vec = model.encode(answer) 53 | # print("answer_vec:", answer_vec) 54 | # score = 1 - spatial.distance.cosine(question_vec, answer_vec) 55 | # print("score:", score) 56 | # candidate_scores.append( score ) 57 | # print("candidate_scores:", candidate_scores) 58 | tl = [(i, candidate_answers[i], candidate_scores[i]) for i in range(len(candidate_answers))] 59 | # print("tl:", tl) 60 | stl = sorted(tl, key=lambda x:x[2], reverse=True) 61 | # print("stl:", stl) 62 | result = [(t[0], t[1]) for t in stl] 63 | # print("result:", result) 64 | return result 65 | 66 | with open('../../BM25-IR/eval_data.pkl', 'rb') as handler: 67 | eval_data = pickle.load(handler) 68 | print('eval_data:', type(eval_data), len(eval_data)) 69 | 70 | 71 | eval_pairs = [] 72 | for k, v in eval_data.items(): 73 | question = v['qtitle'] 74 | best_code = v['best_code'] 75 | similar_code1 = v['similar_code1'] 76 | similar_code2 = v['similar_code2'] 77 | similar_code3 = v['similar_code3'] 78 | similar_code4 = v['similar_code4'] 79 | 80 | eval_pairs.append(( question, \ 81 | best_code, \ 82 | similar_code1, \ 83 | similar_code2, \ 84 | similar_code3, \ 85 | similar_code4)) 86 | print('eval_pairs:', type(eval_pairs), len(eval_pairs)) 87 | 88 | # model = Our_Model() 89 | model = CodeSelector() 90 | 91 | model_ranking = [] 92 | for i, e in enumerate(eval_pairs[:1500]): 93 | # print(i) 94 | question = e[0] 95 | best_code = e[1] 96 | similar_code1 = e[2] 97 | similar_code2 = e[3] 98 | similar_code3 = e[4] 99 | similar_code4 = e[5] 100 | candidate_answers = [] 101 | candidate_answers.append(best_code) 102 | candidate_answers.append(similar_code1) 103 | candidate_answers.append(similar_code2) 104 | candidate_answers.append(similar_code3) 105 | candidate_answers.append(similar_code4) 106 | 107 | # scores_map = model.get_scores_map(question, candidate_answers) 108 | # candidate_scores = model.get_candidate_scores(scores_map) 109 | candidate_scores = model.get_candidate_scores(question, candidate_answers) 110 | # print("question:", question) 111 | # print("candidate_answers:", candidate_answers) 112 | 113 | ranks = rank_candidates(candidate_answers, candidate_scores) 114 | # print("ranks:", ranks) 115 | model_ranking.append( [r[0] for r in ranks].index(0) + 1 ) 116 | # print("model_ranking:", model_ranking) 117 | # break 118 | 119 | for k in [1, 2, 3, 4, 5]: 120 | print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(model_ranking, k), \ 121 | k, hits_count(model_ranking, k))) 122 | 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # I Know What You Are Searching For: Code Snippet Recommendation from Stack Overflow Posts 2 | 3 | ![Workflow of 
Que2Code](./figures/workflow.png) 
4 | 
5 | 
6 | Stack Overflow has been heavily used by software developers to seek programming-related information. 
7 | Typically, when developers encounter a technical problem, they formulate the problem as a query and use a search engine to obtain a list of possibly relevant posts that may contain useful solutions to their problem. 
8 | However, this kind of solution-seeking experience can be difficult and painful because of the **_Query Mismatch_** and **_Information Overload_** problems. To alleviate these challenges, in this work we present a query-driven code recommendation tool, named _Que2Code_, that identifies the best code snippets for a user query from Stack Overflow posts. 
9 | The material used in our work can be accessed via the following links: 
10 | 
11 | - [Source Code Link](https://github.com/beyondacm/Que2Code) 
12 | - [Dataset Download Link](https://drive.google.com/drive/folders/1-qlk1clhgy1Lzx4BIE5bW5fmEQsFSMjv?usp=sharing) 
13 | - [Pretrained Model Download Link](https://drive.google.com/drive/folders/1-E8pPL3ze7jHkR4_J6htAPk7iN94yInt?usp=sharing) 
14 | - [User Study](https://drive.google.com/file/d/1TJdpLwBFfdUcfvK42jLMGKNB4Ny87C_L/view?usp=sharing) 
15 | 
16 | 
17 | Our model contains two stages: 
18 | 
19 | 1. _Semantically-Equivalent Question Retrieval_ 
20 | 2. _Best Code Snippet Recommendation_ 
21 | 
22 | 
23 | Our model has two sub-components, i.e., **QueryRewriter** and **CodeSelector**. **QueryRewriter** can qualitatively retrieve semantically-equivalent questions, and **CodeSelector** can quantitatively rank the most relevant code snippets to the top of the recommendation candidates. 
24 | 
25 | ## QueryRewriter 
26 | ![Workflow of Que2Code](./figures/QueryRewriter.png) 
27 | In the first stage, our **QueryRewriter** component tackles the _query mismatch_ problem. 
28 | The idea of **QueryRewriter** is to use a rewritten version of a query question to cover a variety of different forms of semantically equivalent expressions. 
29 | In particular, we first collect duplicate question pairs from Stack Overflow, because duplicate questions can be considered semantically-equivalent questions expressed in various user descriptions. 
30 | We then frame this problem as a sequence-to-sequence learning problem, which directly maps a technical question to its corresponding duplicate question. 
31 | We train a text-to-text transformer, named **QueryRewriter**, by using the collected duplicate question pairs. 
32 | 
33 | 
34 | 
35 | To train the **QueryRewriter**, please download our duplicate question dataset from the following link: [Dataset Download Link](https://drive.google.com/drive/folders/1-qlk1clhgy1Lzx4BIE5bW5fmEQsFSMjv?usp=sharing) 
36 | 
37 | ```shell 
38 | cd QueryRewriter/train/ 
39 | python prepare_data.py 
40 | python train.py 
41 | ``` 
42 | 
43 | Or, we have released the pre-trained **QueryRewriter** model as described in the paper. You can use the following link to download our pretrained model: [Pretrained Model Download Link](https://drive.google.com/drive/folders/1-E8pPL3ze7jHkR4_J6htAPk7iN94yInt?usp=sharing) 
44 | 
45 | The **QueryRewriter** can be easily used in the following way: 
46 | 
47 | ```python 
48 | query = "how to iterate the list in python" 
49 | # Initialize the model 
50 | QR_model = QueryRewriter() 
51 | # Get the embedding of a query 
52 | query_vec = QR_model.encode(query) 
53 | # Get the paraphrase questions of a query 
54 | paraphrase_q = QR_model.paraphrase(query) 
55 | ``` 
56 | 
57 | ## CodeSelector 
58 | ![Workflow of Que2Code](./figures/CodeSelector-Workflow.png) 
59 | In the second stage, our **CodeSelector** component tackles the _information overload_ challenge. 
60 | To do this, we first collect all the answers of the semantically relevant questions retrieved in the first stage. 
61 | We then extract all the code snippets from the collected answer posts to construct a candidate code snippet pool. 
62 | For the given query question, we pair it with each of the code snippet candidates. We then feed these pairs into the trained **CodeSelector** to estimate their matching scores and judge the preference orders. 
63 | **CodeSelector** can then select the best code snippet from the code snippet candidates via pairwise comparison. 
64 | Our approach is fully data-driven and does not rely on hand-crafted rules. 
65 | 
66 | To train the **CodeSelector**, please download our labeled query-code dataset from the following link: [Dataset Download Link](https://drive.google.com/drive/folders/1-qlk1clhgy1Lzx4BIE5bW5fmEQsFSMjv?usp=sharing). 
67 | Then train the model with the following scripts: 
68 | 
69 | ```shell 
70 | cd CodeSelector/train/ 
71 | python data_prepare.py 
72 | python train.py 
73 | ``` 
74 | 
75 | 
76 | Or, we have also released the pre-trained **CodeSelector** model as described in the paper. You can use the following link to download our pretrained model: [Pretrained Model Download Link](https://drive.google.com/drive/folders/1-E8pPL3ze7jHkR4_J6htAPk7iN94yInt?usp=sharing) 
77 | 
78 | Our **CodeSelector** can also be used easily as follows: 
79 | 
80 | ```python 
81 | # Initialize the CodeSelector model 
82 | cs_model = CodeSelector() 
83 | # Estimate the matching score between a query and a code snippet 
84 | score = cs_model.get_score(query, code_snippet) 
85 | ``` 
86 | 
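87 | ## End-to-End Example 
88 | 
89 | Putting the two stages together, the snippet below sketches one possible end-to-end flow. It is an illustration rather than part of the released pipeline: it assumes both pretrained models have been downloaded, and it uses a small hand-written candidate pool in place of the code snippets extracted from the retrieved posts. 
90 | 
91 | ```python 
92 | # Illustrative sketch: wire QueryRewriter and CodeSelector together. 
93 | qr_model = QueryRewriter() 
94 | cs_model = CodeSelector() 
95 | 
96 | query = "how to iterate the list in python" 
97 | # Stage 1: generate semantically-equivalent rewrites of the query 
98 | # (used to retrieve more relevant questions and their answers). 
99 | paraphrases = qr_model.paraphrase(query) 
100 | 
101 | # Stage 2: score each candidate snippet against the query and keep the best. 
102 | candidate_snippets = [ 
103 |     "for item in my_list: print(item)", 
104 |     "print(', '.join(my_list))", 
105 | ] 
106 | scores = cs_model.get_candidate_scores(query, candidate_snippets) 
107 | best_snippet = candidate_snippets[scores.index(max(scores))] 
108 | ``` 
109 | 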
-------------------------------------------------------------------------------- 
/QueryRewriter/eval/evaluation.py: 
-------------------------------------------------------------------------------- 
1 | import pickle 
2 | import random 
3 | import numpy as np 
4 | import pandas as pd 
5 | from scipy import spatial 
6 | from T5_model import T5_model 
7 | 
8 | random.seed(779) 
9 | 
10 | def hits_count(candidate_ranks, k): 
11 | ''' 
12 | candidate_ranks: 
13 | list of candidates' ranks; one rank per question; 
14 | length is a number of questions 
15 | rank is a number from 1 to len(candidates of the question) 
16 | e.g. [2, 3] means that first candidate has the rank 2, 
17 | second candidate has the rank 3 
18 | k: number of top-ranked elements (k in hits@k metric) 
19 | result: return Hits@k value for current ranking 
20 | ''' 
21 | count = 0 
22 | for rank in candidate_ranks: 
23 | if rank <= k: 
24 | count += 1 
25 | return count/(len(candidate_ranks)+1e-8) 
26 | 
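# Worked example (illustration) for hits_count above and dcg_score below: 
# with candidate_ranks = [1, 3] and k = 3, 
#   Hits@3 = 2/2 = 1.0, since both ranks fall within the top 3; 
#   DCG@3 = (1/log2(1+1) + 1/log2(1+3)) / 2 = (1.0 + 0.5) / 2 = 0.75. 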
27 | def dcg_score(candidate_ranks, k): 
28 | ''' 
29 | candidate_ranks: 
30 | list of candidates' ranks; one rank per question; 
31 | length is a number of questions 
32 | rank is a number from 1 to len(candidates of the question) 
33 | e.g. [2, 3] means that first candidate has the rank 2, 
34 | second candidate has the rank 3 
35 | k: number of top-ranked elements (k in hits@k metric) 
36 | 
37 | result: return DCG@k value for current ranking 
38 | ''' 
39 | score = 0 
40 | for rank in candidate_ranks: 
41 | if rank <= k: 
42 | score += 1/np.log2(1+rank) 
43 | return score/(len(candidate_ranks)+1e-8) 
44 | 
45 | def rank_candidates(question_vec, candidate_answers, model): 
46 | ''' 
47 | question: a string 
48 | candidate_answers: a list of strings 
49 | result: a list of pairs (initial position in the list, question) 
50 | ''' 
51 | # question_vec = model.encode(question) 
52 | # print("question_vec:", question_vec) 
53 | candidate_scores = [] 
54 | for answer in candidate_answers: 
55 | answer_vec = model.encode(answer) 
56 | # print("answer_vec:", answer_vec) 
57 | score = 1 - spatial.distance.cosine(question_vec, answer_vec) 
58 | # print("score:", score) 
59 | candidate_scores.append( score ) 
60 | # print("candidate_scores:", candidate_scores) 
61 | tl = [(i, candidate_answers[i], candidate_scores[i]) for i in range(len(candidate_answers))] 
62 | # print("tl:", tl) 
63 | stl = sorted(tl, key=lambda x:x[2], reverse=True) 
64 | # print("stl:", stl) 
65 | result = [(t[0], t[1]) for t in stl] 
66 | # print("result:", result) 
67 | return result 
68 | 
69 | with open('../../BM25-IR/eval_data.pkl', 'rb') as handler: 
70 | eval_data = pickle.load(handler) 
71 | print('eval_data:', type(eval_data), len(eval_data)) 
72 | 
73 | with open('./eval_data_embed/eval_data_embed_0.pkl', 'rb') as handler: 
74 | eval_data_embed = pickle.load(handler) 
75 | print('eval_data_embed:', type(eval_data_embed), len(eval_data_embed)) 
76 | 
77 | eval_pairs = [] 
78 | for i in range(len(eval_data)): 
79 | eval_pairs.append((eval_data[i]['src_qtitle'], \ 
80 | eval_data[i]['tgt_qtitle'], \ 
81 | eval_data[i]['similar_q1_title'], \ 
82 | eval_data[i]['similar_q2_title'], \ 
83 | eval_data[i]['similar_q3_title'], \ 
84 | eval_data[i]['similar_q4_title'])) 
85 | 
86 | print('eval_pairs:', type(eval_pairs), len(eval_pairs)) 
87 | 
88 | model = T5_model() 
89 | 
90 | model_ranking = [] 
91 | for i, e in enumerate(eval_pairs): 
92 | if i not in eval_data_embed: 
93 | continue 
94 | src_question = e[0].strip().lower() 
95 | tgt_question = e[1].strip().lower() 
96 | similar_q1 = e[2].strip().lower() 
97 | similar_q2 = e[3].strip().lower() 
98 | similar_q3 = e[4].strip().lower() 
99 | similar_q4 = e[5].strip().lower() 
100 | question = src_question 
101 | candidate_answers = [] 
102 | candidate_answers.append(tgt_question) 
103 | candidate_answers.append(similar_q1) 
104 | candidate_answers.append(similar_q2) 
105 | candidate_answers.append(similar_q3) 
106 | candidate_answers.append(similar_q4) 
107 | # print("question:", question) 
108 | # print("candidate_answers:", candidate_answers) 
109 | 
110 | question_vec = eval_data_embed[i]['question_embedding'] 
111 | ranks = rank_candidates(question_vec, candidate_answers, model) 
112 | # print("ranks:", ranks) 
113 | 
model_ranking.append( [r[0] for r in ranks].index(0) + 1 ) 114 | # print("model_ranking:", model_ranking) 115 | # break 116 | 117 | print( "len of model_ranking:", len(model_ranking) ) 118 | 119 | def evaluate( sample_ranking ): 120 | eval_dcg_scores = [] 121 | eval_hits_count = [] 122 | for k in [1, 2 ,3, 4, 5]: 123 | eval_dcg_scores.append( dcg_score(sample_ranking, k) ) 124 | eval_hits_count.append( hits_count(sample_ranking, k)) 125 | return eval_dcg_scores, eval_hits_count 126 | 127 | dcg_scores_result = [] 128 | hits_count_result = [] 129 | 130 | for i in range(10): 131 | sample_ranking = random.sample(model_ranking, 200) 132 | eval_dcg_scores, eval_hits_count = evaluate( sample_ranking ) 133 | # Append 134 | dcg_scores_result.append(eval_dcg_scores) 135 | hits_count_result.append(eval_hits_count) 136 | 137 | dcg_scores_df = pd.DataFrame(dcg_scores_result, columns=['dcg1', 'dcg2', 'dcg3', 'dcg4', 'dcg5']) 138 | hits_count_df = pd.DataFrame(hits_count_result, columns=['hits1', 'hits2', 'hits3', 'hits4', 'hits5']) 139 | # print(dcg_scores_df) 140 | # print(hits_count_df) 141 | print(hits_count_df.describe()) 142 | print(dcg_scores_df.describe()) 143 | 144 | # for k in [1, 2, 3, 4, 5]: 145 | # print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(model_ranking, k), \ 146 | # k, hits_count(model_ranking, k))) 147 | 148 | -------------------------------------------------------------------------------- /QueryRewriter/train/T5FineTuner.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from ParaphraseDataset import * 3 | 4 | class T5FineTuner(pl.LightningModule): 5 | def __init__(self, hparams): 6 | super(T5FineTuner, self).__init__() 7 | self.hparams = hparams 8 | 9 | self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path) 10 | self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path) 11 | 12 | def is_logger(self): 13 | return True #self.trainer.proc_rank <= 0 14 | 15 | def forward( 16 | self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None 17 | ): 18 | return self.model( 19 | input_ids, 20 | attention_mask=attention_mask, 21 | decoder_input_ids=decoder_input_ids, 22 | decoder_attention_mask=decoder_attention_mask, 23 | lm_labels=lm_labels, 24 | ) 25 | 26 | def _step(self, batch): 27 | lm_labels = batch["target_ids"] 28 | lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100 29 | 30 | outputs = self( 31 | input_ids=batch["source_ids"], 32 | attention_mask=batch["source_mask"], 33 | lm_labels=lm_labels, 34 | decoder_attention_mask=batch['target_mask'] 35 | ) 36 | 37 | loss = outputs[0] 38 | 39 | return loss 40 | 41 | def training_step(self, batch, batch_idx): 42 | loss = self._step(batch) 43 | 44 | tensorboard_logs = {"train_loss": loss} 45 | return {"loss": loss, "log": tensorboard_logs} 46 | 47 | def training_epoch_end(self, outputs): 48 | avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean() 49 | tensorboard_logs = {"avg_train_loss": avg_train_loss} 50 | return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs} 51 | 52 | def validation_step(self, batch, batch_idx): 53 | loss = self._step(batch) 54 | return {"val_loss": loss} 55 | 56 | def validation_epoch_end(self, outputs): 57 | avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() 58 | tensorboard_logs = {"val_loss": avg_loss} 59 | return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 
'progress_bar': tensorboard_logs} 60 | 61 | def configure_optimizers(self): 62 | "Prepare optimizer and schedule (linear warmup and decay)" 63 | 64 | model = self.model 65 | no_decay = ["bias", "LayerNorm.weight"] 66 | optimizer_grouped_parameters = [ 67 | { 68 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 69 | "weight_decay": self.hparams.weight_decay, 70 | }, 71 | { 72 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 73 | "weight_decay": 0.0, 74 | }, 75 | ] 76 | optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon) 77 | self.opt = optimizer 78 | return [optimizer] 79 | 80 | def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None, on_tpu=False, using_native_amp=False, using_lbfgs=False): 81 | if self.trainer.use_tpu: 82 | xm.optimizer_step(optimizer) 83 | else: 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | self.lr_scheduler.step() 87 | 88 | def get_tqdm_dict(self): 89 | tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]} 90 | 91 | return tqdm_dict 92 | 93 | def train_dataloader(self): 94 | train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="so_train", args=self.hparams) 95 | dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, 96 | num_workers=4) 97 | t_total = ( 98 | (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu))) 99 | // self.hparams.gradient_accumulation_steps 100 | * float(self.hparams.num_train_epochs) 101 | ) 102 | scheduler = get_linear_schedule_with_warmup( 103 | self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total 104 | ) 105 | self.lr_scheduler = scheduler 106 | return dataloader 107 | 108 | def val_dataloader(self): 109 | val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="so_val", args=self.hparams) 110 | return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4) 111 | 112 | logger = logging.getLogger(__name__) 113 | 114 | class LoggingCallback(pl.Callback): 115 | def on_validation_end(self, trainer, pl_module): 116 | logger.info("***** Validation results *****") 117 | if pl_module.is_logger(): 118 | metrics = trainer.callback_metrics 119 | # Log results 120 | for key in sorted(metrics): 121 | if key not in ["log", "progress_bar"]: 122 | logger.info("{} = {}\n".format(key, str(metrics[key]))) 123 | 124 | def on_test_end(self, trainer, pl_module): 125 | logger.info("***** Test results *****") 126 | 127 | if pl_module.is_logger(): 128 | metrics = trainer.callback_metrics 129 | 130 | # Log and save results to file 131 | output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt") 132 | with open(output_test_results_file, "w") as writer: 133 | for key in sorted(metrics): 134 | if key not in ["log", "progress_bar"]: 135 | logger.info("{} = {}\n".format(key, str(metrics[key]))) 136 | writer.write("{} = {}\n".format(key, str(metrics[key]))) 137 | 138 | def get_dataset(tokenizer, type_path, args): 139 | return ParaphraseDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path, max_len=args.max_seq_length) 140 | 141 | 142 | -------------------------------------------------------------------------------- /CodeSelector/train/data_prepare.py: -------------------------------------------------------------------------------- 1 | from 
utils import * 2 | from Bert_MLP import Config 3 | 4 | # Import DataSet 5 | df = pd.read_csv('./Data/train_label_data', \ 6 | delimiter='\t', \ 7 | header = None, \ 8 | names = ['qid', 'question', 'aid0', 'cs0', 'aid1', 'cs1', 'label']) 9 | print(type(df), df.shape) 10 | print(df['label'].value_counts()) 11 | 12 | # Get the lists of questions 13 | questions = df.question.values.tolist() 14 | cs0 = df.cs0.values.tolist() 15 | cs1 = df.cs1.values.tolist() 16 | labels = df.label.values.tolist() 17 | 18 | print('questions:', type(questions), len(questions)) 19 | print('cs0:', type(cs0), len(cs0)) 20 | print('cs1:', type(cs1), len(cs1)) 21 | print('labels:', type(labels), len(labels)) 22 | 23 | # Tokenize & Input Formatting 24 | ## Import model/tokenizer 25 | ## Load the BERT model 26 | print("Loading BERT Model...") 27 | bert_model = BertModel.from_pretrained('./Model') 28 | bert_model.cuda() 29 | print("Loading BERT Tokenizer...") 30 | tokenizer = AutoTokenizer.from_pretrained('./Model') 31 | # tokenizer = tokenizer_class.from_pretrained('./Model', do_lower_case=True) 32 | 33 | 34 | # Required Formatting 35 | ## 1. sentences to ids 36 | ## 2. Padding & Truncating 37 | ## 3. Attention Masks 38 | ## 4. 39 | # Combine question + cs0 as the first inputs 40 | encoded_qc0 = tokenizer(questions, cs0, padding=True, truncation=True, max_length=128, return_tensors='pt') 41 | print("encoded_qc0:", type(encoded_qc0), len(encoded_qc0)) 42 | qc0_input_ids = encoded_qc0['input_ids'] 43 | qc0_token_type_ids = encoded_qc0['token_type_ids'] 44 | qc0_attention_masks = encoded_qc0['attention_mask'] 45 | 46 | print("qc0_input_ids:", type(qc0_input_ids), qc0_input_ids.shape) 47 | print("qc0_type_ids:", type(qc0_token_type_ids), qc0_token_type_ids.shape) 48 | print("qc0_attn_mask:", type(qc0_attention_masks), qc0_attention_masks.shape) 49 | 50 | # Convert list to numpy array 51 | qc0_input_ids = qc0_input_ids.cpu().detach().numpy() 52 | qc0_token_type_ids = qc0_token_type_ids.cpu().detach().numpy() 53 | qc0_attention_masks = qc0_attention_masks.cpu().detach().numpy() 54 | print("qc0_input_ids:", type(qc0_input_ids), qc0_input_ids.shape ) 55 | print("qc0_type_ids:", type(qc0_token_type_ids), qc0_token_type_ids.shape ) 56 | print("qc0_attn_mask:", type(qc0_attention_masks), qc0_attention_masks.shape ) 57 | 58 | 59 | encoded_qc1 = tokenizer(questions, cs1, padding=True, truncation=True, max_length=128, return_tensors='pt') 60 | 61 | qc1_input_ids = encoded_qc1['input_ids'] 62 | qc1_token_type_ids = encoded_qc1['token_type_ids'] 63 | qc1_attention_masks = encoded_qc1['attention_mask'] 64 | 65 | # Convert list to numpy array 66 | qc1_input_ids = qc1_input_ids.cpu().detach().numpy() 67 | qc1_token_type_ids = qc1_token_type_ids.cpu().detach().numpy() 68 | qc1_attention_masks = qc1_attention_masks.cpu().detach().numpy() 69 | print("qc1_input_ids:", type(qc1_input_ids), qc1_input_ids.shape ) 70 | print("qc1_type_ids:", type(qc1_token_type_ids), qc1_token_type_ids.shape ) 71 | print("qc1_attn_mask:", type(qc1_attention_masks), qc1_attention_masks.shape ) 72 | 73 | labels = np.asarray(labels) 74 | 75 | with open('./Data/encoded_qc0.pkl', 'wb') as handle: 76 | pickle.dump(encoded_qc0, handle) 77 | 78 | with open('./Data/encoded_qc1.pkl', 'wb') as handle: 79 | pickle.dump(encoded_qc1, handle) 80 | 81 | with open('./Data/labels.pkl', 'wb') as handle: 82 | pickle.dump(labels, handle) 83 | 84 | # Training and Validation Split on qc0 85 | # Use 97% for training and 3% for validation 86 | train_qc0_inputs, validation_qc0_inputs, 
train_labels, validation_labels = train_test_split(qc0_input_ids, \ 87 | labels, \ 88 | random_state=2018, \ 89 | test_size=0.03) 90 | # Do the same for attention_mask 91 | train_qc0_masks, validation_qc0_masks, _, _ = train_test_split(qc0_attention_masks, \ 92 | labels, \ 93 | random_state=2018, \ 94 | test_size = 0.03) 95 | 96 | # Do the same for token_type_ids 97 | train_qc0_types, validation_qc0_types, _, _ = train_test_split(qc0_token_type_ids, \ 98 | labels, \ 99 | random_state=2018, \ 100 | test_size = 0.03) 101 | 102 | # Training and Validation Split on qc1 103 | # Use 97% for training and 3% for validation 104 | train_qc1_inputs, validation_qc1_inputs, _, _ = train_test_split(qc1_input_ids, \ 105 | labels, \ 106 | random_state=2018, \ 107 | test_size=0.03) 108 | # Do the same for attention_mask 109 | train_qc1_masks, validation_qc1_masks, _, _ = train_test_split(qc1_attention_masks, \ 110 | labels, \ 111 | random_state=2018, \ 112 | test_size = 0.03) 113 | 114 | # Do the same for token_type_ids 115 | train_qc1_types, validation_qc1_types, _, _ = train_test_split(qc1_token_type_ids, \ 116 | labels, \ 117 | random_state=2018, \ 118 | test_size = 0.03) 119 | 120 | # Convert to Pytorch Data Types 121 | train_qc0_inputs = torch.tensor(train_qc0_inputs) 122 | train_qc0_masks = torch.tensor(train_qc0_masks) 123 | train_qc0_types = torch.tensor(train_qc0_types) 124 | 125 | train_qc1_inputs = torch.tensor(train_qc1_inputs) 126 | train_qc1_masks = torch.tensor(train_qc1_masks) 127 | train_qc1_types = torch.tensor(train_qc1_types) 128 | 129 | validation_qc0_inputs = torch.tensor(validation_qc0_inputs) 130 | validation_qc0_masks = torch.tensor(validation_qc0_masks) 131 | validation_qc0_types = torch.tensor(validation_qc0_types) 132 | 133 | validation_qc1_inputs = torch.tensor(validation_qc1_inputs) 134 | validation_qc1_masks = torch.tensor(validation_qc1_masks) 135 | validation_qc1_types = torch.tensor(validation_qc1_types) 136 | 137 | train_labels = torch.tensor(train_labels) 138 | validation_labels = torch.tensor(validation_labels) 139 | 140 | print(type(train_qc0_inputs), train_qc0_inputs.shape) 141 | print(type(train_qc0_masks), train_qc0_masks.shape) 142 | print(type(train_qc0_types), train_qc0_types.shape) 143 | 144 | print(type(train_qc1_inputs), train_qc1_inputs.shape) 145 | print(type(train_qc1_masks), train_qc1_masks.shape) 146 | print(type(train_qc1_types), train_qc1_types.shape) 147 | 148 | print(type(train_labels), train_labels.shape) 149 | 150 | 151 | # We'll also create an iterator for our dataset using the torch DataLoader class. 152 | # This helps save on memory during training 153 | # unlike for loop, with an iterator the entire dataset does not need to be loaded into memory 154 | 155 | config = Config() 156 | # batch_size = 32 157 | batch_size = config.batch_size 158 | print("batch_size:", batch_size) 159 | 160 | # Create the DataLoader for our training set. 161 | train_data = TensorDataset(train_qc0_inputs, train_qc0_masks, train_qc0_types, \ 162 | train_qc1_inputs, train_qc1_masks, train_qc1_types, \ 163 | train_labels) 164 | train_sampler = RandomSampler(train_data) 165 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 166 | print(type(train_dataloader)) 167 | 168 | # Create the DataLoader for our validation set. 
169 | validation_data = TensorDataset(validation_qc0_inputs, validation_qc0_masks, validation_qc0_types, \ 170 | validation_qc1_inputs, validation_qc1_masks, validation_qc1_types, \ 171 | validation_labels) 172 | validation_sampler = SequentialSampler(validation_data) 173 | validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) 174 | print(type(validation_dataloader)) 175 | 176 | # Save DataLoader 177 | with open('./Data/train_dataloader.pkl', 'wb') as handle: 178 | pickle.dump(train_dataloader, handle) 179 | 180 | with open('./Data/validation_dataloader.pkl', 'wb') as handle: 181 | pickle.dump(validation_dataloader, handle) 182 | 183 | print("Finished!") 184 | -------------------------------------------------------------------------------- /CodeSelector/train/train.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from Bert_MLP import Model, Config 3 | # from Bert_CNN import Model, Config 4 | 5 | def save_model(epoch, model, training_stats): 6 | # Saving & Loading Fine-tuned Model 7 | ## Saving best-practices: if you use defaults names for the model, 8 | ## you can reload it using from_pretrained() 9 | 10 | base_dir = './model_save/epoch' + str(epoch) + '/' 11 | # sub_dir = 'epoch' + str(epoch) +'/model.ckpt' 12 | output_dir = base_dir + 'model.ckpt' 13 | ## Create output directory if needed 14 | if not os.path.exists(base_dir): 15 | os.makedirs(base_dir) 16 | 17 | print("Saving model to %s" % output_dir) 18 | 19 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 20 | # They can then be reloaded using `from_pretrained()` 21 | # model_to_save = model.module if hasattr(model, 'module') else model 22 | # model_to_save.save_pretrained(output_dir) 23 | # tokenizer.save_pretrained(output_dir) 24 | torch.save(model.state_dict(), output_dir) 25 | 26 | df_stats = pd.DataFrame(data=training_stats) 27 | df_stats.to_json(base_dir + "training_stats.json") 28 | # df_stats.to_pickle(output_dir + "training_stats.pkl") 29 | # Good practice: save your training arguments together with the trained model 30 | # torch.save(args, os.path.join(output_dir, 'training_args.bin')) 31 | 32 | def save_model_step(step, model, training_stats): 33 | 34 | base_dir = './model_save/step' + str(step) + '/' 35 | output_dir = base_dir + 'model.ckpt' 36 | if not os.path.exists(base_dir): 37 | os.makedirs(base_dir) 38 | 39 | print("Saving model to %s" % output_dir) 40 | torch.save(model.state_dict(), output_dir) 41 | df_stats = pd.DataFrame(data=training_stats) 42 | df_stats.to_json(base_dir + "training_stats.json") 43 | 44 | 45 | # Load the iterator 46 | with open('./Data/train_dataloader.pkl', 'rb') as handle: 47 | train_dataloader = pickle.load(handle) 48 | 49 | with open('./Data/validation_dataloader.pkl', 'rb') as handle: 50 | validation_dataloader = pickle.load(handle) 51 | 52 | print("dataloader loaded!") 53 | 54 | 55 | config = Config() 56 | model = Model(config).to(config.device) 57 | print("Model created!") 58 | 59 | # Optimizer & Learning Rate Scheduler 60 | optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8) 61 | # Number of training epochs. The BERT authors recommend between 2 and 4. 62 | # We chose to run for 4, but we'll see later that this may be over-fitting the training data. 63 | epochs = config.num_epochs 64 | # Total number of training steps is [number of batches] x [number of epochs]. 65 | # (Note that this is not the same as the number of training samples). 
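# e.g., 5,000 batches per epoch x 1 epoch = 5,000 total steps (illustrative numbers) 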
66 | total_steps = len(train_dataloader) * epochs 
67 | # Create the learning rate scheduler. 
68 | scheduler = get_linear_schedule_with_warmup(optimizer, \ 
69 | num_warmup_steps = 0, # Default value in run_glue.py 
70 | num_training_steps = total_steps) 
71 | 
72 | # We are ready to kick off the training 
73 | # Set the seed value all over the place to make this reproducible. 
74 | seed_val = 42 
75 | random.seed(seed_val) 
76 | np.random.seed(seed_val) 
77 | torch.manual_seed(seed_val) 
78 | torch.cuda.manual_seed_all(seed_val) 
79 | 
80 | # We'll store a number of quantities such as training and validation loss, validation accuracy, and timings. 
81 | training_stats = [] 
82 | 
83 | # Measure the total training time for the whole run. 
84 | total_t0 = time.time() 
85 | 
86 | print("Training start ...") 
87 | # For each epoch 
88 | for epoch_i in range(0, epochs): 
89 | # ========================== 
90 | # Training 
91 | # ========================== 
92 | 
93 | # Perform one full pass over the training set. 
94 | print("") 
95 | print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) 
96 | print('Training...') 
97 | 
98 | # Measure how long the training epoch takes. 
99 | t0 = time.time() 
100 | # Reset the total loss for this epoch. 
101 | total_train_loss = 0 
102 | 
103 | model.train() 
104 | # For each batch of training data... 
105 | for step, batch in enumerate(train_dataloader): 
106 | 
107 | 
108 | # Progress update every 100 batches 
109 | if step % 100 == 0 and not step == 0: 
110 | # Calculate the elapsed time. 
111 | elapsed = format_time(time.time() - t0) 
112 | # Report progress. 
113 | print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) 
114 | # break 
115 | 
116 | # Save by step size 
117 | if step % 1000 == 0 and not step == 0: 
118 | # Record all statistics from this epoch. 
119 | training_stats.append(\ 
120 | {'epoch': epoch_i + 1, \ 
121 | 'step': step, \ 
122 | # 'Training Loss': avg_train_loss, \ 
123 | # 'Training Time': training_time, \ 
124 | }) 
125 | step_marker = str(epoch_i +1 ) + '-' + str(step) 
126 | save_model_step(step_marker, model, training_stats) 
127 | # break 
128 | 
129 | if step % 10000 == 0 and not step == 0: 
130 | break 
131 | 
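# The forward pass and loss below implement the pairwise preference objective: 
# Bert_MLP scores the (question, cs0) and (question, cs1) pairs jointly, and 
# the 2-way cross-entropy against `label` teaches the model which of the two 
# code snippets is preferred (see Bert_MLP.py for the encoder details). 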
132 |         # Unpack this training batch from our dataloader.
133 |         #
134 |         # As we unpack the batch, we'll also copy each tensor to the GPU using the 'to' method.
135 |         #
136 |         # `batch` contains seven pytorch tensors:
137 |         #   [0]: qc0 input ids
138 |         #   [1]: qc0 attention masks
139 |         #   [2]: qc0 token type ids
140 |         #   [3]: qc1 input ids
141 |         #   [4]: qc1 attention masks
142 |         #   [5]: qc1 token type ids
143 |         #   [6]: labels
144 |         b_qc0_input_ids = batch[0].to(config.device)
145 |         b_qc0_input_mask = batch[1].to(config.device)
146 |         b_qc0_input_types = batch[2].to(config.device)
147 |         b_qc1_input_ids = batch[3].to(config.device)
148 |         b_qc1_input_mask = batch[4].to(config.device)
149 |         b_qc1_input_types = batch[5].to(config.device)
150 |         b_labels = batch[6].to(config.device)
151 |         # print('batch qc0 input_ids:', type(b_qc0_input_ids), b_qc0_input_ids.shape)
152 |         # print('batch qc0 input_mask:', type(b_qc0_input_mask), b_qc0_input_mask.shape)
153 |         # print('batch qc0 input_types:', type(b_qc0_input_types), b_qc0_input_types.shape)
154 |         # print('batch qc1 input_ids:', type(b_qc1_input_ids), b_qc1_input_ids.shape)
155 |         # print('batch qc1 input_mask:', type(b_qc1_input_mask), b_qc1_input_mask.shape)
156 |         # print('batch qc1 input_types:', type(b_qc1_input_types), b_qc1_input_types.shape)
157 |         # print('batch labels:', type(b_labels), b_labels.shape)
158 | 
159 |         model.zero_grad()
160 | 
161 |         b_qc0 = (b_qc0_input_ids, b_qc0_input_mask, b_qc0_input_types)
162 |         b_qc1 = (b_qc1_input_ids, b_qc1_input_mask, b_qc1_input_types)
163 |         b_outputs = model(b_qc0, b_qc1)
164 |         # print('batch outputs:', type(b_outputs), b_outputs.shape)
165 |         # exit()
166 | 
167 |         loss = F.cross_entropy(b_outputs, b_labels)
168 |         # print('loss:', type(loss), loss, loss.item())
169 | 
170 |         total_train_loss += loss.item()
171 |         # Perform a backward pass to calculate the gradients.
172 |         loss.backward()
173 | 
174 |         # Clip the norm of the gradients to 1.0.
175 |         # This is to help prevent the "exploding gradients" problem.
176 |         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
177 | 
178 |         # Update parameters and take a step using the computed gradient.
179 |         # The optimizer dictates the "update rule"--how the parameters are
180 |         # modified based on their gradients, the learning rate, etc.
181 |         optimizer.step()
182 | 
183 |         # Update the learning rate.
184 |         scheduler.step()
185 |         # break
186 |     # exit()
187 |     # Average the loss over all batches (underestimated if the epoch broke early at 10,000 steps).
188 |     avg_train_loss = total_train_loss / len(train_dataloader)
189 | 
190 |     # Measure how long this epoch took.
191 |     training_time = format_time(time.time() - t0)
192 | 
193 |     print("")
194 |     print("  Average training loss: {0:.2f}".format(avg_train_loss))
195 |     print("  Training epoch took: {:}".format(training_time))
196 | 
197 |     # ========================================
198 |     #               Validation
199 |     # ========================================
200 | 
201 |     # After the completion of each training epoch, measure our performance on
202 |     # our validation set.
203 | 
204 |     print("")
205 |     print("Running Validation...")
206 | 
207 |     t0 = time.time()
208 |     # Put the model in evaluation mode--the dropout layers behave differently
209 |     # during evaluation.
210 |     model.eval()
211 | 
212 |     # Tracking variables
213 |     total_eval_accuracy = 0
214 |     total_eval_loss = 0
215 |     nb_eval_steps = 0
216 | 
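# ---------------------------------------------------------------------------
# [Editorial sketch, not part of the original file] The validation pass below
# pairs model.eval() (switches dropout layers to inference behavior) with
# torch.no_grad() (skips building the autograd graph). Factored out, the same
# pattern would look like this hypothetical helper (it relies on F and
# flat_accuracy, which utils.py already provides):
def _evaluate_sketch(model, dataloader, device):
    model.eval()                        # inference-mode layers
    total_loss, total_acc = 0.0, 0.0
    with torch.no_grad():               # no gradient bookkeeping
        for batch in dataloader:
            qc0 = tuple(t.to(device) for t in batch[0:3])
            qc1 = tuple(t.to(device) for t in batch[3:6])
            labels = batch[6].to(device)
            outputs = model(qc0, qc1)
            total_loss += F.cross_entropy(outputs, labels).item()
            preds = outputs.argmax(dim=1).cpu().numpy()
            total_acc += flat_accuracy(preds, labels.cpu().numpy())
    return total_loss / len(dataloader), total_acc / len(dataloader)
# ---------------------------------------------------------------------------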
217 |     # Evaluate data for one epoch.
218 |     for batch in validation_dataloader:
219 |         # Unpack this validation batch from our dataloader.
220 |         #
221 |         # As we unpack the batch, we'll also copy each tensor to the GPU using the 'to' method.
222 |         #
223 |         # `batch` contains the same seven pytorch tensors as in training:
224 |         #   [0]-[2]: qc0 input ids, attention masks, token type ids
225 |         #   [3]-[5]: qc1 input ids, attention masks, token type ids
226 |         #   [6]: labels
227 |         b_qc0_input_ids = batch[0].to(config.device)
228 |         b_qc0_input_mask = batch[1].to(config.device)
229 |         b_qc0_input_types = batch[2].to(config.device)
230 |         b_qc1_input_ids = batch[3].to(config.device)
231 |         b_qc1_input_mask = batch[4].to(config.device)
232 |         b_qc1_input_types = batch[5].to(config.device)
233 |         b_labels = batch[6].to(config.device)
234 | 
235 | 
236 |         # Tell pytorch not to bother with constructing the compute graph during
237 |         # the forward pass, since this is only needed for backprop (training).
238 |         with torch.no_grad():
239 |             # Forward pass, calculate logit predictions.
240 |             # token_type_ids is the same as the "segment ids", which
241 |             # differentiate sentence 1 and 2 in 2-sentence tasks. The outputs
242 |             # are raw values, prior to applying an activation function like the softmax.
243 |             b_qc0 = (b_qc0_input_ids, b_qc0_input_mask, b_qc0_input_types)
244 |             b_qc1 = (b_qc1_input_ids, b_qc1_input_mask, b_qc1_input_types)
245 |             b_outputs = model(b_qc0, b_qc1)
246 |             # b_outputs = model(b_input_ids, b_input_mask, b_input_types)
247 |             # print("b_outputs:", type(b_outputs), b_outputs.shape)
248 | 
249 |             loss = F.cross_entropy(b_outputs, b_labels)
250 | 
251 |         # Accumulate the validation loss.
252 |         total_eval_loss += loss.item()
253 | 
254 |         # Move predictions and labels to CPU.
255 |         preds = torch.max(b_outputs.data, 1)[1].cpu().numpy()
256 |         # print("preds:", type(preds), preds.shape)
257 |         labels = b_labels.to('cpu').numpy()
258 |         # print("labels:", type(labels), labels.shape)
259 |         # print(preds)
260 |         # print(labels)
261 | 
262 |         # Calculate the accuracy for this batch of validation pairs.
263 |         total_eval_accuracy += flat_accuracy(preds, labels)
264 |         # break
265 | 
266 |     # Report the final accuracy for this validation run.
267 |     avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
268 |     print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
269 | 
270 |     # Calculate the average loss over all of the batches.
271 |     avg_val_loss = total_eval_loss / len(validation_dataloader)
272 | 
273 |     # Measure how long the validation run took.
274 |     validation_time = format_time(time.time() - t0)
275 | 
276 |     print("  Validation Loss: {0:.2f}".format(avg_val_loss))
277 |     print("  Validation took: {:}".format(validation_time))
278 | 
279 |     # Record all statistics from this epoch.
280 |     training_stats.append(\
281 |         {'epoch': epoch_i + 1, \
282 |          'Training Loss': avg_train_loss, \
283 |          'Valid. Loss': avg_val_loss, \
284 |          'Valid. Accur.': avg_val_accuracy, \
285 |          'Training Time': training_time, \
286 |          'Validation Time': validation_time
287 |         })
288 | 
289 |     save_model(epoch_i + 1, model, training_stats)
290 |     # exit()
291 |     # break
292 | 
293 | print("")
294 | print("Training complete!")
295 | print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
296 | 
297 | 
--------------------------------------------------------------------------------
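Each checkpoint directory written by save_model / save_model_step above holds both the weights (model.ckpt, a raw state_dict) and a training_stats.json dump of the statistics list. A minimal sketch for inspecting and reloading them afterwards; it assumes at least one full epoch was saved, and the epoch1 path is purely illustrative:

    import pandas as pd
    import torch
    from Bert_MLP import Model, Config

    # Inspect the logged statistics (epoch-level columns; step-level rows
    # saved by save_model_step carry NaN for the epoch-only columns).
    stats = pd.read_json('./model_save/epoch1/training_stats.json')
    print(stats[['Training Loss', 'Valid. Loss', 'Valid. Accur.']])

    # Restore the fine-tuned weights into a freshly constructed Model.
    config = Config()
    model = Model(config)
    model.load_state_dict(torch.load('./model_save/epoch1/model.ckpt', map_location=config.device))
    model.eval()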