├── figures ├── workflow.png ├── QueryRewriter.png └── CodeSelector-Workflow.png ├── QueryRewriter ├── train │ ├── utils.py │ ├── prepare_data.py │ ├── ParaphraseDataset.py │ ├── train.py │ └── T5FineTuner.py └── eval │ ├── QueryRewriter_model.py │ └── evaluation.py ├── CodeSelector ├── eval │ ├── utils.py │ ├── Bert_MLP.py │ ├── CodeSelector_model.py │ └── evaluation.py └── train │ ├── utils.py │ ├── Bert_MLP.py │ ├── data_prepare.py │ └── train.py └── README.md /figures/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Que2Code/HEAD/figures/workflow.png -------------------------------------------------------------------------------- /figures/QueryRewriter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Que2Code/HEAD/figures/QueryRewriter.png -------------------------------------------------------------------------------- /figures/CodeSelector-Workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Que2Code/HEAD/figures/CodeSelector-Workflow.png -------------------------------------------------------------------------------- /QueryRewriter/train/utils.py: -------------------------------------------------------------------------------- 1 | # import packages 2 | import argparse 3 | import glob 4 | import os 5 | import json 6 | import time 7 | import logging 8 | import random 9 | import re 10 | from itertools import chain 11 | from string import punctuation 12 | 13 | import nltk 14 | nltk.download('punkt') 15 | from nltk.tokenize import sent_tokenize 16 | 17 | import pandas as pd 18 | import numpy as np 19 | import torch 20 | from torch.utils.data import Dataset, DataLoader 21 | import pytorch_lightning as pl 22 | 23 | from transformers import ( 24 | AdamW, 25 | T5ForConditionalGeneration, 26 | T5Tokenizer, 27 | get_linear_schedule_with_warmup 28 | ) 29 | 30 | # set a seed 31 | def set_seed(seed): 32 | random.seed(seed) 33 | np.random.seed(seed) 34 | torch.manual_seed(seed) 35 | set_seed(42) 36 | 37 | 38 | -------------------------------------------------------------------------------- /CodeSelector/eval/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import time 6 | import datetime 7 | import random 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.model_selection import cross_val_score 12 | import torch 13 | import transformers as ppb 14 | import warnings 15 | from transformers import BertTokenizer 16 | from transformers import BertModel 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | from transformers import AutoTokenizer 20 | from sklearn.model_selection import train_test_split 21 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 22 | from transformers import get_linear_schedule_with_warmup 23 | from transformers import AdamW, BertConfig 24 | warnings.filterwarnings('ignore') 25 | 26 | def flat_accuracy(preds, labels): 27 | pred_flat = preds.flatten() 28 | labels_flat = labels.flatten() 29 | return np.sum(pred_flat == labels_flat) / len(labels_flat) 30 | 31 | def format_time(elapsed): 32 | ''' 33 | Takes a time in seconds 
and returns a string hh:mm:ss 34 | ''' 35 | # round to the nearest second 36 | elapsed_rounded = int(round(elapsed)) 37 | # format as hh:mm:ss 38 | return str(datetime.timedelta(seconds=elapsed_rounded)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /CodeSelector/train/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | import time 6 | import datetime 7 | import random 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.model_selection import cross_val_score 12 | import torch 13 | import transformers as ppb 14 | import warnings 15 | from transformers import BertTokenizer 16 | from transformers import BertModel 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | from transformers import AutoTokenizer 20 | from sklearn.model_selection import train_test_split 21 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 22 | from transformers import get_linear_schedule_with_warmup 23 | from transformers import AdamW, BertConfig 24 | warnings.filterwarnings('ignore') 25 | 26 | def flat_accuracy(preds, labels): 27 | pred_flat = preds.flatten() 28 | labels_flat = labels.flatten() 29 | return np.sum(pred_flat == labels_flat) / len(labels_flat) 30 | 31 | def format_time(elapsed): 32 | ''' 33 | Takes a time in seconds and returns a string hh:mm:ss 34 | ''' 35 | # round to the nearest second 36 | elapsed_rounded = int(round(elapsed)) 37 | # format as hh:mm:ss 38 | return str(datetime.timedelta(seconds=elapsed_rounded)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /QueryRewriter/train/prepare_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | 4 | train_question1 = [] 5 | train_question2 = [] 6 | 7 | # Make train data 8 | with open('./duplicate_questions.train', 'r') as fin: 9 | for line in fin: 10 | src_qid, src_qtitle, tgt_qid, tgt_qtitle = line.strip().split('\t') 11 | duplicate_question = src_qtitle.lower() 12 | master_question = tgt_qtitle.lower() 13 | train_question1.append(duplicate_question) 14 | train_question2.append(master_question) 15 | # break 16 | 17 | assert len(train_question1) == len(train_question2) 18 | train_df = pd.DataFrame({'question1':train_question1, 'question2':train_question2}) 19 | train_df.to_csv('./so_train.csv', index=False, sep="\t") 20 | 21 | 22 | val_question1 = [] 23 | val_question2 = [] 24 | 25 | # Make test data 26 | with open('./duplicate_questions.val', 'r') as fin: 27 | for line in fin: 28 | src_qid, src_qtitle, tgt_qid, tgt_qtitle = line.strip().split('\t') 29 | duplicate_question = src_qtitle.lower() 30 | master_question = tgt_qtitle.lower() 31 | val_question1.append(duplicate_question) 32 | val_question2.append(master_question) 33 | # break 34 | 35 | assert len(val_question1) == len(val_question2) 36 | val_df = pd.DataFrame({'question1':val_question1, 'question2':val_question2}) 37 | val_df.to_csv('./so_val.csv', index=False, sep="\t") 38 | 39 | 40 | -------------------------------------------------------------------------------- /QueryRewriter/train/ParaphraseDataset.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 
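# ParaphraseDataset (below) pairs each duplicate question ("question1") with
# its master question ("question2") from the TSV file written by
# prepare_data.py, prepends the T5 task prefix "paraphrase: " to the source,
# and tokenizes both sides into fixed-length tensors for seq2seq fine-tuning.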
3 | class ParaphraseDataset(Dataset): 4 | def __init__(self, tokenizer, data_dir, type_path, max_len=512): 5 | self.path = os.path.join(data_dir, type_path + '.csv') 6 | 7 | self.source_column = "question1" 8 | self.target_column = "question2" 9 | self.data = pd.read_csv(self.path, sep="\t").astype(str) 10 | 11 | self.max_len = max_len 12 | self.tokenizer = tokenizer 13 | self.inputs = [] 14 | self.targets = [] 15 | 16 | self._build() 17 | 18 | def __len__(self): 19 | return len(self.inputs) 20 | 21 | def __getitem__(self, index): 22 | source_ids = self.inputs[index]["input_ids"].squeeze() 23 | target_ids = self.targets[index]["input_ids"].squeeze() 24 | 25 | src_mask = self.inputs[index]["attention_mask"].squeeze() # might need to squeeze 26 | target_mask = self.targets[index]["attention_mask"].squeeze() # might need to squeeze 27 | 28 | return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask} 29 | 30 | def _build(self): 31 | for idx in range(len(self.data)): 32 | input_, target = self.data.loc[idx, self.source_column], self.data.loc[idx, self.target_column] 33 | 34 | input_ = "paraphrase: "+ input_ + ' ' 35 | target = target + " " 36 | 37 | # tokenize inputs 38 | tokenized_inputs = self.tokenizer.batch_encode_plus( 39 | [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt", truncation='longest_first' 40 | ) 41 | # tokenize targets 42 | tokenized_targets = self.tokenizer.batch_encode_plus( 43 | [target], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt", truncation='longest_first' 44 | ) 45 | 46 | self.inputs.append(tokenized_inputs) 47 | self.targets.append(tokenized_targets) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /CodeSelector/eval/Bert_MLP.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | class Config(object): 4 | 5 | def __init__(self): 6 | self.model_name = 'bert' 7 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 8 | self.num_classes = 2 9 | self.bert_path = './Model' 10 | self.hidden_size = 768 11 | self.tokenizer = BertTokenizer.from_pretrained(self.bert_path) 12 | self.batch_size = 16 13 | self.num_epochs = 10 14 | 15 | 16 | class Model(nn.Module): 17 | 18 | def __init__(self, config): 19 | super(Model, self).__init__() 20 | self.bert = BertModel.from_pretrained(config.bert_path) 21 | for param in self.bert.parameters(): 22 | param.requires_grad = True 23 | self.fc0 = nn.Linear(2*config.hidden_size, 512) 24 | self.fc1 = nn.Linear(512, 128) 25 | self.fc2 = nn.Linear(128, config.num_classes) 26 | 27 | # def forward(self, input_ids, attention_mask, token_type_ids): 28 | def forward(self, qc0_pair, qc1_pair): 29 | 30 | qc0_input_ids, qc0_input_mask, qc0_input_types = qc0_pair[0], qc0_pair[1], qc0_pair[2] 31 | qc1_input_ids, qc1_input_mask, qc1_input_types = qc1_pair[0], qc1_pair[1], qc1_pair[2] 32 | ''' 33 | qc0_last_hidden_states = self.bert(input_ids = qc0_input_ids, \ 34 | attention_mask = qc0_input_mask, \ 35 | token_type_ids = qc0_input_types) 36 | 37 | qc1_last_hidden_states = self.bert(input_ids = qc1_input_ids, \ 38 | attention_mask = qc1_input_mask, \ 39 | token_type_ids = qc1_input_types) 40 | qc0_features = qc0_last_hidden_states[0][:,0,:] 41 | qc1_features = qc1_last_hidden_states[0][:,0,:] 42 | # print('qc0_features:', type(qc0_features), qc0_features.shape) 43 | # print('qc1_features:', type(qc1_features), 
qc1_features.shape) 44 | features = torch.cat((qc0_features, qc1_features), dim=1) 45 | # print("features:", type(features), features.shape) 46 | ''' 47 | _, qc0_pooled = self.bert(input_ids = qc0_input_ids, \ 48 | attention_mask = qc0_input_mask, \ 49 | token_type_ids = qc0_input_types) 50 | 51 | _, qc1_pooled = self.bert(input_ids = qc1_input_ids, \ 52 | attention_mask = qc1_input_mask, \ 53 | token_type_ids = qc1_input_types) 54 | 55 | features = torch.cat((qc0_pooled, qc1_pooled), dim=1) 56 | features = self.fc0(features) 57 | features = self.fc1(features) 58 | out = self.fc2(features) 59 | return out 60 | 61 | 62 | -------------------------------------------------------------------------------- /CodeSelector/train/Bert_MLP.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | class Config(object): 4 | 5 | def __init__(self): 6 | self.model_name = 'bert' 7 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 8 | self.num_classes = 2 9 | self.bert_path = './Model' 10 | self.hidden_size = 768 11 | self.tokenizer = BertTokenizer.from_pretrained(self.bert_path) 12 | self.batch_size = 16 13 | self.num_epochs = 1 14 | 15 | 16 | class Model(nn.Module): 17 | 18 | def __init__(self, config): 19 | super(Model, self).__init__() 20 | self.bert = BertModel.from_pretrained(config.bert_path) 21 | for param in self.bert.parameters(): 22 | param.requires_grad = True 23 | self.fc0 = nn.Linear(2*config.hidden_size, 512) 24 | self.fc1 = nn.Linear(512, 128) 25 | self.fc2 = nn.Linear(128, config.num_classes) 26 | 27 | # def forward(self, input_ids, attention_mask, token_type_ids): 28 | def forward(self, qc0_pair, qc1_pair): 29 | 30 | qc0_input_ids, qc0_input_mask, qc0_input_types = qc0_pair[0], qc0_pair[1], qc0_pair[2] 31 | qc1_input_ids, qc1_input_mask, qc1_input_types = qc1_pair[0], qc1_pair[1], qc1_pair[2] 32 | ''' 33 | qc0_last_hidden_states = self.bert(input_ids = qc0_input_ids, \ 34 | attention_mask = qc0_input_mask, \ 35 | token_type_ids = qc0_input_types) 36 | 37 | qc1_last_hidden_states = self.bert(input_ids = qc1_input_ids, \ 38 | attention_mask = qc1_input_mask, \ 39 | token_type_ids = qc1_input_types) 40 | qc0_features = qc0_last_hidden_states[0][:,0,:] 41 | qc1_features = qc1_last_hidden_states[0][:,0,:] 42 | # print('qc0_features:', type(qc0_features), qc0_features.shape) 43 | # print('qc1_features:', type(qc1_features), qc1_features.shape) 44 | features = torch.cat((qc0_features, qc1_features), dim=1) 45 | # print("features:", type(features), features.shape) 46 | ''' 47 | _, qc0_pooled = self.bert(input_ids = qc0_input_ids, \ 48 | attention_mask = qc0_input_mask, \ 49 | token_type_ids = qc0_input_types) 50 | 51 | _, qc1_pooled = self.bert(input_ids = qc1_input_ids, \ 52 | attention_mask = qc1_input_mask, \ 53 | token_type_ids = qc1_input_types) 54 | 55 | features = torch.cat((qc0_pooled, qc1_pooled), dim=1) 56 | features = self.fc0(features) 57 | features = self.fc1(features) 58 | out = self.fc2(features) 59 | return out 60 | 61 | 62 | -------------------------------------------------------------------------------- /QueryRewriter/train/train.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from T5FineTuner import T5FineTuner 3 | from T5FineTuner import LoggingCallback 4 | from ParaphraseDataset import ParaphraseDataset 5 | 6 | # set arguments 7 | args_dict = dict( 8 | data_dir="", # path for data files 9 | output_dir="", # path to save the 
checkpoints 10 | model_name_or_path='./Paraphrse_Pretrained/', 11 | tokenizer_name_or_path='./Paraphrse_Pretrained/', 12 | max_seq_length=256, 13 | learning_rate=3e-4, 14 | weight_decay=0.0, 15 | adam_epsilon=1e-8, 16 | warmup_steps=0, 17 | train_batch_size=1, 18 | eval_batch_size=1, 19 | num_train_epochs=2, 20 | gradient_accumulation_steps=16, 21 | n_gpu=1, 22 | early_stop_callback=False, 23 | fp_16=False, # if you want to enable 16-bit training then install apex and set this to true 24 | opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties 25 | max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default 26 | seed=42, 27 | ) 28 | 29 | tokenizer = T5Tokenizer.from_pretrained('./Paraphrse_Pretrained/') 30 | # dataset = ParaphraseDataset(tokenizer, 'data', 'dev', 256) 31 | 32 | train_path = "./data/so_train.csv" 33 | val_path = "./data/so_val.csv" 34 | 35 | data_train = pd.read_csv(train_path, sep="\t")#.astype(str) 36 | # print(data_train.head()) 37 | data_val = pd.read_csv(val_path, sep="\t") 38 | print(data_train.shape, data_val.shape) 39 | 40 | if not os.path.exists('t5_paraphrase'): 41 | os.makedirs('t5_paraphrase') 42 | 43 | args_dict.update({'data_dir': 'data', 'output_dir': 't5_paraphrase', 'num_train_epochs':10,'max_seq_length':256}) 44 | args = argparse.Namespace(**args_dict) 45 | print("args_dict:") 46 | print(args_dict) 47 | 48 | checkpoint_callback = pl.callbacks.ModelCheckpoint(\ 49 | filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5 50 | ) 51 | 52 | train_params = dict( 53 | accumulate_grad_batches=args.gradient_accumulation_steps, 54 | gpus=args.n_gpu, 55 | max_epochs=args.num_train_epochs, 56 | # early_stop_callback=False, 57 | precision= 16 if args.fp_16 else 32, 58 | amp_level=args.opt_level, 59 | gradient_clip_val=args.max_grad_norm, 60 | checkpoint_callback=checkpoint_callback, 61 | callbacks=[LoggingCallback()], 62 | ) 63 | 64 | # def get_dataset(tokenizer, type_path, args): 65 | # return ParaphraseDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path, max_len=args.max_seq_length) 66 | 67 | print ("Initialize model") 68 | model = T5FineTuner(args) 69 | 70 | trainer = pl.Trainer(**train_params) 71 | 72 | print (" Training model") 73 | trainer.fit(model) 74 | print ("training finished") 75 | 76 | print ("Saving model") 77 | model.model.save_pretrained('t5_paraphrase') 78 | 79 | print ("Model saved") 80 | -------------------------------------------------------------------------------- /CodeSelector/eval/CodeSelector_model.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from Bert_MLP import Model, Config 3 | 4 | class CodeSelector(object): 5 | 6 | def __init__(self): 7 | self.config = Config() 8 | self.model = self.load_model() 9 | self.tokenizer = self.load_tokenizer() 10 | pass 11 | 12 | def load_model(self): 13 | PATH = './model_save/epoch7/model.ckpt' 14 | model = Model(self.config).to(self.config.device) 15 | model.load_state_dict(torch.load(PATH)) 16 | model.eval() 17 | print('Model Loaded!') 18 | return model 19 | 20 | def load_tokenizer(self): 21 | tokenizer = AutoTokenizer.from_pretrained('./Model') 22 | print("Tokenizer Loaded!") 23 | return tokenizer 24 | pass 25 | 26 | def encode_qc(self, question, cs): 27 | encoded_qc = self.tokenizer(question, cs, padding=True, truncation=True, max_length=128, 
return_tensors='pt') 28 | return encoded_qc 29 | 30 | def get_score(self, question, cs): 31 | # encode qc_0 32 | encoded_qc0 = self.encode_qc(question, cs) 33 | qc0_input_ids = encoded_qc0['input_ids'] 34 | qc0_token_type_ids = encoded_qc0['token_type_ids'] 35 | qc0_attention_masks = encoded_qc0['attention_mask'] 36 | 37 | # encode qc_1 38 | encoded_qc1 = self.encode_qc('', '') 39 | qc1_input_ids = encoded_qc1['input_ids'] 40 | qc1_token_type_ids = encoded_qc1['token_type_ids'] 41 | qc1_attention_masks = encoded_qc1['attention_mask'] 42 | 43 | # to device 44 | b_qc0_input_ids = qc0_input_ids.to(self.config.device) 45 | b_qc0_input_mask = qc0_attention_masks.to(self.config.device) 46 | b_qc0_input_types = qc0_token_type_ids.to(self.config.device) 47 | b_qc1_input_ids = qc1_input_ids.to(self.config.device) 48 | b_qc1_input_mask = qc1_attention_masks.to(self.config.device) 49 | b_qc1_input_types = qc1_token_type_ids.to(self.config.device) 50 | with torch.no_grad(): 51 | qc0 = (b_qc0_input_ids, b_qc0_input_mask, b_qc0_input_types) 52 | qc1 = (b_qc1_input_ids, b_qc1_input_mask, b_qc1_input_types) 53 | outputs = self.model(qc0, qc1) 54 | 55 | score = outputs.data.cpu().numpy()[0][1] 56 | return score 57 | 58 | 59 | def get_candidate_scores(self, question, candidate_answers): 60 | candidate_scores = [] 61 | for cs in candidate_answers: 62 | score = self.get_score(question, cs) 63 | candidate_scores.append(score) 64 | return candidate_scores 65 | 66 | 67 | def main(): 68 | cs_model = CodeSelector() 69 | with open('../../BM25-IR/eval_data.pkl', 'rb') as handler: 70 | eval_data = pickle.load(handler) 71 | 72 | for k, v in eval_data.items(): 73 | question = v['qtitle'] 74 | candidate_answers = [] 75 | best_code = v['best_code'] 76 | similar_code1 = v['similar_code1'] 77 | similar_code2 = v['similar_code2'] 78 | similar_code3 = v['similar_code3'] 79 | similar_code4 = v['similar_code4'] 80 | candidate_answers.append(best_code) 81 | candidate_answers.append(similar_code1) 82 | candidate_answers.append(similar_code2) 83 | candidate_answers.append(similar_code3) 84 | candidate_answers.append(similar_code4) 85 | candidate_scores = cs_model.get_candidate_scores(question, candidate_answers) 86 | print(candidate_scores) 87 | break 88 | pass 89 | 90 | if __name__ == '__main__': 91 | main() 92 | 93 | -------------------------------------------------------------------------------- /QueryRewriter/eval/QueryRewriter_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | from transformers import T5ForConditionalGeneration,T5Tokenizer 5 | from transformers import T5Tokenizer, T5Model 6 | # from transformers import T5Tokenizer, T5EncoderModel 7 | 8 | def set_seed(seed): 9 | torch.manual_seed(seed) 10 | if torch.cuda.is_available(): 11 | torch.cuda.manual_seed_all(seed) 12 | 13 | set_seed(42) 14 | 15 | class QueryRewriter(object): 16 | 17 | def __init__(self): 18 | self.device = self.get_device() 19 | self.model = self.load_model() 20 | self.tokenizer = self.load_tokenizer() 21 | self.input_embeddings = self.load_input_embeddings() 22 | print( type(self.input_embeddings) ) 23 | pass 24 | 25 | def get_device(self): 26 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 | return device 28 | 29 | def load_model(self): 30 | model = T5ForConditionalGeneration.from_pretrained('./t5_paraphrase') 31 | # model = T5Model.from_pretrained('./t5_paraphrase') 32 | model = model.to(self.device) 33 | return model 
34 | 
35 | def load_tokenizer(self): 
36 | # tokenizer = T5Tokenizer.from_pretrained('t5-base') 
37 | tokenizer = T5Tokenizer.from_pretrained('./t5_paraphrase') 
38 | # tokenizer = T5Tokenizer.from_pretrained('./Paraphrse_Pretrained') 
39 | return tokenizer 
40 | 
41 | def load_input_embeddings(self): 
42 | embeddings = self.model.get_input_embeddings() 
43 | return embeddings 
44 | 
45 | def encode(self, query): 
46 | query = query.strip() 
47 | # text = "paraphrase: " + query + " " 
48 | text = query 
49 | input_ids = self.tokenizer.encode(text, return_tensors="pt").to(self.device) 
50 | 
51 | ################ Alternative Method ############################ 
52 | # print("input_ids:", input_ids) 
53 | outputs = self.input_embeddings(input_ids) 
54 | # print("outputs:", outputs.shape) 
55 | outputs = torch.squeeze(outputs) 
56 | output_vec = outputs.cpu().detach().numpy() 
57 | # print("output_vec:", output_vec.shape) 
58 | output_vec = np.mean(output_vec, axis=0) 
59 | # print("outputs:", type(outputs), len(outputs)) 
60 | return output_vec 
61 | 
62 | def paraphrase(self, query): 
63 | query = query.strip() 
64 | text = "paraphrase: " + query + " " 
65 | max_len = 256 
66 | encoding = self.tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt") 
67 | input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device) 
68 | beam_outputs = self.model.generate(\ 
69 | input_ids=input_ids, \ 
70 | attention_mask=attention_masks,\ 
71 | do_sample=True,\ 
72 | max_length=256, \ 
73 | top_k=120,\ 
74 | top_p=0.95,\ 
75 | early_stopping=True,\ 
76 | num_return_sequences=3\ 
77 | ) 
78 | final_outputs = [] 
79 | for beam_output in beam_outputs: 
80 | sent = self.tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True) 
81 | if sent.lower() != query.lower() and sent not in final_outputs: 
82 | final_outputs.append(sent) 
83 | return final_outputs 
84 | 
85 | 
86 | def main(): 
87 | query = "how to iterate the list in python" 
88 | qr_model = QueryRewriter() 
89 | query_vec = qr_model.encode(query) 
90 | print("query_vec:", type(query_vec), query_vec.shape) 
91 | 
92 | paraphrase_q = qr_model.paraphrase(query) 
93 | print(paraphrase_q) 
94 | 
95 | pass 
96 | 
97 | if __name__ == "__main__": 
98 | main() 
99 | 
100 | 
101 | 
-------------------------------------------------------------------------------- 
/CodeSelector/eval/evaluation.py: 
-------------------------------------------------------------------------------- 
1 | import pickle 
2 | import numpy as np 
3 | from model import Our_Model 
4 | from CodeSelector_model import CodeSelector 
5 | from scipy import spatial 
6 | 
7 | def hits_count(candidate_ranks, k): 
8 | ''' 
9 | candidate_ranks: 
10 | list of candidates' ranks; one rank per question; 
11 | length is a number of questions 
12 | rank is a number from 1 to len(candidates of the question) 
13 | e.g. [2, 3] means that first candidate has the rank 2, 
14 | second candidate has the rank 3 
15 | k: number of top-ranked elements (k in hits@k metric) 
16 | result: return Hits@k value for current ranking 
17 | ''' 
18 | count = 0 
19 | for rank in candidate_ranks: 
20 | if rank <= k: 
21 | count += 1 
22 | return count/(len(candidate_ranks)+1e-8) 
23 | 
24 | def dcg_score(candidate_ranks, k): 
25 | ''' 
26 | candidate_ranks: 
27 | list of candidates' ranks; one rank per question; 
28 | length is a number of questions 
29 | rank is a number from 1 to len(candidates of the question) 
30 | e.g.
[2, 3] means that first candidate has the rank 2, 31 | second candidate has the rank 3 32 | k: number of top-ranked elements (k in hits@k metric) 33 | 34 | result: return DCG@k value for current ranking 35 | ''' 36 | score = 0 37 | for rank in candidate_ranks: 38 | if rank <= k: 39 | score += 1/np.log2(1+rank) 40 | return score/(len(candidate_ranks)+1e-8) 41 | 42 | def rank_candidates(candidate_answers, candidate_scores): 43 | ''' 44 | question: a string 45 | candidate_answers: a list of strings 46 | result: a list of pairs (initial position in the list, question) 47 | ''' 48 | # question_vec = model.encode(question) 49 | # print("question_vec:", question_vec) 50 | # candidate_scores = [] 51 | # for answer in candidate_answers: 52 | # answer_vec = model.encode(answer) 53 | # print("answer_vec:", answer_vec) 54 | # score = 1 - spatial.distance.cosine(question_vec, answer_vec) 55 | # print("score:", score) 56 | # candidate_scores.append( score ) 57 | # print("candidate_scores:", candidate_scores) 58 | tl = [(i, candidate_answers[i], candidate_scores[i]) for i in range(len(candidate_answers))] 59 | # print("tl:", tl) 60 | stl = sorted(tl, key=lambda x:x[2], reverse=True) 61 | # print("stl:", stl) 62 | result = [(t[0], t[1]) for t in stl] 63 | # print("result:", result) 64 | return result 65 | 66 | with open('../../BM25-IR/eval_data.pkl', 'rb') as handler: 67 | eval_data = pickle.load(handler) 68 | print('eval_data:', type(eval_data), len(eval_data)) 69 | 70 | 71 | eval_pairs = [] 72 | for k, v in eval_data.items(): 73 | question = v['qtitle'] 74 | best_code = v['best_code'] 75 | similar_code1 = v['similar_code1'] 76 | similar_code2 = v['similar_code2'] 77 | similar_code3 = v['similar_code3'] 78 | similar_code4 = v['similar_code4'] 79 | 80 | eval_pairs.append(( question, \ 81 | best_code, \ 82 | similar_code1, \ 83 | similar_code2, \ 84 | similar_code3, \ 85 | similar_code4)) 86 | print('eval_pairs:', type(eval_pairs), len(eval_pairs)) 87 | 88 | # model = Our_Model() 89 | model = CodeSelector() 90 | 91 | model_ranking = [] 92 | for i, e in enumerate(eval_pairs[:1500]): 93 | # print(i) 94 | question = e[0] 95 | best_code = e[1] 96 | similar_code1 = e[2] 97 | similar_code2 = e[3] 98 | similar_code3 = e[4] 99 | similar_code4 = e[5] 100 | candidate_answers = [] 101 | candidate_answers.append(best_code) 102 | candidate_answers.append(similar_code1) 103 | candidate_answers.append(similar_code2) 104 | candidate_answers.append(similar_code3) 105 | candidate_answers.append(similar_code4) 106 | 107 | # scores_map = model.get_scores_map(question, candidate_answers) 108 | # candidate_scores = model.get_candidate_scores(scores_map) 109 | candidate_scores = model.get_candidate_scores(question, candidate_answers) 110 | # print("question:", question) 111 | # print("candidate_answers:", candidate_answers) 112 | 113 | ranks = rank_candidates(candidate_answers, candidate_scores) 114 | # print("ranks:", ranks) 115 | model_ranking.append( [r[0] for r in ranks].index(0) + 1 ) 116 | # print("model_ranking:", model_ranking) 117 | # break 118 | 119 | for k in [1, 2, 3, 4, 5]: 120 | print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(model_ranking, k), \ 121 | k, hits_count(model_ranking, k))) 122 | 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # I Know What You Are Searching For: Code Snippet Recommendation from Stack Overflow Posts 2 | 3 | ![Workflow of 
Que2Code](./figures/workflow.png) 
4 | 
5 | 
6 | Stack Overflow has been heavily used by software developers to seek programming-related information. 
7 | Typically, when developers encounter a technical problem, they formulate the problem as a query and use a search engine to obtain a list of possibly relevant posts that may contain useful solutions to their problem. 
8 | However, this kind of solution-seeking experience can be difficult and painful because of the **_Query Mismatch_** and **_Information Overload_** problems. To alleviate these challenges, in this work we present a query-driven code recommendation tool, named _Que2Code_, that identifies the best code snippets for a user query from Stack Overflow posts. 
9 | The material used in our work can be accessed via the following links: 
10 | 
11 | - [Source Code Link](https://github.com/beyondacm/Que2Code) 
12 | - [Dataset Download Link](https://drive.google.com/drive/folders/1-qlk1clhgy1Lzx4BIE5bW5fmEQsFSMjv?usp=sharing) 
13 | - [Pretrained Model Download Link](https://drive.google.com/drive/folders/1-E8pPL3ze7jHkR4_J6htAPk7iN94yInt?usp=sharing) 
14 | - [User Study](https://drive.google.com/file/d/1TJdpLwBFfdUcfvK42jLMGKNB4Ny87C_L/view?usp=sharing) 
15 | 
16 | 
17 | Our model contains two stages: 
18 | 
19 | 1. _Semantically-Equivalent Question Retrieval_ 
20 | 2. _Best Code Snippet Recommendation_ 
21 | 
22 | 
23 | Our model has two sub-components, i.e., **QueryRewriter** and **CodeSelector**. **QueryRewriter** can qualitatively retrieve semantically-equivalent questions, and **CodeSelector** can quantitatively rank the most relevant code snippets to the top of the recommendation candidates. 
24 | 
25 | ## QueryRewriter 
26 | ![Workflow of Que2Code](./figures/QueryRewriter.png) 
27 | In the first stage, our **QueryRewriter** component tackles the _query mismatch_ problem. 
28 | The idea of **QueryRewriter** is to use a rewritten version of a query question to cover a variety of different forms of semantically equivalent expressions. 
29 | In particular, we first collect duplicate question pairs from Stack Overflow, because duplicate questions can be considered semantically-equivalent questions expressed in various user descriptions. 
30 | We then frame this problem as a sequence-to-sequence learning problem, which directly maps a technical question to its corresponding duplicate question. 
31 | We train a text-to-text transformer, named **QueryRewriter**, by using the collected duplicate question pairs. 
32 | 
33 | 
34 | 
35 | To train the **QueryRewriter**, please download our duplicate question dataset from the following link: [Dataset Download Link](https://drive.google.com/drive/folders/1-qlk1clhgy1Lzx4BIE5bW5fmEQsFSMjv?usp=sharing) 
36 | 
37 | ```shell 
38 | cd QueryRewriter/train/ 
39 | python prepare_data.py 
40 | python train.py 
41 | ``` 
42 | 
43 | Or, we have released the pre-trained **QueryRewriter** model as described in the paper. You can use the following link to download our pretrained model: [Pretrained Model Download Link](https://drive.google.com/drive/folders/1-E8pPL3ze7jHkR4_J6htAPk7iN94yInt?usp=sharing) 
44 | 
45 | The **QueryRewriter** can be easily used in the following way: 
46 | 
47 | ```python 
48 | query = "how to iterate the list in python" 
49 | # Initialize the model 
50 | QR_model = QueryRewriter() 
51 | # Get the embedding of a query 
52 | query_vec = QR_model.encode(query) 
53 | # Get the paraphrase questions of a query 
54 | paraphrase_q = QR_model.paraphrase(query) 
55 | ``` 
56 | 
57 | ## CodeSelector 
58 | ![Workflow of Que2Code](./figures/CodeSelector-Workflow.png) 
59 | In the second stage, our **CodeSelector** component tackles the _information overload_ challenge. 
60 | To do this, we first collect all the answers of the semantically relevant questions retrieved in the first stage. 
61 | We then extract all the code snippets from the collected answer posts to construct a candidate code snippet pool. 
62 | For the given query question, we pair it with each of the code snippet candidates. We then feed these pairs into the trained **CodeSelector** to estimate their matching scores and judge the preference orders. 
63 | **CodeSelector** can then select the best code snippet from the code snippet candidates via pairwise comparison. 
64 | Our approach is fully data-driven and does not rely on hand-crafted rules. 
65 | 
66 | To train the **CodeSelector**, please download our labeled query-code dataset from the following link: [Dataset Download Link](https://drive.google.com/drive/folders/1-qlk1clhgy1Lzx4BIE5bW5fmEQsFSMjv?usp=sharing). 
67 | Then train the model with the following scripts: 
68 | 
69 | ```shell 
70 | cd CodeSelector/train/ 
71 | python data_prepare.py 
72 | python train.py 
73 | ``` 
74 | 
75 | 
76 | Or, we have also released the pre-trained **CodeSelector** model as described in the paper. You can use the following link to download our pretrained model: [Pretrained Model Download Link](https://drive.google.com/drive/folders/1-E8pPL3ze7jHkR4_J6htAPk7iN94yInt?usp=sharing) 
77 | 
78 | Our **CodeSelector** can also be used easily as follows: 
79 | 
80 | ```python 
81 | # Initialize the CodeSelector model 
82 | cs_model = CodeSelector() 
83 | # Estimate the matching score between a query and a code snippet 
84 | score = cs_model.get_score(query, code_snippet) 
85 | ``` 
86 | 
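87 | ## End-to-End Example 
88 | 
89 | Putting the two stages together, the snippet below sketches one possible end-to-end flow. It is an illustration rather than part of the released pipeline: it assumes both pretrained models have been downloaded, and it uses a small hand-written candidate pool in place of the code snippets extracted from the retrieved posts. 
90 | 
91 | ```python 
92 | # Illustrative sketch: wire QueryRewriter and CodeSelector together. 
93 | qr_model = QueryRewriter() 
94 | cs_model = CodeSelector() 
95 | 
96 | query = "how to iterate the list in python" 
97 | # Stage 1: generate semantically-equivalent rewrites of the query 
98 | # (used to retrieve more relevant questions and their answers). 
99 | paraphrases = qr_model.paraphrase(query) 
100 | 
101 | # Stage 2: score each candidate snippet against the query and keep the best. 
102 | candidate_snippets = [ 
103 |     "for item in my_list: print(item)", 
104 |     "print(', '.join(my_list))", 
105 | ] 
106 | scores = cs_model.get_candidate_scores(query, candidate_snippets) 
107 | best_snippet = candidate_snippets[scores.index(max(scores))] 
108 | ``` 
109 | 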
-------------------------------------------------------------------------------- 
/QueryRewriter/eval/evaluation.py: 
-------------------------------------------------------------------------------- 
1 | import pickle 
2 | import random 
3 | import numpy as np 
4 | import pandas as pd 
5 | from scipy import spatial 
6 | from T5_model import T5_model 
7 | 
8 | random.seed(779) 
9 | 
10 | def hits_count(candidate_ranks, k): 
11 | ''' 
12 | candidate_ranks: 
13 | list of candidates' ranks; one rank per question; 
14 | length is a number of questions 
15 | rank is a number from 1 to len(candidates of the question) 
16 | e.g. [2, 3] means that first candidate has the rank 2, 
17 | second candidate has the rank 3 
18 | k: number of top-ranked elements (k in hits@k metric) 
19 | result: return Hits@k value for current ranking 
20 | ''' 
21 | count = 0 
22 | for rank in candidate_ranks: 
23 | if rank <= k: 
24 | count += 1 
25 | return count/(len(candidate_ranks)+1e-8) 
26 | 
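# Worked example (illustration) for hits_count above and dcg_score below: 
# with candidate_ranks = [1, 3] and k = 3, 
#   Hits@3 = 2/2 = 1.0, since both ranks fall within the top 3; 
#   DCG@3 = (1/log2(1+1) + 1/log2(1+3)) / 2 = (1.0 + 0.5) / 2 = 0.75. 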
27 | def dcg_score(candidate_ranks, k): 
28 | ''' 
29 | candidate_ranks: 
30 | list of candidates' ranks; one rank per question; 
31 | length is a number of questions 
32 | rank is a number from 1 to len(candidates of the question) 
33 | e.g. [2, 3] means that first candidate has the rank 2, 
34 | second candidate has the rank 3 
35 | k: number of top-ranked elements (k in hits@k metric) 
36 | 
37 | result: return DCG@k value for current ranking 
38 | ''' 
39 | score = 0 
40 | for rank in candidate_ranks: 
41 | if rank <= k: 
42 | score += 1/np.log2(1+rank) 
43 | return score/(len(candidate_ranks)+1e-8) 
44 | 
45 | def rank_candidates(question_vec, candidate_answers, model): 
46 | ''' 
47 | question: a string 
48 | candidate_answers: a list of strings 
49 | result: a list of pairs (initial position in the list, question) 
50 | ''' 
51 | # question_vec = model.encode(question) 
52 | # print("question_vec:", question_vec) 
53 | candidate_scores = [] 
54 | for answer in candidate_answers: 
55 | answer_vec = model.encode(answer) 
56 | # print("answer_vec:", answer_vec) 
57 | score = 1 - spatial.distance.cosine(question_vec, answer_vec) 
58 | # print("score:", score) 
59 | candidate_scores.append( score ) 
60 | # print("candidate_scores:", candidate_scores) 
61 | tl = [(i, candidate_answers[i], candidate_scores[i]) for i in range(len(candidate_answers))] 
62 | # print("tl:", tl) 
63 | stl = sorted(tl, key=lambda x:x[2], reverse=True) 
64 | # print("stl:", stl) 
65 | result = [(t[0], t[1]) for t in stl] 
66 | # print("result:", result) 
67 | return result 
68 | 
69 | with open('../../BM25-IR/eval_data.pkl', 'rb') as handler: 
70 | eval_data = pickle.load(handler) 
71 | print('eval_data:', type(eval_data), len(eval_data)) 
72 | 
73 | with open('./eval_data_embed/eval_data_embed_0.pkl', 'rb') as handler: 
74 | eval_data_embed = pickle.load(handler) 
75 | print('eval_data_embed:', type(eval_data_embed), len(eval_data_embed)) 
76 | 
77 | eval_pairs = [] 
78 | for i in range(len(eval_data)): 
79 | eval_pairs.append((eval_data[i]['src_qtitle'], \ 
80 | eval_data[i]['tgt_qtitle'], \ 
81 | eval_data[i]['similar_q1_title'], \ 
82 | eval_data[i]['similar_q2_title'], \ 
83 | eval_data[i]['similar_q3_title'], \ 
84 | eval_data[i]['similar_q4_title'])) 
85 | 
86 | print('eval_pairs:', type(eval_pairs), len(eval_pairs)) 
87 | 
88 | model = T5_model() 
89 | 
90 | model_ranking = [] 
91 | for i, e in enumerate(eval_pairs): 
92 | if i not in eval_data_embed: 
93 | continue 
94 | src_question = e[0].strip().lower() 
95 | tgt_question = e[1].strip().lower() 
96 | similar_q1 = e[2].strip().lower() 
97 | similar_q2 = e[3].strip().lower() 
98 | similar_q3 = e[4].strip().lower() 
99 | similar_q4 = e[5].strip().lower() 
100 | question = src_question 
101 | candidate_answers = [] 
102 | candidate_answers.append(tgt_question) 
103 | candidate_answers.append(similar_q1) 
104 | candidate_answers.append(similar_q2) 
105 | candidate_answers.append(similar_q3) 
106 | candidate_answers.append(similar_q4) 
107 | # print("question:", question) 
108 | # print("candidate_answers:", candidate_answers) 
109 | 
110 | question_vec = eval_data_embed[i]['question_embedding'] 
111 | ranks = rank_candidates(question_vec, candidate_answers, model) 
112 | # print("ranks:", ranks) 
113 | 
model_ranking.append( [r[0] for r in ranks].index(0) + 1 ) 114 | # print("model_ranking:", model_ranking) 115 | # break 116 | 117 | print( "len of model_ranking:", len(model_ranking) ) 118 | 119 | def evaluate( sample_ranking ): 120 | eval_dcg_scores = [] 121 | eval_hits_count = [] 122 | for k in [1, 2 ,3, 4, 5]: 123 | eval_dcg_scores.append( dcg_score(sample_ranking, k) ) 124 | eval_hits_count.append( hits_count(sample_ranking, k)) 125 | return eval_dcg_scores, eval_hits_count 126 | 127 | dcg_scores_result = [] 128 | hits_count_result = [] 129 | 130 | for i in range(10): 131 | sample_ranking = random.sample(model_ranking, 200) 132 | eval_dcg_scores, eval_hits_count = evaluate( sample_ranking ) 133 | # Append 134 | dcg_scores_result.append(eval_dcg_scores) 135 | hits_count_result.append(eval_hits_count) 136 | 137 | dcg_scores_df = pd.DataFrame(dcg_scores_result, columns=['dcg1', 'dcg2', 'dcg3', 'dcg4', 'dcg5']) 138 | hits_count_df = pd.DataFrame(hits_count_result, columns=['hits1', 'hits2', 'hits3', 'hits4', 'hits5']) 139 | # print(dcg_scores_df) 140 | # print(hits_count_df) 141 | print(hits_count_df.describe()) 142 | print(dcg_scores_df.describe()) 143 | 144 | # for k in [1, 2, 3, 4, 5]: 145 | # print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(model_ranking, k), \ 146 | # k, hits_count(model_ranking, k))) 147 | 148 | -------------------------------------------------------------------------------- /QueryRewriter/train/T5FineTuner.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from ParaphraseDataset import * 3 | 4 | class T5FineTuner(pl.LightningModule): 5 | def __init__(self, hparams): 6 | super(T5FineTuner, self).__init__() 7 | self.hparams = hparams 8 | 9 | self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path) 10 | self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path) 11 | 12 | def is_logger(self): 13 | return True #self.trainer.proc_rank <= 0 14 | 15 | def forward( 16 | self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None 17 | ): 18 | return self.model( 19 | input_ids, 20 | attention_mask=attention_mask, 21 | decoder_input_ids=decoder_input_ids, 22 | decoder_attention_mask=decoder_attention_mask, 23 | lm_labels=lm_labels, 24 | ) 25 | 26 | def _step(self, batch): 27 | lm_labels = batch["target_ids"] 28 | lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100 29 | 30 | outputs = self( 31 | input_ids=batch["source_ids"], 32 | attention_mask=batch["source_mask"], 33 | lm_labels=lm_labels, 34 | decoder_attention_mask=batch['target_mask'] 35 | ) 36 | 37 | loss = outputs[0] 38 | 39 | return loss 40 | 41 | def training_step(self, batch, batch_idx): 42 | loss = self._step(batch) 43 | 44 | tensorboard_logs = {"train_loss": loss} 45 | return {"loss": loss, "log": tensorboard_logs} 46 | 47 | def training_epoch_end(self, outputs): 48 | avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean() 49 | tensorboard_logs = {"avg_train_loss": avg_train_loss} 50 | return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs} 51 | 52 | def validation_step(self, batch, batch_idx): 53 | loss = self._step(batch) 54 | return {"val_loss": loss} 55 | 56 | def validation_epoch_end(self, outputs): 57 | avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() 58 | tensorboard_logs = {"val_loss": avg_loss} 59 | return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 
'progress_bar': tensorboard_logs} 60 | 61 | def configure_optimizers(self): 62 | "Prepare optimizer and schedule (linear warmup and decay)" 63 | 64 | model = self.model 65 | no_decay = ["bias", "LayerNorm.weight"] 66 | optimizer_grouped_parameters = [ 67 | { 68 | "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 69 | "weight_decay": self.hparams.weight_decay, 70 | }, 71 | { 72 | "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 73 | "weight_decay": 0.0, 74 | }, 75 | ] 76 | optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon) 77 | self.opt = optimizer 78 | return [optimizer] 79 | 80 | def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None, on_tpu=False, using_native_amp=False, using_lbfgs=False): 81 | if self.trainer.use_tpu: 82 | xm.optimizer_step(optimizer) 83 | else: 84 | optimizer.step() 85 | optimizer.zero_grad() 86 | self.lr_scheduler.step() 87 | 88 | def get_tqdm_dict(self): 89 | tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]} 90 | 91 | return tqdm_dict 92 | 93 | def train_dataloader(self): 94 | train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="so_train", args=self.hparams) 95 | dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, 96 | num_workers=4) 97 | t_total = ( 98 | (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu))) 99 | // self.hparams.gradient_accumulation_steps 100 | * float(self.hparams.num_train_epochs) 101 | ) 102 | scheduler = get_linear_schedule_with_warmup( 103 | self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total 104 | ) 105 | self.lr_scheduler = scheduler 106 | return dataloader 107 | 108 | def val_dataloader(self): 109 | val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="so_val", args=self.hparams) 110 | return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4) 111 | 112 | logger = logging.getLogger(__name__) 113 | 114 | class LoggingCallback(pl.Callback): 115 | def on_validation_end(self, trainer, pl_module): 116 | logger.info("***** Validation results *****") 117 | if pl_module.is_logger(): 118 | metrics = trainer.callback_metrics 119 | # Log results 120 | for key in sorted(metrics): 121 | if key not in ["log", "progress_bar"]: 122 | logger.info("{} = {}\n".format(key, str(metrics[key]))) 123 | 124 | def on_test_end(self, trainer, pl_module): 125 | logger.info("***** Test results *****") 126 | 127 | if pl_module.is_logger(): 128 | metrics = trainer.callback_metrics 129 | 130 | # Log and save results to file 131 | output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt") 132 | with open(output_test_results_file, "w") as writer: 133 | for key in sorted(metrics): 134 | if key not in ["log", "progress_bar"]: 135 | logger.info("{} = {}\n".format(key, str(metrics[key]))) 136 | writer.write("{} = {}\n".format(key, str(metrics[key]))) 137 | 138 | def get_dataset(tokenizer, type_path, args): 139 | return ParaphraseDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path, max_len=args.max_seq_length) 140 | 141 | 142 | -------------------------------------------------------------------------------- /CodeSelector/train/data_prepare.py: -------------------------------------------------------------------------------- 1 | from 
utils import * 2 | from Bert_MLP import Config 3 | 4 | # Import DataSet 5 | df = pd.read_csv('./Data/train_label_data', \ 6 | delimiter='\t', \ 7 | header = None, \ 8 | names = ['qid', 'question', 'aid0', 'cs0', 'aid1', 'cs1', 'label']) 9 | print(type(df), df.shape) 10 | print(df['label'].value_counts()) 11 | 12 | # Get the lists of questions 13 | questions = df.question.values.tolist() 14 | cs0 = df.cs0.values.tolist() 15 | cs1 = df.cs1.values.tolist() 16 | labels = df.label.values.tolist() 17 | 18 | print('questions:', type(questions), len(questions)) 19 | print('cs0:', type(cs0), len(cs0)) 20 | print('cs1:', type(cs1), len(cs1)) 21 | print('labels:', type(labels), len(labels)) 22 | 23 | # Tokenize & Input Formatting 24 | ## Import model/tokenizer 25 | ## Load the BERT model 26 | print("Loading BERT Model...") 27 | bert_model = BertModel.from_pretrained('./Model') 28 | bert_model.cuda() 29 | print("Loading BERT Tokenizer...") 30 | tokenizer = AutoTokenizer.from_pretrained('./Model') 31 | # tokenizer = tokenizer_class.from_pretrained('./Model', do_lower_case=True) 32 | 33 | 34 | # Required Formatting 35 | ## 1. sentences to ids 36 | ## 2. Padding & Truncating 37 | ## 3. Attention Masks 38 | ## 4. 39 | # Combine question + cs0 as the first inputs 40 | encoded_qc0 = tokenizer(questions, cs0, padding=True, truncation=True, max_length=128, return_tensors='pt') 41 | print("encoded_qc0:", type(encoded_qc0), len(encoded_qc0)) 42 | qc0_input_ids = encoded_qc0['input_ids'] 43 | qc0_token_type_ids = encoded_qc0['token_type_ids'] 44 | qc0_attention_masks = encoded_qc0['attention_mask'] 45 | 46 | print("qc0_input_ids:", type(qc0_input_ids), qc0_input_ids.shape) 47 | print("qc0_type_ids:", type(qc0_token_type_ids), qc0_token_type_ids.shape) 48 | print("qc0_attn_mask:", type(qc0_attention_masks), qc0_attention_masks.shape) 49 | 50 | # Convert list to numpy array 51 | qc0_input_ids = qc0_input_ids.cpu().detach().numpy() 52 | qc0_token_type_ids = qc0_token_type_ids.cpu().detach().numpy() 53 | qc0_attention_masks = qc0_attention_masks.cpu().detach().numpy() 54 | print("qc0_input_ids:", type(qc0_input_ids), qc0_input_ids.shape ) 55 | print("qc0_type_ids:", type(qc0_token_type_ids), qc0_token_type_ids.shape ) 56 | print("qc0_attn_mask:", type(qc0_attention_masks), qc0_attention_masks.shape ) 57 | 58 | 59 | encoded_qc1 = tokenizer(questions, cs1, padding=True, truncation=True, max_length=128, return_tensors='pt') 60 | 61 | qc1_input_ids = encoded_qc1['input_ids'] 62 | qc1_token_type_ids = encoded_qc1['token_type_ids'] 63 | qc1_attention_masks = encoded_qc1['attention_mask'] 64 | 65 | # Convert list to numpy array 66 | qc1_input_ids = qc1_input_ids.cpu().detach().numpy() 67 | qc1_token_type_ids = qc1_token_type_ids.cpu().detach().numpy() 68 | qc1_attention_masks = qc1_attention_masks.cpu().detach().numpy() 69 | print("qc1_input_ids:", type(qc1_input_ids), qc1_input_ids.shape ) 70 | print("qc1_type_ids:", type(qc1_token_type_ids), qc1_token_type_ids.shape ) 71 | print("qc1_attn_mask:", type(qc1_attention_masks), qc1_attention_masks.shape ) 72 | 73 | labels = np.asarray(labels) 74 | 75 | with open('./Data/encoded_qc0.pkl', 'wb') as handle: 76 | pickle.dump(encoded_qc0, handle) 77 | 78 | with open('./Data/encoded_qc1.pkl', 'wb') as handle: 79 | pickle.dump(encoded_qc1, handle) 80 | 81 | with open('./Data/labels.pkl', 'wb') as handle: 82 | pickle.dump(labels, handle) 83 | 84 | # Training and Validation Split on qc0 85 | # Use 97% for training and 3% for validation 86 | train_qc0_inputs, validation_qc0_inputs, 
train_labels, validation_labels = train_test_split(qc0_input_ids, \ 87 | labels, \ 88 | random_state=2018, \ 89 | test_size=0.03) 90 | # Do the same for attention_mask 91 | train_qc0_masks, validation_qc0_masks, _, _ = train_test_split(qc0_attention_masks, \ 92 | labels, \ 93 | random_state=2018, \ 94 | test_size = 0.03) 95 | 96 | # Do the same for token_type_ids 97 | train_qc0_types, validation_qc0_types, _, _ = train_test_split(qc0_token_type_ids, \ 98 | labels, \ 99 | random_state=2018, \ 100 | test_size = 0.03) 101 | 102 | # Training and Validation Split on qc1 103 | # Use 97% for training and 3% for validation 104 | train_qc1_inputs, validation_qc1_inputs, _, _ = train_test_split(qc1_input_ids, \ 105 | labels, \ 106 | random_state=2018, \ 107 | test_size=0.03) 108 | # Do the same for attention_mask 109 | train_qc1_masks, validation_qc1_masks, _, _ = train_test_split(qc1_attention_masks, \ 110 | labels, \ 111 | random_state=2018, \ 112 | test_size = 0.03) 113 | 114 | # Do the same for token_type_ids 115 | train_qc1_types, validation_qc1_types, _, _ = train_test_split(qc1_token_type_ids, \ 116 | labels, \ 117 | random_state=2018, \ 118 | test_size = 0.03) 119 | 120 | # Convert to Pytorch Data Types 121 | train_qc0_inputs = torch.tensor(train_qc0_inputs) 122 | train_qc0_masks = torch.tensor(train_qc0_masks) 123 | train_qc0_types = torch.tensor(train_qc0_types) 124 | 125 | train_qc1_inputs = torch.tensor(train_qc1_inputs) 126 | train_qc1_masks = torch.tensor(train_qc1_masks) 127 | train_qc1_types = torch.tensor(train_qc1_types) 128 | 129 | validation_qc0_inputs = torch.tensor(validation_qc0_inputs) 130 | validation_qc0_masks = torch.tensor(validation_qc0_masks) 131 | validation_qc0_types = torch.tensor(validation_qc0_types) 132 | 133 | validation_qc1_inputs = torch.tensor(validation_qc1_inputs) 134 | validation_qc1_masks = torch.tensor(validation_qc1_masks) 135 | validation_qc1_types = torch.tensor(validation_qc1_types) 136 | 137 | train_labels = torch.tensor(train_labels) 138 | validation_labels = torch.tensor(validation_labels) 139 | 140 | print(type(train_qc0_inputs), train_qc0_inputs.shape) 141 | print(type(train_qc0_masks), train_qc0_masks.shape) 142 | print(type(train_qc0_types), train_qc0_types.shape) 143 | 144 | print(type(train_qc1_inputs), train_qc1_inputs.shape) 145 | print(type(train_qc1_masks), train_qc1_masks.shape) 146 | print(type(train_qc1_types), train_qc1_types.shape) 147 | 148 | print(type(train_labels), train_labels.shape) 149 | 150 | 151 | # We'll also create an iterator for our dataset using the torch DataLoader class. 152 | # This helps save on memory during training 153 | # unlike for loop, with an iterator the entire dataset does not need to be loaded into memory 154 | 155 | config = Config() 156 | # batch_size = 32 157 | batch_size = config.batch_size 158 | print("batch_size:", batch_size) 159 | 160 | # Create the DataLoader for our training set. 161 | train_data = TensorDataset(train_qc0_inputs, train_qc0_masks, train_qc0_types, \ 162 | train_qc1_inputs, train_qc1_masks, train_qc1_types, \ 163 | train_labels) 164 | train_sampler = RandomSampler(train_data) 165 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 166 | print(type(train_dataloader)) 167 | 168 | # Create the DataLoader for our validation set. 
169 | validation_data = TensorDataset(validation_qc0_inputs, validation_qc0_masks, validation_qc0_types, \ 170 | validation_qc1_inputs, validation_qc1_masks, validation_qc1_types, \ 171 | validation_labels) 172 | validation_sampler = SequentialSampler(validation_data) 173 | validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) 174 | print(type(validation_dataloader)) 175 | 176 | # Save DataLoader 177 | with open('./Data/train_dataloader.pkl', 'wb') as handle: 178 | pickle.dump(train_dataloader, handle) 179 | 180 | with open('./Data/validation_dataloader.pkl', 'wb') as handle: 181 | pickle.dump(validation_dataloader, handle) 182 | 183 | print("Finished!") 184 | -------------------------------------------------------------------------------- /CodeSelector/train/train.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from Bert_MLP import Model, Config 3 | # from Bert_CNN import Model, Config 4 | 5 | def save_model(epoch, model, training_stats): 6 | # Saving & Loading Fine-tuned Model 7 | ## Saving best-practices: if you use defaults names for the model, 8 | ## you can reload it using from_pretrained() 9 | 10 | base_dir = './model_save/epoch' + str(epoch) + '/' 11 | # sub_dir = 'epoch' + str(epoch) +'/model.ckpt' 12 | output_dir = base_dir + 'model.ckpt' 13 | ## Create output directory if needed 14 | if not os.path.exists(base_dir): 15 | os.makedirs(base_dir) 16 | 17 | print("Saving model to %s" % output_dir) 18 | 19 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 20 | # They can then be reloaded using `from_pretrained()` 21 | # model_to_save = model.module if hasattr(model, 'module') else model 22 | # model_to_save.save_pretrained(output_dir) 23 | # tokenizer.save_pretrained(output_dir) 24 | torch.save(model.state_dict(), output_dir) 25 | 26 | df_stats = pd.DataFrame(data=training_stats) 27 | df_stats.to_json(base_dir + "training_stats.json") 28 | # df_stats.to_pickle(output_dir + "training_stats.pkl") 29 | # Good practice: save your training arguments together with the trained model 30 | # torch.save(args, os.path.join(output_dir, 'training_args.bin')) 31 | 32 | def save_model_step(step, model, training_stats): 33 | 34 | base_dir = './model_save/step' + str(step) + '/' 35 | output_dir = base_dir + 'model.ckpt' 36 | if not os.path.exists(base_dir): 37 | os.makedirs(base_dir) 38 | 39 | print("Saving model to %s" % output_dir) 40 | torch.save(model.state_dict(), output_dir) 41 | df_stats = pd.DataFrame(data=training_stats) 42 | df_stats.to_json(base_dir + "training_stats.json") 43 | 44 | 45 | # Load the iterator 46 | with open('./Data/train_dataloader.pkl', 'rb') as handle: 47 | train_dataloader = pickle.load(handle) 48 | 49 | with open('./Data/validation_dataloader.pkl', 'rb') as handle: 50 | validation_dataloader = pickle.load(handle) 51 | 52 | print("dataloader loaded!") 53 | 54 | 55 | config = Config() 56 | model = Model(config).to(config.device) 57 | print("Model created!") 58 | 59 | # Optimizer & Learning Rate Scheduler 60 | optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8) 61 | # Number of training epochs. The BERT authors recommend between 2 and 4. 62 | # We chose to run for 4, but we'll see later that this may be over-fitting the training data. 63 | epochs = config.num_epochs 64 | # Total number of training steps is [number of batches] x [number of epochs]. 65 | # (Note that this is not the same as the number of training samples). 
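# e.g., 5,000 batches per epoch x 1 epoch = 5,000 total steps (illustrative numbers) 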
66 | total_steps = len(train_dataloader) * epochs 
67 | # Create the learning rate scheduler. 
68 | scheduler = get_linear_schedule_with_warmup(optimizer, \ 
69 | num_warmup_steps = 0, # Default value in run_glue.py 
70 | num_training_steps = total_steps) 
71 | 
72 | # We are ready to kick off the training 
73 | # Set the seed value all over the place to make this reproducible. 
74 | seed_val = 42 
75 | random.seed(seed_val) 
76 | np.random.seed(seed_val) 
77 | torch.manual_seed(seed_val) 
78 | torch.cuda.manual_seed_all(seed_val) 
79 | 
80 | # We'll store a number of quantities such as training and validation loss, validation accuracy, and timings. 
81 | training_stats = [] 
82 | 
83 | # Measure the total training time for the whole run. 
84 | total_t0 = time.time() 
85 | 
86 | print("Training start ...") 
87 | # For each epoch 
88 | for epoch_i in range(0, epochs): 
89 | # ========================== 
90 | # Training 
91 | # ========================== 
92 | 
93 | # Perform one full pass over the training set. 
94 | print("") 
95 | print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) 
96 | print('Training...') 
97 | 
98 | # Measure how long the training epoch takes. 
99 | t0 = time.time() 
100 | # Reset the total loss for this epoch. 
101 | total_train_loss = 0 
102 | 
103 | model.train() 
104 | # For each batch of training data... 
105 | for step, batch in enumerate(train_dataloader): 
106 | 
107 | 
108 | # Progress update every 100 batches 
109 | if step % 100 == 0 and not step == 0: 
110 | # Calculate the elapsed time. 
111 | elapsed = format_time(time.time() - t0) 
112 | # Report progress. 
113 | print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) 
114 | # break 
115 | 
116 | # Save by step size 
117 | if step % 1000 == 0 and not step == 0: 
118 | # Record all statistics from this epoch. 
119 | training_stats.append(\ 
120 | {'epoch': epoch_i + 1, \ 
121 | 'step': step, \ 
122 | # 'Training Loss': avg_train_loss, \ 
123 | # 'Training Time': training_time, \ 
124 | }) 
125 | step_marker = str(epoch_i +1 ) + '-' + str(step) 
126 | save_model_step(step_marker, model, training_stats) 
127 | # break 
128 | 
129 | if step % 10000 == 0 and not step == 0: 
130 | break 
131 | 
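# The forward pass and loss below implement the pairwise preference objective: 
# Bert_MLP scores the (question, cs0) and (question, cs1) pairs jointly, and 
# the 2-way cross-entropy against `label` teaches the model which of the two 
# code snippets is preferred (see Bert_MLP.py for the encoder details). 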
132 |         # Unpack this training batch from our dataloader.
133 |         #
134 |         # As we unpack the batch, we'll also copy each tensor to the GPU using the 'to' method.
135 |         #
136 |         # `batch` contains seven pytorch tensors:
137 |         #   [0]: qc0 input ids
138 |         #   [1]: qc0 attention masks
139 |         #   [2]: qc0 token type ids
140 |         #   [3]: qc1 input ids
141 |         #   [4]: qc1 attention masks
142 |         #   [5]: qc1 token type ids
143 |         #   [6]: labels
144 |         b_qc0_input_ids = batch[0].to(config.device)
145 |         b_qc0_input_mask = batch[1].to(config.device)
146 |         b_qc0_input_types = batch[2].to(config.device)
147 |         b_qc1_input_ids = batch[3].to(config.device)
148 |         b_qc1_input_mask = batch[4].to(config.device)
149 |         b_qc1_input_types = batch[5].to(config.device)
150 |         b_labels = batch[6].to(config.device)
151 |         # print('batch qc0 input_ids:', type(b_qc0_input_ids), b_qc0_input_ids.shape)
152 |         # print('batch qc0 input_mask:', type(b_qc0_input_mask), b_qc0_input_mask.shape)
153 |         # print('batch qc0 input_types:', type(b_qc0_input_types), b_qc0_input_types.shape)
154 |         # print('batch qc1 input_ids:', type(b_qc1_input_ids), b_qc1_input_ids.shape)
155 |         # print('batch qc1 input_mask:', type(b_qc1_input_mask), b_qc1_input_mask.shape)
156 |         # print('batch qc1 input_types:', type(b_qc1_input_types), b_qc1_input_types.shape)
157 |         # print('batch labels:', type(b_labels), b_labels.shape)
158 | 
159 |         model.zero_grad()
160 | 
161 |         b_qc0 = (b_qc0_input_ids, b_qc0_input_mask, b_qc0_input_types)
162 |         b_qc1 = (b_qc1_input_ids, b_qc1_input_mask, b_qc1_input_types)
163 |         b_outputs = model(b_qc0, b_qc1)
164 |         # print('batch outputs:', type(b_outputs), b_outputs.shape)
165 |         # exit()
166 | 
167 |         loss = F.cross_entropy(b_outputs, b_labels)
168 |         # print('loss:', type(loss), loss, loss.item())
169 | 
170 |         total_train_loss += loss.item()
171 |         # Perform a backward pass to calculate the gradients.
172 |         loss.backward()
173 | 
174 |         # Clip the norm of the gradients to 1.0.
175 |         # This is to help prevent the "exploding gradients" problem.
176 |         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
177 | 
178 |         # Update parameters and take a step using the computed gradient.
179 |         # The optimizer dictates the "update rule"--how the parameters are
180 |         # modified based on their gradients, the learning rate, etc.
181 |         optimizer.step()
182 | 
183 |         # Update the learning rate.
184 |         scheduler.step()
185 |         # break
186 |     # exit()
187 |     # Average the loss over all batches (underestimated if the epoch broke early at 10,000 steps).
188 |     avg_train_loss = total_train_loss / len(train_dataloader)
189 | 
190 |     # Measure how long this epoch took.
191 |     training_time = format_time(time.time() - t0)
192 | 
193 |     print("")
194 |     print("  Average training loss: {0:.2f}".format(avg_train_loss))
195 |     print("  Training epoch took: {:}".format(training_time))
196 | 
197 |     # ========================================
198 |     #               Validation
199 |     # ========================================
200 | 
201 |     # After the completion of each training epoch, measure our performance on
202 |     # our validation set.
203 | 
204 |     print("")
205 |     print("Running Validation...")
206 | 
207 |     t0 = time.time()
208 |     # Put the model in evaluation mode--the dropout layers behave differently
209 |     # during evaluation.
210 |     model.eval()
211 | 
212 |     # Tracking variables
213 |     total_eval_accuracy = 0
214 |     total_eval_loss = 0
215 |     nb_eval_steps = 0
216 | 
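# ---------------------------------------------------------------------------
# [Editorial sketch, not part of the original file] The validation pass below
# pairs model.eval() (switches dropout layers to inference behavior) with
# torch.no_grad() (skips building the autograd graph). Factored out, the same
# pattern would look like this hypothetical helper (it relies on F and
# flat_accuracy, which utils.py already provides):
def _evaluate_sketch(model, dataloader, device):
    model.eval()                        # inference-mode layers
    total_loss, total_acc = 0.0, 0.0
    with torch.no_grad():               # no gradient bookkeeping
        for batch in dataloader:
            qc0 = tuple(t.to(device) for t in batch[0:3])
            qc1 = tuple(t.to(device) for t in batch[3:6])
            labels = batch[6].to(device)
            outputs = model(qc0, qc1)
            total_loss += F.cross_entropy(outputs, labels).item()
            preds = outputs.argmax(dim=1).cpu().numpy()
            total_acc += flat_accuracy(preds, labels.cpu().numpy())
    return total_loss / len(dataloader), total_acc / len(dataloader)
# ---------------------------------------------------------------------------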
217 |     # Evaluate data for one epoch.
218 |     for batch in validation_dataloader:
219 |         # Unpack this validation batch from our dataloader.
220 |         #
221 |         # As we unpack the batch, we'll also copy each tensor to the GPU using the 'to' method.
222 |         #
223 |         # `batch` contains the same seven pytorch tensors as in training:
224 |         #   [0]-[2]: qc0 input ids, attention masks, token type ids
225 |         #   [3]-[5]: qc1 input ids, attention masks, token type ids
226 |         #   [6]: labels
227 |         b_qc0_input_ids = batch[0].to(config.device)
228 |         b_qc0_input_mask = batch[1].to(config.device)
229 |         b_qc0_input_types = batch[2].to(config.device)
230 |         b_qc1_input_ids = batch[3].to(config.device)
231 |         b_qc1_input_mask = batch[4].to(config.device)
232 |         b_qc1_input_types = batch[5].to(config.device)
233 |         b_labels = batch[6].to(config.device)
234 | 
235 | 
236 |         # Tell pytorch not to bother with constructing the compute graph during
237 |         # the forward pass, since this is only needed for backprop (training).
238 |         with torch.no_grad():
239 |             # Forward pass, calculate logit predictions.
240 |             # token_type_ids is the same as the "segment ids", which
241 |             # differentiate sentence 1 and 2 in 2-sentence tasks. The outputs
242 |             # are raw values, prior to applying an activation function like the softmax.
243 |             b_qc0 = (b_qc0_input_ids, b_qc0_input_mask, b_qc0_input_types)
244 |             b_qc1 = (b_qc1_input_ids, b_qc1_input_mask, b_qc1_input_types)
245 |             b_outputs = model(b_qc0, b_qc1)
246 |             # b_outputs = model(b_input_ids, b_input_mask, b_input_types)
247 |             # print("b_outputs:", type(b_outputs), b_outputs.shape)
248 | 
249 |             loss = F.cross_entropy(b_outputs, b_labels)
250 | 
251 |         # Accumulate the validation loss.
252 |         total_eval_loss += loss.item()
253 | 
254 |         # Move predictions and labels to CPU.
255 |         preds = torch.max(b_outputs.data, 1)[1].cpu().numpy()
256 |         # print("preds:", type(preds), preds.shape)
257 |         labels = b_labels.to('cpu').numpy()
258 |         # print("labels:", type(labels), labels.shape)
259 |         # print(preds)
260 |         # print(labels)
261 | 
262 |         # Calculate the accuracy for this batch of validation pairs.
263 |         total_eval_accuracy += flat_accuracy(preds, labels)
264 |         # break
265 | 
266 |     # Report the final accuracy for this validation run.
267 |     avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
268 |     print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
269 | 
270 |     # Calculate the average loss over all of the batches.
271 |     avg_val_loss = total_eval_loss / len(validation_dataloader)
272 | 
273 |     # Measure how long the validation run took.
274 |     validation_time = format_time(time.time() - t0)
275 | 
276 |     print("  Validation Loss: {0:.2f}".format(avg_val_loss))
277 |     print("  Validation took: {:}".format(validation_time))
278 | 
279 |     # Record all statistics from this epoch.
280 |     training_stats.append(\
281 |         {'epoch': epoch_i + 1, \
282 |          'Training Loss': avg_train_loss, \
283 |          'Valid. Loss': avg_val_loss, \
284 |          'Valid. Accur.': avg_val_accuracy, \
285 |          'Training Time': training_time, \
286 |          'Validation Time': validation_time
287 |         })
288 | 
289 |     save_model(epoch_i + 1, model, training_stats)
290 |     # exit()
291 |     # break
292 | 
293 | print("")
294 | print("Training complete!")
295 | print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
296 | 
297 | 
--------------------------------------------------------------------------------
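Each checkpoint directory written by save_model / save_model_step above holds both the weights (model.ckpt, a raw state_dict) and a training_stats.json dump of the statistics list. A minimal sketch for inspecting and reloading them afterwards; it assumes at least one full epoch was saved, and the epoch1 path is purely illustrative:

    import pandas as pd
    import torch
    from Bert_MLP import Model, Config

    # Inspect the logged statistics (epoch-level columns; step-level rows
    # saved by save_model_step carry NaN for the epoch-only columns).
    stats = pd.read_json('./model_save/epoch1/training_stats.json')
    print(stats[['Training Loss', 'Valid. Loss', 'Valid. Accur.']])

    # Restore the fine-tuned weights into a freshly constructed Model.
    config = Config()
    model = Model(config)
    model.load_state_dict(torch.load('./model_save/epoch1/model.ckpt', map_location=config.device))
    model.eval()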