├── Appendix ├── README.md ├── baseline_comparison.png ├── propagation model.png ├── subgraph_compare.png └── vfg_of_loop_recur.png ├── Baseline methods ├── AST-Att │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_ast.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── astemb.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── astemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── ASTEmbeder │ │ │ └── .DS_Store │ ├── test.py │ ├── train.py │ ├── util_ast.py │ ├── util_desc.py │ └── utils.py ├── CFG-Att │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_cfg.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cfgemb.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── cfgemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── CFGEmbeder │ │ │ └── .DS_Store │ ├── test.py │ ├── train.py │ ├── util_cfg.py │ ├── util_desc.py │ └── utils.py ├── DeepCS │ ├── .DS_Store │ ├── Tok-Att.code-workspace │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── data_prepare.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── jointemb.py │ ├── modules.py │ ├── output │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ ├── train.py │ ├── user_study.py │ ├── util_desc.py │ ├── util_name.py │ ├── util_tok.py │ └── utils.py ├── MMAN(TDC) │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_cfg.cpython-36.pyc │ │ ├── util_dfg.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cfgemb.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── cfgemb.py │ ├── modules.py │ ├── output │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ ├── train.py │ ├── util_cfg.py │ ├── util_desc.py │ ├── util_dfg.py │ ├── util_tok.py │ └── utils.py ├── MMAN │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_cfg.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── data_prepare │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── util_ast.cpython-36.pyc │ │ │ ├── util_cfg.cpython-36.pyc │ │ │ ├── util_desc.cpython-36.pyc │ │ │ ├── util_tok.cpython-36.pyc │ │ │ └── utils.cpython-36.pyc │ │ ├── util_ast.py │ │ ├── util_cfg.py │ │ ├── util_desc.py │ │ ├── util_tok.py │ │ └── utils.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cfgemb.cpython-36.pyc │ │ │ ├── 
jointemb.cpython-36.pyc │ │ │ ├── multiemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── multiemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── MultiEmbeder │ │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ └── train.py ├── Tok-Att │ ├── .DS_Store │ ├── Tok-Att.code-workspace │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── data_prepare.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── tokenemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── TokenEmbeder │ │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ ├── train.py │ ├── util_desc.py │ ├── util_tok.py │ └── utils.py └── readme.md ├── IR2graph ├── readme.md └── vfg_construct.py ├── README.md ├── dataset ├── README.md ├── preprocessed_dataset │ ├── origin.desc.txt │ ├── origin.ir.txt │ ├── readme.md │ ├── test.desc.txt │ ├── test.ir.txt │ ├── train.desc.txt │ └── train.ir.txt └── raw_dataset │ └── readme.md ├── src ├── README.md ├── configs.py ├── data_loader.py ├── generate_interface.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── cfgemb.cpython-36.pyc │ │ ├── iremb.cpython-36.pyc │ │ ├── jointemb.cpython-36.pyc │ │ └── tokenemb.cpython-36.pyc │ └── iremb.py ├── modules.py ├── test.py ├── train.py ├── util_desc.py ├── util_ir.py └── utils.py └── user study ├── code_search_DeGraphCS.txt ├── code_search_DeepCS.txt ├── code_search_MMAN.txt ├── code_search_UNIF.txt ├── queries.txt └── readme.md /Appendix/README.md: -------------------------------------------------------------------------------- 1 | # Online-Repo 2 | 3 | ## An example to show the recursive call and loop functions can be represented as close as possible from our variable-based flow graph 4 | #### An example "get sum" function realized by loop function 5 | ``` 6 | int get_sum(int N){ 7 | int sum = 0; 8 | while(N != 0){ 9 | sum += N; 10 | N-=1; 11 | } 12 | return sum; 13 | } 14 | ``` 15 | #### An example "get sum" function realized by recursive call 16 | ``` 17 | int get_sum(int N){ 18 | if(N == 0){return N;} 19 | else{ 20 | int sum; 21 | sum = N + get_sum(N-1); 22 | return sum; 23 | } 24 | } 25 | ``` 26 | #### The corresponding generated variable-based flow graphs are shown as below: 27 | Annotation:for recursive call "get_sum(N-1)" in (b), we link the result of "N-1" to input parameter "N", and we regard the return value "sum" of get_sum as the result of "get_sum(N-1)" so that we link return value "sum" to "add" operation): 28 | the constructed graph
29 | 30 | #### To better illustrate the common characteristics of the variable-based flow graphs constructed by deGraphCS from the two different realizations above, we extract the core part of each realization for comparison: 31 | ``` 32 | sum += N; N -= 1; // in the loop version 33 | sum = N + get_sum(N-1); // in the recursive version 34 | ``` 35 | #### The corresponding sub-graphs of the core part are shown below, and the common part can be clearly seen: 36 | the constructed graph
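To make the overlap concrete without the figure, the snippet below is a toy, hand-written sketch (plain Python sets, not the repository's LLVM-IR-based construction in IR2graph/vfg_construct.py): each core snippet is written down as directed VFG edges between variable and operation nodes, and the shared structure is simply the intersection of the two edge sets. The node names are illustrative assumptions, not real IR identifiers.
```
# Hypothetical, simplified edge lists; names are illustrative, not taken from real LLVM IR.
loop_core = {
    ("N", "add"), ("sum", "add"), ("add", "sum"),   # sum += N
    ("N", "sub"), ("1", "sub"), ("sub", "N"),       # N -= 1
}

recursive_core = {
    ("N", "add"), ("sum", "add"), ("add", "sum"),   # sum = N + get_sum(N-1); the callee's return value "sum" feeds "add"
    ("N", "sub"), ("1", "sub"), ("sub", "N"),       # N-1, whose result is linked back to the parameter "N"
}

common = loop_core & recursive_core
print(sorted(common))   # all six edges are shared: the two cores collapse to the same sub-graph
```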
37 | 38 | #### The corresponding ASTs and CFGs generated from the two realizations above are shown below; the difference between them is obvious: 39 | the constructed graph
40 | 41 | ## The details of the equations and algorithms in deGraphCS 42 | ### Implementation details of the attention mechanism over the whole graph and the comments 43 | ``` 44 | self_attn = nn.Linear(self.n_hidden, self.n_hidden) 45 | self_attn_scalar = nn.Linear(self.n_hidden, 1) 46 | ``` 47 | 48 | Here, the function f() in Equation (2) and Equation (4) denotes the first MLP layer, nn.Linear(self.n_hidden, self.n_hidden). 49 | 50 | u_vfg corresponds to the second MLP layer, nn.Linear(self.n_hidden, 1), and can be seen as a high-level representation of the VFG nodes. 51 | 52 | h_vfg is the final embedding of the whole graph, obtained as the weighted sum of each node's final embedding with the weights produced by self_attn_scalar. The same distinction between u and h applies to the corresponding quantities for the comments in Equation (4) and Equation (5). 53 | 54 | ### The aggregation function used in Equation (1) 55 | The aggregation function used in Equation (1) can be illustrated as follows: 56 | propagation model
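As a companion to the figure, the following is a minimal sketch of one such propagation step, written in the style of a gated graph neural network. It is an illustration under stated assumptions, not the repository's actual model code: the class name PropagationSketch, the assumed adjacency layout [batch, n_node, 2 * n_node * n_edge_types], and the single shared linear transform per direction are hypothetical; only the hyper-parameter names (state_dim, annotation_dim, n_edge_types, n_steps) follow the ones that appear in configs.py.
```
import torch
import torch.nn as nn

class PropagationSketch(nn.Module):
    """Illustrative GGNN-style propagation; a sketch, not the repository's exact module."""
    def __init__(self, state_dim, annotation_dim, n_edge_types, n_steps):
        super().__init__()
        self.state_dim, self.annotation_dim = state_dim, annotation_dim
        self.n_edge_types, self.n_steps = n_edge_types, n_steps
        # one linear transform per direction, producing one message block per edge type
        self.edge_in = nn.Linear(state_dim, state_dim * n_edge_types)
        self.edge_out = nn.Linear(state_dim, state_dim * n_edge_types)
        self.gru = nn.GRUCell(2 * state_dim, state_dim)

    def forward(self, annotation, adj):
        # annotation: [b, n, annotation_dim]; adj: [b, n, 2 * n * n_edge_types] (assumed layout)
        b, n, _ = annotation.size()
        # Eq. 1: copy the node annotations into the hidden state and pad the rest with zeros
        h = torch.cat([annotation,
                       annotation.new_zeros(b, n, self.state_dim - self.annotation_dim)], dim=-1)
        for _ in range(self.n_steps):
            # Eq. 2: pass messages along incoming/outgoing edges with edge-type-dependent parameters
            m_in = self.edge_in(h).view(b, n * self.n_edge_types, self.state_dim)
            m_out = self.edge_out(h).view(b, n * self.n_edge_types, self.state_dim)
            a_in = torch.bmm(adj[:, :, :n * self.n_edge_types], m_in)
            a_out = torch.bmm(adj[:, :, n * self.n_edge_types:], m_out)
            # GRU-like update: combine the aggregated messages with the previous hidden state
            a = torch.cat([a_in, a_out], dim=-1)
            h = self.gru(a.view(b * n, -1), h.view(b * n, -1)).view(b, n, -1)
        return h  # final node embeddings, later pooled by the attention read-out described above
```
Under the assumed layout, column j * n_edge_types + t of the first (incoming) half of adj, read at row i, marks an edge of type t from node j into node i. The overall shape matches the [n_node x (n_node * n_edge_types * 2)] adjacency described in the baselines' data_loader.py, although the exact column ordering used there may differ.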
57 | 58 | In the functions above, Eq. 1 is the initialization step, which copies node annotations into the first components 59 | of the hidden state and pads the rest with zeros. 60 | 61 | Eq. 2 is the step that passes information between 62 | different nodes of the graph via incoming and outgoing edges with parameters dependent on the edge 63 | type and direction. 64 | 65 | The remaining are GRU-like updates that incorporate information from the other nodes and from the previous timestep 66 | to update each node’s hidden state. 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Appendix/baseline_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/baseline_comparison.png -------------------------------------------------------------------------------- /Appendix/propagation model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/propagation model.png -------------------------------------------------------------------------------- /Appendix/subgraph_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/subgraph_compare.png -------------------------------------------------------------------------------- /Appendix/vfg_of_loop_recur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/vfg_of_loop_recur.png -------------------------------------------------------------------------------- /Baseline methods/AST-Att/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/util_ast.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/util_ast.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_ASTEmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_attn_out': 0, 6 | 'transform_every_modal': 0, # to make modal more complex? 7 | 'save_attn_weight': 0, 8 | 'use_tanh': 1, 9 | 'use_attn': 1, 10 | 'use_desc_attn': 1, 11 | 12 | # tree lstm 13 | 'treelstm_cell_type': 'nary', # nary or childsum 14 | 15 | # data_params 16 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 17 | #training data 18 | 'train_ast':'train.ast.json', 19 | 'train_desc':'train.desc.h5', 20 | # test data 21 | 'test_ast':'test.ast.json', 22 | 'test_desc':'test.desc.h5', 23 | 24 | #parameters 25 | 'desc_len': 30, 26 | 'n_ast_words': 16000, # len(vocabulary) + 1 27 | 'n_desc_words': 10000, # wait to decide 28 | #vocabulary info 29 | 'vocab_ast':'vocab.ast.json', 30 | 'vocab_desc':'vocab.desc.json', 31 | 32 | #training_params 33 | 'batch_size': 32, 34 | 'nb_epoch': 200, 35 | #'optimizer': 'adam', 36 | 'learning_rate':0.0003, # try 1e-4(paper) 37 | 'adam_epsilon':1e-8, 38 | 'warmup_steps':5000, 39 | 'fp16': False, 40 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 41 | #"See details at https://nvidia.github.io/apex/amp.html" 42 | 43 | # model_params 44 | 'emb_size': 300, 45 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 46 | # recurrent 47 | 'margin': 0.6, 48 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 49 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 
50 | 'dropout':0.1 51 | } 52 | return conf 53 | 54 | -------------------------------------------------------------------------------- /Baseline methods/AST-Att/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .astemb import ASTEmbeder -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/astemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/astemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/output/ASTEmbeder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/output/ASTEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | import torch 15 | 
16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('test begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(data_path, 31 | config['test_ast'], config['vocab_ast'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | collate_fn=batcher(device), shuffle=False, drop_last=False, num_workers=0) 35 | # encode asts and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | code_batch = [tensor for tensor in batch[:2]] 40 | desc_batch = [tensor for tensor in batch[2:4]] 41 | with torch.no_grad(): 42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 43 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 44 | # normalize when sim_measure=='cos' 45 | code_repr = normalize(code_repr) 46 | desc_repr = normalize(desc_repr) 47 | code_reprs.append(code_repr) 48 | desc_reprs.append(desc_repr) 49 | n_processed += batch[2].size(0) # +batch_size 50 | # code_reprs: [n_processed x n_hidden] 51 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 52 | 53 | # calculate similarity 54 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 55 | test_sim_result, test_rank_result = [], [] 56 | for i in tqdm(range(0, n_processed)): 57 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 58 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 59 | negsims = np.negative(sims) 60 | predict = np.argsort(negsims) 61 | 62 | # SuccessRate@k 63 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 64 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 65 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 66 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 67 | # MRR 68 | predict_list = predict.tolist() 69 | rank = predict_list.index(i) 70 | sum_mrr.append(1/float(rank+1)) 71 | 72 | # results need to be saved 73 | predict_20 = [int(k) for k in predict[0:20]] 74 | sim_20 = [sims[k] for k in predict_20] 75 | test_sim_result.append(zip(predict_20, sim_20)) 76 | test_rank_result.append(rank+1) 77 | 78 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 79 | save_path = args.data_path + 'result/' 80 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 81 | #np.save(save_path+sim_result_filename, test_sim_result) 82 | #np.save(save_path+rank_result_filename, test_rank_result) 83 | 84 | 85 | def parse_args(): 86 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 87 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 88 | parser.add_argument('--model', type=str, default='ASTEmbeder', help='model name') 89 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 90 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 91 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. 
'\ 92 | 'Note: should be consistent with the same argument in the repr_code.py') 93 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 94 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 95 | return parser.parse_args() 96 | 97 | 98 | if __name__ == '__main__': 99 | args = parse_args() 100 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 101 | config = getattr(configs, 'config_'+args.model)() 102 | 103 | ##### Define model ###### 104 | logger.info('Constructing Model..') 105 | model = getattr(models, args.model)(config) # initialize the model 106 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 107 | model.load_state_dict(torch.load(ckpt, map_location=device)) 108 | 109 | test(config, model, device) 110 | 111 | 112 | -------------------------------------------------------------------------------- /Baseline methods/AST-Att/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | from utils import UNK_ID 7 | 8 | def make_shuffle_index(args): 9 | dir_path = args.data_path + args.dataset 10 | all_desc_file_path = dir_path + args.all_desc_file 11 | with open(all_desc_file_path, 'r') as all_desc_file: 12 | lines = all_desc_file.readlines() 13 | all_num = int(len(lines)/2) 14 | 15 | index = np.arange(all_num) 16 | np.random.seed(16) 17 | np.random.shuffle(index) 18 | #print(index) 19 | np.save(args.shuffle_index_file, index) 20 | 21 | def split_data(args): 22 | 23 | dir_path = args.data_path + args.dataset 24 | all_desc_file_path = dir_path + args.all_desc_file 25 | train_desc_file_path = dir_path + args.train_desc_file 26 | test_desc_file_path = dir_path + args.test_desc_file 27 | 28 | input_desc = [] 29 | with open(all_desc_file_path, 'r') as all_desc_file: 30 | lines = all_desc_file.readlines() 31 | for line in lines: 32 | if (line[0:10] != 'BeginFunc:'): 33 | input_desc.append(line) 34 | print('number of input desc:\n', len(input_desc)) 35 | 36 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 37 | for i in range(0, args.trainset_num): 38 | train_desc_file.write(input_desc[i]) 39 | for i in range(args.testset_start_ind, args.testset_start_ind+args.testset_num): 40 | test_desc_file.write(input_desc[i]) 41 | 42 | 43 | def create_dict_file(args): 44 | dir_path = args.data_path + args.dataset 45 | desc_file_path = dir_path + args.train_desc_file 46 | 47 | input_desc = [] 48 | with open(desc_file_path, 'r') as desc_file: 49 | input_desc = desc_file.readlines() 50 | desc_words = [] 51 | for i in range(0, len(input_desc)): 52 | input_desc[i] = input_desc[i].rstrip('\n') 53 | desc_word_list = input_desc[i].split() 54 | for desc_word in desc_word_list: 55 | desc_words.append(desc_word) 56 | vocab_desc_info = Counter(desc_words) 57 | print(len(vocab_desc_info)) 58 | 59 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 60 | vocab_desc_index = {'':0, '':1} 61 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 62 | 63 | 64 | vocab_desc_file_path = dir_path + args.vocab_desc_file 65 | desc_dic_str = json.dumps(vocab_desc_index) 66 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 67 | vocab_desc_file.write(desc_dic_str) 68 | 69 | 70 | def 
sents2indexes(sent_file_path, vocab_file_path, maxlen): 71 | phrases, indices = [], [] 72 | with open(sent_file_path, 'r') as sent_file: 73 | sents = sent_file.readlines() 74 | vocab = json.loads(open(vocab_file_path, "r").readline()) 75 | start_index = 0 76 | for i in range(0, len(sents)): 77 | sent = sents[i].rstrip('\n') 78 | word_list = sent.split() 79 | sent_len = min(len(word_list), maxlen) 80 | indices.append((sent_len, start_index)) 81 | for j in range(0, sent_len): 82 | word = word_list[j] 83 | phrases.append(vocab.get(word, UNK_ID)) 84 | start_index += sent_len 85 | output_file_path = sent_file_path[0:-3] + 'h5' 86 | output_file = h5py.File(output_file_path, 'w') 87 | output_file['phrases'] = phrases 88 | output_file['indices'] = indices 89 | output_file.close() 90 | 91 | 92 | def parse_args(): 93 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 94 | 95 | parser.add_argument('--data_path', type=str, default='./data/') 96 | parser.add_argument('--dataset', type=str, default='github/') 97 | 98 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 99 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 100 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 101 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 102 | 103 | parser.add_argument('--trainset_num', type=int, default=32000) 104 | parser.add_argument('--testset_num', type=int, default=1000) 105 | parser.add_argument('--testset_start_ind', type=int, default=32000) 106 | parser.add_argument('--desc_word_num', type=int, default=10000) 107 | parser.add_argument('--desc_maxlen', type=int, default=30) 108 | 109 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 110 | 111 | return parser.parse_args() 112 | 113 | if __name__ == '__main__': 114 | args = parse_args() 115 | 116 | split_data(args) 117 | create_dict_file(args) 118 | 119 | dir_path = args.data_path + args.dataset 120 | # train.desc.txt -> train.desc.h5(and test...) 
121 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 122 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 123 | 124 | -------------------------------------------------------------------------------- /Baseline methods/AST-Att/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | ''' 64 | import nltk 65 | try: nltk.word_tokenize("hello world") 66 | except LookupError: nltk.download('punkt') 67 | 68 | def sent2indexes(sentence, vocab, maxlen): 69 | 70 | def convert_sent(sent, vocab, maxlen): 71 | idxes = np.zeros(maxlen, dtype=np.int64) 72 | idxes.fill(PAD_ID) 73 | tokens = nltk.word_tokenize(sent.strip()) 74 | idx_len = min(len(tokens), maxlen) 75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 76 | return idxes, idx_len 77 | if type(sentence) is list: 78 | inds, lens = [], [] 79 | for sent in sentence: 80 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 82 | inds.append(idxes) 83 | lens.append(idx_len) 84 | return np.vstack(inds), np.vstack(lens) 85 | else: 86 | inds, lens = sent2indexes([sentence], vocab, maxlen) 87 | return inds[0], lens[0] 
88 | ''' 89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 90 | '''indexes: numpy array''' 91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 92 | indexes=filter(lambda i: i!=ignore_tok, indexes) 93 | toks, length = [], 0 94 | for idx in indexes: 95 | toks.append(ivocab.get(idx, '')) 96 | length+=1 97 | return ' '.join(toks), length 98 | 99 | ivocab = {v: k for k, v in vocab.items()} 100 | if indexes.ndim==1:# one sentence 101 | return revert_sent(indexes, ivocab, ignore_tok) 102 | else:# dim>1 103 | sentences, lens =[], [] # a batch of sentences 104 | for inds in indexes: 105 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 106 | sentences.append(sentence) 107 | lens.append(length) 108 | return sentences, lens 109 | 110 | ######################################################################## 111 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/util_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_CFGEmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_every_modal': 0, # to make modal more complex? 
6 | 'save_attn_weight': 0, 7 | 'use_tanh': 1, 8 | 'use_attn': 1, 9 | 10 | # GGNN 11 | 'state_dim': 512, # GGNN hidden state size 12 | 'annotation_dim': 5, 13 | 'n_edge_types': 2, 14 | 'n_node': 200, # could be less than 512, like the maximum nodenum 15 | 'n_steps': 5, # propogation steps number of GGNN 16 | 'output_type': 'no_reduce', 17 | 'batch_size': 32, 18 | 'n_layers': 1, 19 | 'n_hidden': 512, 20 | 'cfg_attn_mode': 'sigmoid_scalar', 21 | 22 | # data_params 23 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 24 | #training data 25 | 'train_cfg':'train.cfg.txt', 26 | 'train_desc':'train.desc.h5', 27 | # test data 28 | 'test_cfg':'test.cfg.txt', 29 | 'test_desc':'test.desc.h5', 30 | 31 | #parameters 32 | 'desc_len': 30, 33 | 'n_desc_words': 10000, # wait to decide 34 | #vocabulary info 35 | 'vocab_desc':'vocab.desc.json', 36 | 37 | #training_params 38 | 'chunk_size': 200000, 39 | 'nb_epoch': 200, 40 | #'optimizer': 'adam', 41 | 'learning_rate':0.0003, # try 1e-4(paper) 42 | 'adam_epsilon':1e-8, 43 | 'warmup_steps':5000, 44 | 'fp16': False, 45 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 46 | #"See details at https://nvidia.github.io/apex/amp.html" 47 | 48 | # model_params 49 | 'emb_size': 300, 50 | # recurrent 51 | 'margin': 0.6, 52 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 53 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 54 | 'dropout': 0.1 55 | } 56 | return conf 57 | 58 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | 11 | from utils import PAD_ID, UNK_ID, indexes2sent 12 | import configs 13 | from util_cfg import get_cfg_npy_info, get_one_cfg_npy_info 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | logging.basicConfig(level=logging.INFO, format="%(message)s") 18 | 19 | 20 | class CodeSearchDataset(data.Dataset): 21 | """ 22 | Dataset that has only positive samples. 
23 | """ 24 | def __init__(self, config, data_dir, f_cfgs, max_node_num, f_descs=None, max_desc_len=None): 25 | 26 | self.max_node_num = max_node_num 27 | self.max_desc_len = max_desc_len 28 | self.n_edge_types = config['n_edge_types'] 29 | self.state_dim = config['state_dim'] 30 | self.annotation_dim = config['annotation_dim'] 31 | 32 | # initialize file path or list of file names 33 | self.training = False 34 | print("Loading Data...") 35 | 36 | self.mark_list = [] 37 | start_index, end_index = [0, 0] 38 | with open(data_dir+f_cfgs, 'r') as cfg_file: 39 | self.cfg_lines = cfg_file.readlines() 40 | for i in range(0, len(self.cfg_lines)): 41 | self.cfg_lines[i] = self.cfg_lines[i].rstrip('\n') 42 | if self.cfg_lines[i][0:10] == 'BeginFunc:' and i != 0: 43 | end_index = i 44 | self.mark_list.append([start_index, end_index]) 45 | start_index = i 46 | self.mark_list.append([start_index, len(self.cfg_lines)]) 47 | 48 | ''' 49 | # cfg_adjmat: [all_num x n_node x (n_node * n_edge_types * 2)] 50 | # cfg_init_input: [all_num x n_node x state_dim] 51 | # cfg_node_mask: [all_num x n_node] 52 | self.cfg_adjmat, self.cfg_init_input, self.cfg_node_mask = get_cfg_npy_info(self.cfg_lines, 53 | self.max_node_num, self.n_edge_types, self.state_dim, self.annotation_dim) 54 | ''' 55 | 56 | if f_descs is not None: 57 | self.training = True 58 | table_desc = tables.open_file(data_dir+f_descs) 59 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 60 | self.idx_descs = table_desc.get_node('/indices')[:] 61 | ''' 62 | if f_descs is not None: 63 | assert len(self.cfg_adjmat)==self.idx_descs.shape[0] 64 | ''' 65 | self.data_len = self.idx_descs.shape[0] 66 | print("{} entries".format(self.data_len)) 67 | 68 | def pad_seq(self, seq, maxlen): 69 | if len(seq) < maxlen: 70 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq))) 71 | seq = seq[:maxlen] 72 | return seq 73 | 74 | def __getitem__(self, offset): 75 | #print('offset:\n', offset) 76 | #print('cfg start_index = {}, end_index = {}'.format(self.mark_list[offset][0], self.mark_list[offset][1])) 77 | 78 | input_cfg_lines = self.cfg_lines[self.mark_list[offset][0]: self.mark_list[offset][1]] 79 | adjmat, init_input, node_mask = get_one_cfg_npy_info(input_cfg_lines, 80 | self.max_node_num, self.n_edge_types, self.state_dim, self.annotation_dim) 81 | 82 | if self.training: 83 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1] 84 | good_desc_len = min(int(len), self.max_desc_len) 85 | good_desc = self.descs[pos: pos+good_desc_len] 86 | good_desc = self.pad_seq(good_desc, self.max_desc_len) 87 | 88 | rand_offset = random.randint(0, self.data_len-1) 89 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1] 90 | bad_desc_len = min(int(len), self.max_desc_len) 91 | bad_desc = self.descs[pos: pos+bad_desc_len] 92 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len) 93 | 94 | return torch.Tensor(init_input), torch.Tensor(adjmat), torch.Tensor(node_mask), good_desc, good_desc_len, bad_desc, bad_desc_len 95 | return torch.Tensor(init_input), torch.Tensor(adjmat), torch.Tensor(node_mask), good_desc, good_desc_len 96 | 97 | def __len__(self): 98 | return self.data_len 99 | 100 | def load_dict(filename): 101 | return json.loads(open(filename, "r").readline()) 102 | #return pickle.load(open(filename, 'rb')) 103 | 104 | 105 | if __name__ == '__main__': 106 | device = 'cpu' 107 | config = getattr(configs, 'config_CFGEmbeder')() 108 | input_dir = './data/github/' 109 | 110 | train_set = CodeSearchDataset(config, input_dir, 
'train.cfg.txt', 512, 'train.desc.h5', 30) 111 | train_data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=5, shuffle=False, drop_last=False, num_workers=1) 112 | print('number of batch:\n', len(train_data_loader)) 113 | ''' 114 | use_set = CodeSearchDataset(input_dir, 'use.tokens.h5', 30) 115 | use_data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1, shuffle=False, num_workers=1) 116 | #print(len(use_data_loader)) 117 | vocab_tokens = load_dict(input_dir+'vocab.tokens.json') 118 | vocab_desc = load_dict(input_dir+'vocab.desc.json') 119 | ''' 120 | vocab_desc = load_dict(input_dir+'vocab.desc.json') 121 | print('============ Train Data ================') 122 | k = 0 123 | for epo in range(0,3): 124 | for batch in train_data_loader: 125 | print("batch[1].size(): ", batch[1].size()) 126 | #batch = tuple([t.numpy() for t in batch]) 127 | init_input, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len = [tensor.to(device) for tensor in batch] 128 | print(adjmat.dtype) 129 | #print(batch) 130 | k+=1 131 | #if k>0: break 132 | print('-------------------------------') 133 | print(indexes2sent(good_desc, vocab_desc)) 134 | #print(indexes2sent(good_desc, vocab_desc)) 135 | 136 | 137 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .cfgemb import CFGEmbeder -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/output/CFGEmbeder/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/output/CFGEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('Test Begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(config, data_path, 31 | config['test_cfg'], config['n_node'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | # encode tokens and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | # batch[0:3]: init_input, adjmat, node_mask 40 | code_batch = [tensor.to(device) for tensor in batch[:3]] 41 | # batch[3:5]: good_desc, good_desc_len 42 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 43 | with torch.no_grad(): 44 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 45 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 46 | # normalize when sim_measure=='cos' 47 | code_repr = normalize(code_repr) 48 | desc_repr = normalize(desc_repr) 49 | code_reprs.append(code_repr) 50 | desc_reprs.append(desc_repr) 51 | n_processed += batch[0].size(0) # +batch_size 52 | # code_reprs: [n_processed x n_hidden] 53 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 54 | 55 | # calculate similarity 56 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 57 | test_sim_result, test_rank_result = [], [] 58 | for i in tqdm(range(0, n_processed)): 59 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 60 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 61 | negsims = np.negative(sims) 62 | predict = np.argsort(negsims) 63 | 64 | # SuccessRate@k 65 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 66 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 67 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 68 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 69 | # MRR 70 | predict_list = predict.tolist() 71 | rank = predict_list.index(i) 72 | sum_mrr.append(1/float(rank+1)) 73 | 74 | # results need to be saved 75 | predict_20 = [int(k) for k in predict[0:20]] 76 | sim_20 = [sims[k] for k in predict_20] 77 | test_sim_result.append(zip(predict_20, sim_20)) 78 | test_rank_result.append(rank+1) 79 | 80 | logger.info(f'R@1={np.mean(sum_1)}, 
R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 81 | save_path = args.data_path + 'result/' 82 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 83 | np.save(save_path+sim_result_filename, test_sim_result) 84 | np.save(save_path+rank_result_filename, test_rank_result) 85 | 86 | 87 | def parse_args(): 88 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 89 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 90 | parser.add_argument('--model', type=str, default='CFGEmbeder', help='model name') 91 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 92 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 93 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\ 94 | 'Note: should be consistent with the same argument in the repr_code.py') 95 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 96 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 97 | return parser.parse_args() 98 | 99 | 100 | if __name__ == '__main__': 101 | args = parse_args() 102 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 103 | config = getattr(configs, 'config_'+args.model)() 104 | 105 | ##### Define model ###### 106 | logger.info('Constructing Model..') 107 | model = getattr(models, args.model)(config) # initialize the model 108 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 109 | model.load_state_dict(torch.load(ckpt, map_location=device)) 110 | 111 | test(config, model, device) 112 | 113 | 114 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | from utils import UNK_ID 7 | 8 | def make_shuffle_index_num(args, all_num): 9 | index = np.arange(all_num) 10 | np.random.seed(16) 11 | np.random.shuffle(index) 12 | print('index:\n', index) 13 | np.save(args.shuffle_index_file, index) 14 | 15 | def make_shuffle_index(args): 16 | dir_path = args.data_path + args.dataset 17 | all_desc_file_path = dir_path + args.all_desc_file 18 | with open(all_desc_file_path, 'r') as all_desc_file: 19 | lines = all_desc_file.readlines() 20 | all_num = int(len(lines)/2) 21 | print('all_num of desc:\n', all_num) 22 | 23 | index = np.arange(all_num) 24 | np.random.seed(16) 25 | np.random.shuffle(index) 26 | print('index:\n', index) 27 | np.save(args.shuffle_index_file, index) 28 | 29 | def split_data(args): 30 | index = np.load(args.shuffle_index_file) 31 | 32 | dir_path = args.data_path + args.dataset 33 | all_desc_file_path = dir_path + args.all_desc_file 34 | train_desc_file_path = dir_path + args.train_desc_file 35 | test_desc_file_path = dir_path + args.test_desc_file 36 | 37 | input_desc = [] 38 | with open(all_desc_file_path, 'r') as all_desc_file: 39 | lines = all_desc_file.readlines() 40 | for line in lines: 41 | if (line[0:10] != 'BeginFunc:'): 42 | input_desc.append(line) 43 | print('number of input desc:\n', len(input_desc)) 44 | 45 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as 
test_desc_file: 46 | for i in range(0, args.trainset_num): 47 | train_desc_file.write(input_desc[index[i]]) 48 | for i in range(32000, 32000+args.testset_num): 49 | test_desc_file.write(input_desc[index[i]]) 50 | 51 | 52 | def create_dict_file(args): 53 | dir_path = args.data_path + args.dataset 54 | desc_file_path = dir_path + args.train_desc_file 55 | 56 | input_desc = [] 57 | with open(desc_file_path, 'r') as desc_file: 58 | input_desc = desc_file.readlines() 59 | desc_words = [] 60 | for i in range(0, len(input_desc)): 61 | input_desc[i] = input_desc[i].rstrip('\n') 62 | desc_word_list = input_desc[i].split() 63 | for desc_word in desc_word_list: 64 | desc_words.append(desc_word) 65 | vocab_desc_info = Counter(desc_words) 66 | print(len(vocab_desc_info)) 67 | 68 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 69 | vocab_desc_index = {'':0, '':1} 70 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 71 | 72 | 73 | vocab_desc_file_path = dir_path + args.vocab_desc_file 74 | desc_dic_str = json.dumps(vocab_desc_index) 75 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 76 | vocab_desc_file.write(desc_dic_str) 77 | 78 | 79 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 80 | phrases, indices = [], [] 81 | with open(sent_file_path, 'r') as sent_file: 82 | sents = sent_file.readlines() 83 | vocab = json.loads(open(vocab_file_path, "r").readline()) 84 | start_index = 0 85 | for i in range(0, len(sents)): 86 | sent = sents[i].rstrip('\n') 87 | word_list = sent.split() 88 | sent_len = min(len(word_list), maxlen) 89 | indices.append((sent_len, start_index)) 90 | for j in range(0, sent_len): 91 | word = word_list[j] 92 | phrases.append(vocab.get(word, UNK_ID)) 93 | start_index += sent_len 94 | output_file_path = sent_file_path[0:-3] + 'h5' 95 | output_file = h5py.File(output_file_path, 'w') 96 | output_file['phrases'] = phrases 97 | output_file['indices'] = indices 98 | output_file.close() 99 | 100 | 101 | def parse_args(): 102 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 103 | 104 | parser.add_argument('--data_path', type=str, default='./data/') 105 | parser.add_argument('--dataset', type=str, default='github/') 106 | 107 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 108 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 109 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 110 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 111 | 112 | parser.add_argument('--trainset_num', type=int, default=32000) 113 | parser.add_argument('--testset_num', type=int, default=1000) 114 | parser.add_argument('--desc_word_num', type=int, default=10000) 115 | parser.add_argument('--desc_maxlen', type=int, default=30) 116 | 117 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 118 | 119 | return parser.parse_args() 120 | 121 | if __name__ == '__main__': 122 | args = parse_args() 123 | 124 | #make_shuffle_index_num(args, 33000) 125 | #split_data(args) 126 | #create_dict_file(args) 127 | 128 | dir_path = args.data_path + args.dataset 129 | # train.desc.txt -> train.desc.h5(and test...) 
130 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 131 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 132 | 133 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | 64 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 65 | '''indexes: numpy array''' 66 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 67 | indexes=filter(lambda i: i!=ignore_tok, indexes) 68 | toks, length = [], 0 69 | for idx in indexes: 70 | toks.append(ivocab.get(idx, '')) 71 | length+=1 72 | return ' '.join(toks), length 73 | 74 | ivocab = {v: k for k, v in vocab.items()} 75 | if indexes.ndim==1:# one sentence 76 | return revert_sent(indexes, ivocab, ignore_tok) 77 | else:# dim>1 78 | sentences, lens =[], [] # a batch of sentences 79 | for inds in indexes: 80 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 81 | sentences.append(sentence) 82 | lens.append(length) 83 | return sentences, lens 84 | 85 | ######################################################################## 86 | 
-------------------------------------------------------------------------------- /Baseline methods/DeepCS/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/DeepCS/Tok-Att.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../CFG-Att" 5 | }, 6 | { 7 | "path": "../AST-Att" 8 | }, 9 | { 10 | "path": "." 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_JointEmbeder(): 3 | conf = { 4 | # data_params 5 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 6 | #training data 7 | 'train_name':'train.name.h5', 8 | 'train_tokens':'train.token.h5', 9 | 'train_desc':'train.desc.h5', 10 | # test data 11 | 'test_name':'test.name.h5', 12 | 'test_tokens':'test.token.h5', 13 | 'test_desc':'test.desc.h5', 14 | # user study data 15 | 'all_name': 'all.name.h5', 16 | 'all_tokens': 'all.token.h5', 17 | 'query_desc': 'query.desc.h5', 18 | #parameters 19 | 'name_len': 6, 20 | 'tokens_len': 50, 21 | 'desc_len': 30, 22 | 'n_words': 10000, # len(vocabulary) + 1 23 | #vocabulary info 24 | 'vocab_name':'vocab.name.json', 25 | 'vocab_tokens':'vocab.token.json', 26 | 'vocab_desc':'vocab.desc.json', 27 | 28 | #training_params 29 | 'batch_size': 32, 30 | 'nb_epoch': 200, 31 | #'optimizer': 'adam', 32 | 'learning_rate':0.0003, # try 1e-4(paper) 33 | 'adam_epsilon':1e-8, 34 | 'warmup_steps':5000, 35 | 'fp16': False, 36 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
37 | #"See details at https://nvidia.github.io/apex/amp.html" 38 | 39 | # model_params 40 | 'use_desc_attn': 1, 41 | 'use_tanh': 1, 42 | 'emb_size': 512, 43 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 44 | 'lstm_dims': 256, 45 | # recurrent 46 | 'margin': 0.6, 47 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 48 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 49 | 'dropout':0, 50 | } 51 | return conf 52 | 53 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .jointemb import JointEmbeder -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/jointemb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | import torch.nn.functional as F 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | parentPath = os.path.abspath("..") 13 | sys.path.insert(0, parentPath)# add parent folder to path so as to import common modules 14 | from modules import SeqEncoder, BOWEncoder, SeqEncoder2 15 | 16 | class JointEmbeder(nn.Module): 17 | def __init__(self, config): 18 | super(JointEmbeder, self).__init__() 19 | self.conf = config 20 | self.margin = config['margin'] 21 | self.dropout = config['dropout'] 22 | self.n_hidden = config['n_hidden'] 23 | 24 | self.name_encoder = SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims']) 25 | self.tok_encoder = BOWEncoder(config['n_words'],config['emb_size'],config['n_hidden']) 26 | self.desc_encoder = SeqEncoder2(config['n_words'],config['emb_size'],config['n_hidden']) 27 | 28 | self.w_name = 
nn.Linear(2*config['lstm_dims'], config['n_hidden']) 29 | self.w_tok = nn.Linear(config['emb_size'], config['n_hidden']) 30 | #self.w_desc = nn.Linear(2*config['lstm_dims'], config['n_hidden']) 31 | self.fuse3 = nn.Linear(config['n_hidden'], config['n_hidden']) 32 | 33 | self.self_attn2 = nn.Linear(self.n_hidden, self.n_hidden) 34 | self.self_attn_scalar2 = nn.Linear(self.n_hidden, 1) 35 | 36 | self.init_weights() 37 | 38 | def init_weights(self):# Initialize Linear Weight 39 | for m in [self.w_name, self.w_tok, self.fuse3]: 40 | m.weight.data.uniform_(-0.1, 0.1) #nn.init.xavier_normal_(m.weight) 41 | nn.init.constant_(m.bias, 0.) 42 | 43 | def code_encoding(self, name, name_len, tokens, tok_len): 44 | name_repr = self.name_encoder(name, name_len) 45 | tok_repr = self.tok_encoder(tokens, tok_len) 46 | code_repr = self.fuse3(torch.tanh(self.w_name(name_repr)+self.w_tok(tok_repr))) 47 | return code_repr 48 | 49 | 50 | def desc_encoding(self, desc, desc_len): 51 | batch_size = desc.size()[0] 52 | desc_enc_hidden = self.desc_encoder.init_hidden(batch_size) 53 | # desc_enc_hidden: [2 x batch_size x n_hidden] 54 | desc_feat, desc_enc_hidden = self.desc_encoder(desc, desc_len, desc_enc_hidden) 55 | desc_enc_hidden = desc_enc_hidden[0] 56 | 57 | if self.conf['use_desc_attn']: 58 | seq_len = desc_feat.size()[1] 59 | 60 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 61 | unpack_len_list = desc_len.long().to(device) 62 | range_tensor = torch.arange(seq_len).to(device) 63 | mask_1forgt0 = range_tensor[None, :] < unpack_len_list[:, None] 64 | mask_1forgt0 = mask_1forgt0.reshape(-1, seq_len) 65 | 66 | desc_sa_tanh = torch.tanh(self.self_attn2(desc_feat.reshape(-1, self.n_hidden))) # [(batch_sz * seq_len) x n_hidden] 67 | desc_sa_tanh = F.dropout(desc_sa_tanh, self.dropout, training=self.training) 68 | desc_sa_tanh = self.self_attn_scalar2(desc_sa_tanh).reshape(-1, seq_len) # [batch_sz x seq_len] 69 | desc_feat = desc_feat.reshape(-1, seq_len, self.n_hidden) 70 | 71 | self_attn_desc_feat = None 72 | for _i in range(batch_size): 73 | desc_sa_tanh_one = torch.masked_select(desc_sa_tanh[_i, :], mask_1forgt0[_i, :]).reshape(1, -1) 74 | # attn_w_one: [1 x 1 x seq_len] 75 | attn_w_one = F.softmax(desc_sa_tanh_one, dim=1).reshape(1, 1, -1) 76 | 77 | # attn_feat_one: [1 x seq_len x n_hidden] 78 | attn_feat_one = torch.masked_select(desc_feat[_i, :, :].reshape(1, seq_len, self.n_hidden), 79 | mask_1forgt0[_i, :].reshape(1, seq_len, 1)).reshape(1, -1, self.n_hidden) 80 | # out_to_cat: [1 x n_hidden] 81 | out_to_cat = torch.bmm(attn_w_one, attn_feat_one).reshape(1, self.n_hidden) 82 | # self_attn_code_feat: [batch_sz x n_hidden] 83 | self_attn_desc_feat = out_to_cat if self_attn_desc_feat is None else torch.cat( 84 | (self_attn_desc_feat, out_to_cat), 0) 85 | 86 | else: 87 | self_attn_desc_feat = desc_enc_hidden.reshape(batch_size, self.n_hidden) 88 | 89 | if self.conf['use_tanh']: 90 | self_attn_desc_feat = torch.tanh(self_attn_desc_feat) 91 | 92 | # desc_feat: [batch_size x n_hidden] 93 | return self_attn_desc_feat 94 | 95 | 96 | def similarity(self, code_vec, desc_vec): 97 | assert self.conf['sim_measure'] in ['cos', 'poly', 'euc', 'sigmoid', 'gesd', 'aesd'], "invalid similarity measure" 98 | if self.conf['sim_measure']=='cos': 99 | return F.cosine_similarity(code_vec, desc_vec) 100 | elif self.conf['sim_measure']=='poly': 101 | return (0.5*torch.matmul(code_vec, desc_vec.t()).diag()+1)**2 102 | elif self.conf['sim_measure']=='sigmoid': 103 | return torch.tanh(torch.matmul(code_vec, 
desc_vec.t()).diag()+1) 104 | elif self.conf['sim_measure'] in ['euc', 'gesd', 'aesd']: 105 | euc_dist = torch.dist(code_vec, desc_vec, 2) # or torch.norm(code_vec-desc_vec,2) 106 | euc_sim = 1 / (1 + euc_dist) 107 | if self.conf['sim_measure']=='euc': return euc_sim 108 | sigmoid_sim = torch.sigmoid(torch.matmul(code_vec, desc_vec.t()).diag()+1) 109 | if self.conf['sim_measure']=='gesd': 110 | return euc_sim * sigmoid_sim 111 | elif self.conf['sim_measure']=='aesd': 112 | return 0.5*(euc_sim+sigmoid_sim) 113 | 114 | def forward(self, name, name_len, tokens, tok_len, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len): 115 | # code_repr: [batch_sz x n_hidden] 116 | code_repr = self.code_encoding(name, name_len, tokens, tok_len) 117 | # desc_repr: [batch_sz x n_hidden] 118 | desc_anchor_repr = self.desc_encoding(desc_anchor, desc_anchor_len) 119 | desc_neg_repr = self.desc_encoding(desc_neg, desc_neg_len) 120 | 121 | # sim: [batch_sz] 122 | anchor_sim = self.similarity(code_repr, desc_anchor_repr) 123 | neg_sim = self.similarity(code_repr, desc_neg_repr) 124 | 125 | loss = (self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean() 126 | 127 | return loss -------------------------------------------------------------------------------- /Baseline methods/DeepCS/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/DeepCS/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/DeepCS/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['KMP_DUPLICATE_LIB_OK']='True' 3 | import sys 4 | import traceback 5 | import numpy as np 6 | import argparse 7 | import threading 8 | import codecs 9 | import logging 10 | from tqdm import tqdm 11 | logger = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.INFO, format="%(message)s") 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('test begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(data_path, 31 | config['test_name'], config['name_len'], 32 | config['test_tokens'], config['tokens_len'], 33 | config['test_desc'], config['desc_len']) 34 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=2000, 35 | shuffle=False, drop_last=False, num_workers=1) 36 | # encode tokens and descs 37 | code_reprs, desc_reprs = [], [] 38 | n_processed = 0 39 | for batch in data_loader: 40 | code_batch = [tensor.to(device) for tensor in batch[:4]] 41 | desc_batch = [tensor.to(device) for tensor in batch[4:6]] 42 | with torch.no_grad(): 43 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 44 | desc_repr = 
model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 45 | # normalize when sim_measure=='cos' 46 | code_repr = normalize(code_repr) 47 | desc_repr = normalize(desc_repr) 48 | code_reprs.append(code_repr) 49 | desc_reprs.append(desc_repr) 50 | n_processed += batch[0].size(0) # +batch_size 51 | # code_reprs: [n_processed x n_hidden] 52 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 53 | 54 | # calculate similarity 55 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 56 | test_sim_result, test_rank_result = [], [] 57 | for i in tqdm(range(0, n_processed)): 58 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 59 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 60 | negsims = np.negative(sims) 61 | predict = np.argsort(negsims) 62 | 63 | # SuccessRate@k 64 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 65 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 66 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 67 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 68 | # MRR 69 | predict_list = predict.tolist() 70 | rank = predict_list.index(i) 71 | sum_mrr.append(1/float(rank+1)) 72 | 73 | # results need to be saved 74 | predict_20 = [int(k) for k in predict[0:20]] 75 | sim_20 = [sims[k] for k in predict_20] 76 | test_sim_result.append(zip(predict_20, sim_20)) 77 | test_rank_result.append(rank+1) 78 | 79 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 80 | save_path = args.data_path + 'result/' 81 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 82 | np.save(save_path+sim_result_filename, test_sim_result) 83 | np.save(save_path+rank_result_filename, test_rank_result) 84 | 85 | 86 | def parse_args(): 87 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 88 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 89 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name') 90 | parser.add_argument('-d', '--dataset', type=str, default='github11', help='name of dataset.java, python') 91 | parser.add_argument('--reload_from', type=int, default=200, help='epoch to reload from') 92 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 93 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 94 | return parser.parse_args() 95 | 96 | 97 | if __name__ == '__main__': 98 | args = parse_args() 99 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 100 | config = getattr(configs, 'config_'+args.model)() 101 | 102 | ##### Define model ###### 103 | logger.info('Constructing Model..') 104 | model = getattr(models, args.model)(config) # initialize the model 105 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 106 | model.load_state_dict(torch.load(ckpt, map_location=device)) 107 | 108 | test(config, model, device) 109 | 110 | 111 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/user_study.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | 
from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | 13 | import torch 14 | 15 | import models, configs, data_loader 16 | from modules import get_cosine_schedule_with_warmup 17 | from utils import similarity, normalize 18 | from data_loader import * 19 | 20 | 21 | def test(config, model, device): 22 | logger.info('Test Begin...') 23 | 24 | model.eval() 25 | model.to(device) 26 | 27 | 28 | data_path = args.data_path+args.dataset+'/' 29 | 30 | code_base_set = eval(config['dataset_name'])(data_path, 31 | config['all_name'], config['name_len'], 32 | config['all_tokens'], config['tokens_len']) 33 | code_data_loader = torch.utils.data.DataLoader(dataset=code_base_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | 36 | code_reprs = [] 37 | code_processed = 0 38 | for batch in code_data_loader: 39 | # batch[0:4]: name, name_len, token, token_len 40 | code_batch = [tensor.to(device) for tensor in batch[:4]] 41 | with torch.no_grad(): 42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 43 | code_repr = normalize(code_repr) 44 | code_reprs.append(code_repr) 45 | code_processed += batch[0].size(0) # +batch_size 46 | # code_reprs: [code_processed x n_hidden] 47 | code_reprs = np.vstack(code_reprs) 48 | print('processed code num: ', code_processed) 49 | 50 | 51 | query_desc_set = eval(config['dataset_name'])(data_path, 52 | f_descs=config['query_desc'], max_desc_len=config['desc_len']) 53 | desc_data_loader = torch.utils.data.DataLoader(dataset=query_desc_set, batch_size=32, 54 | shuffle=False, drop_last=False, num_workers=1) 55 | 56 | desc_reprs = [] 57 | desc_processed = 0 58 | for batch in desc_data_loader: 59 | # batch[0:2]: good_desc, good_desc_len 60 | desc_batch = [tensor.to(device) for tensor in batch[0:2]] 61 | with torch.no_grad(): 62 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hidden_size] 63 | desc_repr = normalize(desc_repr) 64 | desc_reprs.append(desc_repr) 65 | desc_processed += batch[0].size(0) # +batch_size 66 | # desc_reprs: [desc_processed x n_hidden] 67 | desc_reprs = np.vstack(desc_reprs) 68 | print('processed desc num: ', desc_processed) 69 | 70 | 71 | query_desc_index_file_path = data_path + args.query_desc_index_file 72 | desc_index = [] 73 | with open(query_desc_index_file_path, 'r') as query_desc_index_file: 74 | lines = query_desc_index_file.readlines() 75 | for i in range(0, len(lines)): 76 | line = lines[i].strip() 77 | desc_index.append(int(line)) 78 | print('desc_index: ', desc_index) 79 | 80 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 81 | test_sim_result, test_rank_result = [], [] 82 | for i in tqdm(range(0, desc_processed)): 83 | ind = desc_index[i] 84 | 85 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 86 | sims = np.dot(code_reprs, desc_vec.T)[:, 0] # [code_processed] 87 | negsims = np.negative(sims) 88 | predict = np.argsort(negsims) 89 | 90 | # SuccessRate@k 91 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 92 | sum_1.append(1.0) if ind in predict_1 else sum_1.append(0.0) 93 | sum_5.append(1.0) if ind in predict_5 else sum_5.append(0.0) 94 | sum_10.append(1.0) if ind in predict_10 else sum_10.append(0.0) 95 | # MRR 96 | predict_list = predict.tolist() 97 | rank = predict_list.index(ind) 98 | sum_mrr.append(1/float(rank+1)) 99 | 100 | # results need to be saved 101 | 
predict_20 = [int(k) for k in predict[0:20]] 102 | sim_20 = [sims[k] for k in predict_20] 103 | test_sim_result.append(zip(predict_20, sim_20)) 104 | test_rank_result.append(rank+1) 105 | 106 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 107 | save_path = args.data_path + 'result/' 108 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 109 | np.save(save_path+sim_result_filename, test_sim_result) 110 | np.save(save_path+rank_result_filename, test_rank_result) 111 | 112 | 113 | def parse_args(): 114 | parser = argparse.ArgumentParser("Test Code Search(Embedding) Model For User Study") 115 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 116 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name') 117 | parser.add_argument('-d', '--dataset', type=str, default='github_user_3', help='name of dataset.java, python') 118 | parser.add_argument('--query_desc_index_file', type=str, default='query.desc.index.txt') 119 | parser.add_argument('--reload_from', type=int, default=185, help='epoch to reload from') 120 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 121 | return parser.parse_args() 122 | 123 | 124 | if __name__ == '__main__': 125 | args = parse_args() 126 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 127 | config = getattr(configs, 'config_'+args.model)() 128 | 129 | ##### Define model ###### 130 | logger.info('Constructing Model..') 131 | 132 | model = getattr(models, args.model)(config) # initialize the model 133 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 134 | model.load_state_dict(torch.load(ckpt, map_location=device)) 135 | 136 | test(config, model, device) 137 | 138 | 139 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def make_shuffle_index(args): 10 | dir_path = args.data_path + args.dataset 11 | all_desc_file_path = dir_path + args.all_desc_file 12 | with open(all_desc_file_path, 'r') as all_desc_file: 13 | lines = all_desc_file.readlines() 14 | all_num = int(len(lines)/2) 15 | 16 | index = np.arange(all_num) 17 | np.random.seed(16) 18 | np.random.shuffle(index) 19 | print(len(index)) 20 | np.save(args.shuffle_index_file, index) 21 | 22 | def split_desc_data(args): 23 | index = np.load(args.shuffle_index_file) 24 | 25 | dir_path = args.data_path + args.dataset 26 | all_desc_file_path = dir_path + args.all_desc_file 27 | train_desc_file_path = dir_path + args.train_desc_file 28 | test_desc_file_path = dir_path + args.test_desc_file 29 | 30 | input_desc = [] 31 | with open(all_desc_file_path, 'r') as all_desc_file: 32 | lines = all_desc_file.readlines() 33 | for line in lines: 34 | if (line[0:10] != 'BeginFunc:'): 35 | input_desc.append(line) 36 | print('number of input desc:\n', len(input_desc)) 37 | 38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 39 | for i in range(0, args.trainset_num): 40 | train_desc_file.write(input_desc[index[i]]) 41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 42 | 
test_desc_file.write(input_desc[index[i]]) 43 | 44 | 45 | def create_desc_dict_file(args): 46 | dir_path = args.data_path + args.dataset 47 | desc_file_path = dir_path + args.train_desc_file 48 | 49 | input_desc = [] 50 | with open(desc_file_path, 'r') as desc_file: 51 | input_desc = desc_file.readlines() 52 | desc_words = [] 53 | for i in range(0, len(input_desc)): 54 | input_desc[i] = input_desc[i].rstrip('\n') 55 | desc_word_list = input_desc[i].split() 56 | for desc_word in desc_word_list: 57 | desc_words.append(desc_word) 58 | vocab_desc_info = Counter(desc_words) 59 | print(len(vocab_desc_info)) 60 | 61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 62 | vocab_desc_index = {'':0, '':1} 63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 64 | 65 | 66 | vocab_desc_file_path = dir_path + args.vocab_desc_file 67 | desc_dic_str = json.dumps(vocab_desc_index) 68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 69 | vocab_desc_file.write(desc_dic_str) 70 | 71 | 72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 73 | phrases, indices = [], [] 74 | with open(sent_file_path, 'r') as sent_file: 75 | sents = sent_file.readlines() 76 | vocab = json.loads(open(vocab_file_path, "r").readline()) 77 | start_index = 0 78 | for i in range(0, len(sents)): 79 | sent = sents[i].rstrip('\n') 80 | word_list = sent.split() 81 | sent_len = min(len(word_list), maxlen) 82 | indices.append((sent_len, start_index)) 83 | for j in range(0, sent_len): 84 | word = word_list[j] 85 | phrases.append(vocab.get(word, UNK_ID)) 86 | start_index += sent_len 87 | output_file_path = sent_file_path[0:-3] + 'h5' 88 | output_file = h5py.File(output_file_path, 'w') 89 | output_file['phrases'] = phrases 90 | output_file['indices'] = indices 91 | output_file.close() 92 | 93 | 94 | def parse_args(): 95 | parser = argparse.ArgumentParser("Parse Description data for TokenEmbedder") 96 | 97 | parser.add_argument('--data_path', type=str, default='./data/') 98 | parser.add_argument('--dataset', type=str, default='github11/') 99 | 100 | parser.add_argument('--origin_desc_file', type=str, default='origin.desc.txt') 101 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 102 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 103 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 104 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 105 | 106 | parser.add_argument('--trainset_num', type=int, default=39152) 107 | parser.add_argument('--testset_num', type=int, default=2000) 108 | parser.add_argument('--desc_word_num', type=int, default=10000) 109 | parser.add_argument('--desc_maxlen', type=int, default=30) 110 | parser.add_argument('--testset_start_index', type=int, default=39152) 111 | 112 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 113 | 114 | return parser.parse_args() 115 | 116 | if __name__ == '__main__': 117 | 118 | args = parse_args() 119 | 120 | #make_shuffle_index(args) 121 | 122 | #split_desc_data(args) 123 | 124 | create_desc_dict_file(args) 125 | 126 | dir_path = args.data_path + args.dataset 127 | # train.desc.txt -> train.desc.h5(and test...) 
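# Alignment note: the shuffle_index.npy written by make_shuffle_index above is the
# same file loaded by split_token_data in util_tok.py, so after shuffling the i-th
# line of train.desc.txt still describes the i-th line of train.token.txt;
# regenerating the index for only one modality would silently break the
# code/description pairing.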
128 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 129 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 130 | 131 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 59 | phrases, indices = [], [] 60 | with open(sent_file_path, 'r') as sent_file: 61 | sents = sent_file.readlines() 62 | vocab = json.loads(open(vocab_file_path, "r").readline()) 63 | start_index = 0 64 | for i in range(0, len(sents)): 65 | sent = sents[i].rstrip('\n') 66 | word_list = sent.split() 67 | sent_len = min(len(word_list), maxlen) 68 | indices.append((sent_len, start_index)) 69 | for j in range(0, sent_len): 70 | word = word_list[j] 71 | phrases.append(vocab.get(word, UNK_ID)) 72 | start_index += sent_len 73 | output_file_path = sent_file_path[0:-3] + 'h5' 74 | output_file = h5py.File(output_file_path, 'w') 75 | output_file['phrases'] = phrases 76 | output_file['indices'] = indices 77 | output_file.close() 78 | 79 | def remove_dup_tokens(args): 80 | dir_path = args.data_path + args.dataset 81 | origin_token_file_path = dir_path + args.origin_token_file 82 | all_token_file_path = dir_path + args.all_token_file 83 
| 84 | with open(origin_token_file_path, 'r') as origin_token_file, open(all_token_file_path, 'w') as all_token_file: 85 | lines = origin_token_file.readlines() 86 | for i in range(0, len(lines)): 87 | if lines[i][0:10] != 'BeginFunc:': 88 | line = lines[i].strip() 89 | words = line.split() 90 | new_words = list(set(words)) 91 | new_line = ' '.join(new_words) 92 | all_token_file.write(new_line + '\n') 93 | 94 | 95 | def parse_args(): 96 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder") 97 | 98 | parser.add_argument('--data_path', type=str, default='./data/') 99 | parser.add_argument('--dataset', type=str, default='github_user_3/') 100 | 101 | parser.add_argument('--origin_token_file', type=str, default='origin.token.txt') 102 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 103 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 104 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 105 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 106 | 107 | parser.add_argument('--trainset_num', type=int, default=39152) 108 | parser.add_argument('--testset_num', type=int, default=2000) 109 | parser.add_argument('--token_word_num', type=int, default=10000) 110 | parser.add_argument('--token_maxlen', type=int, default=50) 111 | parser.add_argument('--testset_start_index', type=int, default=39152) 112 | 113 | 114 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 115 | 116 | return parser.parse_args() 117 | 118 | if __name__ == '__main__': 119 | args = parse_args() 120 | ''' 121 | dir_path = args.data_path + args.dataset 122 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file: 123 | lines = in_file.readlines() 124 | for i in range(0, len(lines)): 125 | if lines[i][0:10] != 'BeginFunc:': 126 | out_file.write(lines[i]) 127 | ''' 128 | remove_dup_tokens(args) 129 | 130 | #split_token_data(args) 131 | #create_token_dict_file(args) 132 | 133 | dir_path = args.data_path + args.dataset 134 | sents2indexes(dir_path+args.all_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 135 | 136 | ''' 137 | dir_path = args.data_path + args.dataset 138 | # train.token.txt -> train.token.h5(and test...) 
139 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 140 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 141 | ''' 142 | 143 | ''' 144 | dir_path = args.data_path + args.dataset 145 | all_token_file_path = dir_path + args.all_token_file 146 | with open(all_token_file_path, 'r') as all_token_file: 147 | lines = all_token_file.readlines() 148 | print(len(lines)) 149 | for i in range(0, len(lines)): 150 | line = lines[i] 151 | if line[0:10] != 'BeginFunc:': 152 | words = line.split() 153 | if len(words) == 0: 154 | print(lines[i-1]) 155 | #print(lines[i]) 156 | ''' 157 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | ''' 64 | import nltk 65 | try: nltk.word_tokenize("hello world") 66 | except LookupError: nltk.download('punkt') 67 | 68 | def sent2indexes(sentence, vocab, maxlen): 69 | 70 | def convert_sent(sent, vocab, maxlen): 71 | idxes = np.zeros(maxlen, dtype=np.int64) 72 | idxes.fill(PAD_ID) 73 | tokens = nltk.word_tokenize(sent.strip()) 74 | idx_len = min(len(tokens), maxlen) 
75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 76 | return idxes, idx_len 77 | if type(sentence) is list: 78 | inds, lens = [], [] 79 | for sent in sentence: 80 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 82 | inds.append(idxes) 83 | lens.append(idx_len) 84 | return np.vstack(inds), np.vstack(lens) 85 | else: 86 | inds, lens = sent2indexes([sentence], vocab, maxlen) 87 | return inds[0], lens[0] 88 | ''' 89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 90 | '''indexes: numpy array''' 91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 92 | indexes=filter(lambda i: i!=ignore_tok, indexes) 93 | toks, length = [], 0 94 | for idx in indexes: 95 | toks.append(ivocab.get(idx, '')) 96 | length+=1 97 | return ' '.join(toks), length 98 | 99 | ivocab = {v: k for k, v in vocab.items()} 100 | if indexes.ndim==1:# one sentence 101 | return revert_sent(indexes, ivocab, ignore_tok) 102 | else:# dim>1 103 | sentences, lens =[], [] # a batch of sentences 104 | for inds in indexes: 105 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 106 | sentences.append(sentence) 107 | lens.append(length) 108 | return sentences, lens 109 | 110 | ######################################################################## 111 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/util_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/util_dfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline 
methods/MMAN(TDC)/__pycache__/util_dfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_CFGEmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_every_modal': 0, # to make modal more complex? 6 | 'save_attn_weight': 0, 7 | 'use_tanh': 1, 8 | 'use_attn': 1, 9 | 'use_desc_attn': 1, 10 | 11 | # GGNN 12 | 'state_dim': 512, # GGNN hidden state size 13 | 'annotation_dim': 5, 14 | 'n_edge_types': 1, 15 | 'n_node': 150, # could be less than 512, like the maximum nodenum 16 | 'n_steps': 5, # propogation steps number of GGNN 17 | 'output_type': 'no_reduce', 18 | 'batch_size': 32, 19 | 'n_layers': 1, 20 | 'n_hidden': 512, 21 | 'cfg_attn_mode': 'sigmoid_scalar', 22 | 23 | # data_params 24 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 25 | #training data 26 | 'train_token':'train.token.h5', 27 | 'train_dfg':'train.dfg.txt', 28 | 'train_cfg':'train.cfg.txt', 29 | 'train_desc':'train.desc.h5', 30 | # test data 31 | 'test_token':'test.token.h5', 32 | 'test_dfg':'test.dfg.txt', 33 | 'test_cfg':'test.cfg.txt', 34 | 'test_desc':'test.desc.h5', 35 | #vocabulary info 36 | 'vocab_desc':'vocab.desc.json', 37 | 'vocab_token':'vocab.token.json', 38 | 39 | #parameters 40 | 'desc_len': 30, 41 | 'tok_len': 100, 42 | 'n_desc_words': 10000, # wait to decide 43 | 'n_token_words': 20000, 44 | 45 | #training_params 46 | 'nb_epoch': 200, 47 | #'optimizer': 'adam', 48 | 'learning_rate':0.0003, # try 1e-4(paper) 49 | 'adam_epsilon':1e-8, 50 | 'warmup_steps':5000, 51 | 'fp16': False, 52 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 53 | #"See details at https://nvidia.github.io/apex/amp.html" 54 | 55 | # model_params 56 | 'emb_size': 300, 57 | # recurrent 58 | 'margin': 0.6, 59 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 60 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 
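        # 'margin' and 'sim_measure' presumably parameterize the same pairwise ranking
        # objective as in DeepCS/models/jointemb.py above, i.e.
        # loss = (margin - sim(code, desc_pos) + sim(code, desc_neg)).clamp(min=1e-6).mean()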
61 | 'dropout': 0.1 62 | } 63 | return conf 64 | 65 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .cfgemb import CFGEmbeder -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | 
import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('Test Begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(config, data_path, 31 | config['test_cfg'], config['n_node'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | # encode tokens and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | # batch[0:3]: init_input, adjmat, node_mask 40 | code_batch = [tensor.to(device) for tensor in batch[:3]] 41 | # batch[3:5]: good_desc, good_desc_len 42 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 43 | with torch.no_grad(): 44 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 45 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 46 | # normalize when sim_measure=='cos' 47 | code_repr = normalize(code_repr) 48 | desc_repr = normalize(desc_repr) 49 | code_reprs.append(code_repr) 50 | desc_reprs.append(desc_repr) 51 | n_processed += batch[0].size(0) # +batch_size 52 | # code_reprs: [n_processed x n_hidden] 53 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 54 | 55 | # calculate similarity 56 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 57 | test_sim_result, test_rank_result = [], [] 58 | for i in tqdm(range(0, n_processed)): 59 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 60 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 61 | negsims = np.negative(sims) 62 | predict = np.argsort(negsims) 63 | 64 | # SuccessRate@k 65 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 66 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 67 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 68 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 69 | # MRR 70 | predict_list = predict.tolist() 71 | rank = predict_list.index(i) 72 | sum_mrr.append(1/float(rank+1)) 73 | 74 | # results need to be saved 75 | predict_20 = [int(k) for k in predict[0:20]] 76 | sim_20 = [sims[k] for k in predict_20] 77 | test_sim_result.append(zip(predict_20, sim_20)) 78 | test_rank_result.append(rank+1) 79 | 80 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 81 | save_path = args.data_path + 'result/' 82 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 83 | np.save(save_path+sim_result_filename, test_sim_result) 84 | np.save(save_path+rank_result_filename, test_rank_result) 85 | 86 | 87 | def parse_args(): 88 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 89 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 90 | parser.add_argument('--model', type=str, default='CFGEmbeder', help='model name') 91 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 92 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 93 | 
parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\ 94 | 'Note: should be consistent with the same argument in the repr_code.py') 95 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 96 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 97 | return parser.parse_args() 98 | 99 | 100 | if __name__ == '__main__': 101 | args = parse_args() 102 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 103 | config = getattr(configs, 'config_'+args.model)() 104 | 105 | ##### Define model ###### 106 | logger.info('Constructing Model..') 107 | model = getattr(models, args.model)(config) # initialize the model 108 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 109 | model.load_state_dict(torch.load(ckpt, map_location=device)) 110 | 111 | test(config, model, device) 112 | 113 | 114 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | from utils import UNK_ID 7 | 8 | def make_shuffle_index_num(args, all_num): 9 | index = np.arange(all_num) 10 | np.random.seed(16) 11 | np.random.shuffle(index) 12 | print('index:\n', index) 13 | np.save(args.shuffle_index_file, index) 14 | 15 | def make_shuffle_index(args): 16 | dir_path = args.data_path + args.dataset 17 | all_desc_file_path = dir_path + args.all_desc_file 18 | with open(all_desc_file_path, 'r') as all_desc_file: 19 | lines = all_desc_file.readlines() 20 | all_num = int(len(lines)/2) 21 | print('all_num of desc:\n', all_num) 22 | 23 | index = np.arange(all_num) 24 | np.random.seed(16) 25 | np.random.shuffle(index) 26 | print('index:\n', index) 27 | np.save(args.shuffle_index_file, index) 28 | 29 | def split_data(args): 30 | index = np.load(args.shuffle_index_file) 31 | 32 | dir_path = args.data_path + args.dataset 33 | all_desc_file_path = dir_path + args.all_desc_file 34 | train_desc_file_path = dir_path + args.train_desc_file 35 | test_desc_file_path = dir_path + args.test_desc_file 36 | 37 | input_desc = [] 38 | with open(all_desc_file_path, 'r') as all_desc_file: 39 | lines = all_desc_file.readlines() 40 | for line in lines: 41 | if (line[0:10] != 'BeginFunc:'): 42 | input_desc.append(line) 43 | print('number of input desc:\n', len(input_desc)) 44 | 45 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 46 | for i in range(0, args.trainset_num): 47 | train_desc_file.write(input_desc[index[i]]) 48 | for i in range(32000, 32000+args.testset_num): 49 | test_desc_file.write(input_desc[index[i]]) 50 | 51 | 52 | def create_dict_file(args): 53 | dir_path = args.data_path + args.dataset 54 | desc_file_path = dir_path + args.train_desc_file 55 | 56 | input_desc = [] 57 | with open(desc_file_path, 'r') as desc_file: 58 | input_desc = desc_file.readlines() 59 | desc_words = [] 60 | for i in range(0, len(input_desc)): 61 | input_desc[i] = input_desc[i].rstrip('\n') 62 | desc_word_list = input_desc[i].split() 63 | for desc_word in desc_word_list: 64 | desc_words.append(desc_word) 65 | vocab_desc_info = Counter(desc_words) 66 | print(len(vocab_desc_info)) 67 | 68 | vocab_desc = [item[0] for item in 
vocab_desc_info.most_common()[:args.desc_word_num-2]] 69 | vocab_desc_index = {'':0, '':1} 70 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 71 | 72 | 73 | vocab_desc_file_path = dir_path + args.vocab_desc_file 74 | desc_dic_str = json.dumps(vocab_desc_index) 75 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 76 | vocab_desc_file.write(desc_dic_str) 77 | 78 | 79 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 80 | phrases, indices = [], [] 81 | with open(sent_file_path, 'r') as sent_file: 82 | sents = sent_file.readlines() 83 | vocab = json.loads(open(vocab_file_path, "r").readline()) 84 | start_index = 0 85 | for i in range(0, len(sents)): 86 | sent = sents[i].rstrip('\n') 87 | word_list = sent.split() 88 | sent_len = min(len(word_list), maxlen) 89 | indices.append((sent_len, start_index)) 90 | for j in range(0, sent_len): 91 | word = word_list[j] 92 | phrases.append(vocab.get(word, UNK_ID)) 93 | start_index += sent_len 94 | output_file_path = sent_file_path[0:-3] + 'h5' 95 | output_file = h5py.File(output_file_path, 'w') 96 | output_file['phrases'] = phrases 97 | output_file['indices'] = indices 98 | output_file.close() 99 | 100 | 101 | def parse_args(): 102 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 103 | 104 | parser.add_argument('--data_path', type=str, default='./data/') 105 | parser.add_argument('--dataset', type=str, default='github/') 106 | 107 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 108 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 109 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 110 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 111 | 112 | parser.add_argument('--trainset_num', type=int, default=32000) 113 | parser.add_argument('--testset_num', type=int, default=1000) 114 | parser.add_argument('--desc_word_num', type=int, default=10000) 115 | parser.add_argument('--desc_maxlen', type=int, default=30) 116 | 117 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 118 | 119 | return parser.parse_args() 120 | 121 | if __name__ == '__main__': 122 | args = parse_args() 123 | 124 | #make_shuffle_index_num(args, 33000) 125 | #split_data(args) 126 | #create_dict_file(args) 127 | 128 | dir_path = args.data_path + args.dataset 129 | # train.desc.txt -> train.desc.h5(and test...) 
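# Input format note: all.desc.txt appears to interleave one 'BeginFunc:' marker line
# with one description line per function, which is why make_shuffle_index above counts
# int(len(lines)/2) descriptions and why split_data drops every line that starts with
# 'BeginFunc:' before applying the shuffled index.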
130 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 131 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 132 | 133 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 59 | phrases, indices = [], [] 60 | with open(sent_file_path, 'r') as sent_file: 61 | sents = sent_file.readlines() 62 | vocab = json.loads(open(vocab_file_path, "r").readline()) 63 | start_index = 0 64 | for i in range(0, len(sents)): 65 | sent = sents[i].rstrip('\n') 66 | word_list = sent.split() 67 | sent_len = min(len(word_list), maxlen) 68 | indices.append((sent_len, start_index)) 69 | for j in range(0, sent_len): 70 | word = word_list[j] 71 | phrases.append(vocab.get(word, UNK_ID)) 72 | start_index += sent_len 73 | output_file_path = sent_file_path[0:-3] + 'h5' 74 | output_file = h5py.File(output_file_path, 'w') 75 | output_file['phrases'] = phrases 76 | output_file['indices'] = indices 77 | output_file.close() 78 | 79 | def parse_args(): 80 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder") 81 | 82 | parser.add_argument('--data_path', type=str, default='./data/') 83 | 
parser.add_argument('--dataset', type=str, default='github11/') 84 | 85 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 86 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 87 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 88 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 89 | 90 | parser.add_argument('--trainset_num', type=int, default=39152) 91 | parser.add_argument('--testset_num', type=int, default=2000) 92 | parser.add_argument('--token_word_num', type=int, default=20000) 93 | parser.add_argument('--token_maxlen', type=int, default=100) 94 | parser.add_argument('--testset_start_index', type=int, default=39152) 95 | 96 | 97 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 98 | 99 | return parser.parse_args() 100 | 101 | if __name__ == '__main__': 102 | args = parse_args() 103 | ''' 104 | dir_path = args.data_path + args.dataset 105 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file: 106 | lines = in_file.readlines() 107 | for i in range(0, len(lines)): 108 | if lines[i][0:10] != 'BeginFunc:': 109 | out_file.write(lines[i]) 110 | ''' 111 | 112 | split_token_data(args) 113 | create_token_dict_file(args) 114 | 115 | dir_path = args.data_path + args.dataset 116 | # train.token.txt -> train.token.h5(and test...) 117 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 118 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 119 | 120 | 121 | ''' 122 | dir_path = args.data_path + args.dataset 123 | all_token_file_path = dir_path + args.all_token_file 124 | with open(all_token_file_path, 'r') as all_token_file: 125 | lines = all_token_file.readlines() 126 | print(len(lines)) 127 | for i in range(0, len(lines)): 128 | line = lines[i] 129 | if line[0:10] != 'BeginFunc:': 130 | words = line.split() 131 | if len(words) == 0: 132 | print(lines[i-1]) 133 | #print(lines[i]) 134 | ''' 135 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = 
normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | 64 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 65 | '''indexes: numpy array''' 66 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 67 | indexes=filter(lambda i: i!=ignore_tok, indexes) 68 | toks, length = [], 0 69 | for idx in indexes: 70 | toks.append(ivocab.get(idx, '<unk>')) 71 | length+=1 72 | return ' '.join(toks), length 73 | 74 | ivocab = {v: k for k, v in vocab.items()} 75 | if indexes.ndim==1:# one sentence 76 | return revert_sent(indexes, ivocab, ignore_tok) 77 | else:# dim>1 78 | sentences, lens =[], [] # a batch of sentences 79 | for inds in indexes: 80 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 81 | sentences.append(sentence) 82 | lens.append(length) 83 | return sentences, lens 84 | 85 | ######################################################################## 86 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/util_cfg.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_MultiEmbeder(): 3 | conf = { 4 | # GGNN 5 | 'state_dim': 512, # GGNN hidden state size 6 | 'annotation_dim': 5, 7 | 'n_edge_types': 2, 8 | 'n_node': 200, # could be less than 512, like the maximum nodenum 9 | 'n_steps': 5, # propogation steps number of GGNN 10 | 'output_type': 'no_reduce', 11 | 'batch_size': 32, 12 | 'n_layers': 1, 13 | 'n_hidden': 512, 14 | 'cfg_attn_mode': 'sigmoid_scalar', 15 | 16 | # TreeLSTM 17 | 'treelstm_cell_type': 'nary', # nary or childsum 18 | 'n_ast_words': 50000, 19 | 20 | # Token and Description 21 | 'desc_len': 30, 22 | 'tok_len': 100, 23 | 'n_desc_words': 10000, 24 | 'n_token_words': 25000, 25 | 26 | # data_params 27 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 28 | # training data 29 | 'train_token':'train.token.h5', 30 | 'train_ast':'train.ast.json', 31 | 'train_cfg':'train.cfg.txt', 32 | 'train_desc':'train.desc.h5', 33 | # test data 34 | 'test_token':'test.token.h5', 35 | 'test_ast':'test.ast.json', 36 | 'test_cfg':'test.cfg.txt', 37 | 'test_desc':'test.desc.h5', 38 | # vocabulary info 39 | 'vocab_token':'vocab.token.json', 40 | 'vocab_ast':'vocab.ast.json', 41 | 'vocab_desc':'vocab.desc.json', 42 | 43 | # model_params 44 | 'emb_size': 300, 45 | # recurrent 46 | 'margin': 0.6, 47 | 'sim_measure':'cos', 48 | 'dropout': 0.1, 49 | 50 | 51 | # training_params 52 | 'nb_epoch': 200, 53 | #'optimizer': 'adamW', 54 | 'learning_rate':0.0003, # try 1e-4(paper) 55 | 'adam_epsilon':1e-8, 56 | 'warmup_steps':5000, 57 | 'fp16': False, 58 | 'fp16_opt_level': 'O1' #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
59 | #"See details at https://nvidia.github.io/apex/amp.html" 60 | 61 | 62 | } 63 | return conf 64 | 65 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__init__.py -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_ast.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_ast.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_desc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_desc.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_tok.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_tok.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 
| def make_shuffle_index(args): 10 | dir_path = args.data_path + args.dataset 11 | all_desc_file_path = dir_path + args.all_desc_file 12 | with open(all_desc_file_path, 'r') as all_desc_file: 13 | lines = all_desc_file.readlines() 14 | all_num = int(len(lines)/2) 15 | 16 | index = np.arange(all_num) 17 | np.random.seed(16) 18 | np.random.shuffle(index) 19 | #print(index) 20 | np.save(args.shuffle_index_file, index) 21 | 22 | def split_desc_data(args): 23 | index = np.load(args.shuffle_index_file) 24 | 25 | dir_path = args.data_path + args.dataset 26 | all_desc_file_path = dir_path + args.all_desc_file 27 | train_desc_file_path = dir_path + args.train_desc_file 28 | test_desc_file_path = dir_path + args.test_desc_file 29 | 30 | input_desc = [] 31 | with open(all_desc_file_path, 'r') as all_desc_file: 32 | lines = all_desc_file.readlines() 33 | for line in lines: 34 | if (line[0:10] != 'BeginFunc:'): 35 | input_desc.append(line) 36 | print('number of input desc:\n', len(input_desc)) 37 | 38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 39 | for i in range(0, args.trainset_num): 40 | train_desc_file.write(input_desc[index[i]]) 41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 42 | test_desc_file.write(input_desc[index[i]]) 43 | 44 | 45 | def create_desc_dict_file(args): 46 | dir_path = args.data_path + args.dataset 47 | desc_file_path = dir_path + args.train_desc_file 48 | 49 | input_desc = [] 50 | with open(desc_file_path, 'r') as desc_file: 51 | input_desc = desc_file.readlines() 52 | desc_words = [] 53 | for i in range(0, len(input_desc)): 54 | input_desc[i] = input_desc[i].rstrip('\n') 55 | desc_word_list = input_desc[i].split() 56 | for desc_word in desc_word_list: 57 | desc_words.append(desc_word) 58 | vocab_desc_info = Counter(desc_words) 59 | print(len(vocab_desc_info)) 60 | 61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 62 | vocab_desc_index = {'':0, '':1} 63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 64 | 65 | 66 | vocab_desc_file_path = dir_path + args.vocab_desc_file 67 | desc_dic_str = json.dumps(vocab_desc_index) 68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 69 | vocab_desc_file.write(desc_dic_str) 70 | 71 | 72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 73 | phrases, indices = [], [] 74 | with open(sent_file_path, 'r') as sent_file: 75 | sents = sent_file.readlines() 76 | vocab = json.loads(open(vocab_file_path, "r").readline()) 77 | start_index = 0 78 | for i in range(0, len(sents)): 79 | sent = sents[i].rstrip('\n') 80 | word_list = sent.split() 81 | sent_len = min(len(word_list), maxlen) 82 | indices.append((sent_len, start_index)) 83 | for j in range(0, sent_len): 84 | word = word_list[j] 85 | phrases.append(vocab.get(word, UNK_ID)) 86 | start_index += sent_len 87 | output_file_path = sent_file_path[0:-3] + 'h5' 88 | output_file = h5py.File(output_file_path, 'w') 89 | output_file['phrases'] = phrases 90 | output_file['indices'] = indices 91 | output_file.close() 92 | 93 | ''' 94 | def parse_args(): 95 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 96 | 97 | parser.add_argument('--data_path', type=str, default='./data/') 98 | parser.add_argument('--dataset', type=str, default='example/') 99 | 100 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 101 | parser.add_argument('--train_desc_file', 
type=str, default='train.desc.txt') 102 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 103 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 104 | 105 | parser.add_argument('--trainset_num', type=int, default=12) 106 | parser.add_argument('--testset_num', type=int, default=1000) 107 | parser.add_argument('--desc_word_num', type=int, default=50) 108 | parser.add_argument('--desc_maxlen', type=int, default=50) 109 | parser.add_argument('--testset_start_index', type=int, default=33000) 110 | 111 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 112 | 113 | return parser.parse_args() 114 | 115 | if __name__ == '__main__': 116 | 117 | args = parse_args() 118 | 119 | #make_shuffle_index(args) 120 | #split_data(args) 121 | create_desc_dict_file(args) 122 | 123 | 124 | dir_path = args.data_path + args.dataset 125 | # train.desc.txt -> train.desc.h5(and test...) 126 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 127 | #sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 128 | ''' 129 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | 59 | ''' 60 | def 
parse_args(): 61 | parser = argparse.ArgumentParser("Parse tokenription data for CFGEmbedder") 62 | 63 | parser.add_argument('--data_path', type=str, default='./data/') 64 | parser.add_argument('--dataset', type=str, default='example/') 65 | 66 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 67 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 68 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 69 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 70 | 71 | parser.add_argument('--trainset_num', type=int, default=12) 72 | parser.add_argument('--testset_num', type=int, default=1000) 73 | parser.add_argument('--token_word_num', type=int, default=50) 74 | parser.add_argument('--token_maxlen', type=int, default=50) 75 | parser.add_argument('--testset_start_index', type=int, default=33000) 76 | 77 | 78 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 79 | 80 | return parser.parse_args() 81 | 82 | if __name__ == '__main__': 83 | args = parse_args() 84 | 85 | #make_shuffle_index(args) 86 | #split_data(args) 87 | #create_token_dict_file(args) 88 | 89 | 90 | #dir_path = args.data_path + args.dataset 91 | # train.token.txt -> train.token.h5(and test...) 92 | #sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 93 | #sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 94 | ''' 95 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .multiemb import MultiEmbeder -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/multiemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/multiemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/tokenemb.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/output/MultiEmbeder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/output/MultiEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/MMAN/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | #from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def normalize(data): 23 | """normalize matrix by rows""" 24 | return data/np.linalg.norm(data,axis=1,keepdims=True) 25 | 26 | def test(config, model, device): 27 | logger.info('Test Begin...') 28 | 29 | model.eval() 30 | model.to(device) 31 | 32 | # load data 33 | data_path = args.data_path+args.dataset+'/' 34 | test_set = eval(config['dataset_name'])(config, data_path, 35 | config['test_token'], config['tok_len'], 36 | config['test_ast'], config['vocab_ast'], 37 | config['test_cfg'], config['n_node'], 38 | config['test_desc'], config['desc_len']) 39 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 40 | collate_fn=batcher(device), shuffle=False, drop_last=False, num_workers=0) 41 | # encode tokens and descs 42 | code_reprs, desc_reprs = [], [] 43 | n_processed = 0 44 | for batch in data_loader: 45 | # batch[0:7]: tokens, tok_len, tree, tree_node_num, init_input, adjmat, node_mask 46 | code_batch = [tensor for tensor in batch[:7]] 47 | # batch[7:9]: good_desc, good_desc_len 48 | desc_batch = [tensor for tensor in batch[7:9]] 49 | with torch.no_grad(): 50 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 51 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 52 | # normalize when sim_measure=='cos' 53 | code_repr = normalize(code_repr) 54 | desc_repr = normalize(desc_repr) 55 | code_reprs.append(code_repr) 56 | 
desc_reprs.append(desc_repr) 57 | n_processed += batch[0].size(0) # +batch_size 58 | # code_reprs: [n_processed x n_hidden] 59 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 60 | 61 | # calculate similarity 62 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 63 | test_sim_result, test_rank_result = [], [] 64 | for i in tqdm(range(0, n_processed)): 65 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 66 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 67 | negsims = np.negative(sims) 68 | predict = np.argsort(negsims) 69 | 70 | # SuccessRate@k 71 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 72 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 73 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 74 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 75 | # MRR 76 | predict_list = predict.tolist() 77 | rank = predict_list.index(i) 78 | sum_mrr.append(1/float(rank+1)) 79 | 80 | # results need to be saved 81 | predict_20 = [int(k) for k in predict[0:20]] 82 | sim_20 = [sims[k] for k in predict_20] 83 | test_sim_result.append(zip(predict_20, sim_20)) 84 | test_rank_result.append(rank+1) 85 | 86 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 87 | save_path = args.data_path + 'result/' 88 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 89 | np.save(save_path+sim_result_filename, test_sim_result) 90 | np.save(save_path+rank_result_filename, test_rank_result) 91 | 92 | 93 | def parse_args(): 94 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 95 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 96 | parser.add_argument('--model', type=str, default='MultiEmbeder', help='model name') 97 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 98 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 99 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. 
'\ 100 | 'Note: should be consistent with the same argument in the repr_code.py') 101 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 102 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 103 | return parser.parse_args() 104 | 105 | 106 | if __name__ == '__main__': 107 | args = parse_args() 108 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 109 | config = getattr(configs, 'config_'+args.model)() 110 | 111 | ##### Define model ###### 112 | logger.info('Constructing Model..') 113 | model = getattr(models, args.model)(config) # initialize the model 114 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 115 | model.load_state_dict(torch.load(ckpt, map_location=device)) 116 | 117 | test(config, model, device) 118 | 119 | 120 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 3 | import sys 4 | import random 5 | import time 6 | from datetime import datetime 7 | import numpy as np 8 | import math 9 | import argparse 10 | random.seed(42) 11 | from tqdm import tqdm 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | logging.basicConfig(level=logging.INFO, format="%(message)s") 16 | 17 | import torch 18 | 19 | import models, configs 20 | from modules import get_cosine_schedule_with_warmup 21 | from data_loader import * 22 | 23 | 24 | def train(args): 25 | fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt") 26 | # create file handler which logs even debug messages 27 | logger.addHandler(fh)# add the handlers to the logger 28 | timestamp = datetime.now().strftime('%Y%m%d%H%M') 29 | 30 | random.seed(args.seed) 31 | np.random.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | torch.cuda.manual_seed(args.seed) 34 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 35 | 36 | def save_model(model, epoch): 37 | torch.save(model.state_dict(), f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5') 38 | 39 | def load_model(model, epoch, to_device): 40 | assert os.path.exists(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'), f'Weights at epoch {epoch} not found' 41 | model.load_state_dict(torch.load(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5', map_location=to_device)) 42 | 43 | config = getattr(configs, 'config_'+args.model)() 44 | print(config) 45 | 46 | # load data 47 | data_path = args.data_path+args.dataset+'/' 48 | train_set = eval(config['dataset_name'])(config, data_path, 49 | config['train_token'], config['tok_len'], 50 | config['train_ast'], config['vocab_ast'], 51 | config['train_cfg'], config['n_node'], 52 | config['train_desc'], config['desc_len']) 53 | 54 | data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'], 55 | collate_fn=batcher(device), shuffle=True, drop_last=False, num_workers=0) 56 | 57 | # define the models 58 | logger.info('Constructing Model..') 59 | model = getattr(models, args.model)(config) #initialize the model 60 | if args.reload_from>0: 61 | load_model(model, args.reload_from, device) 62 | logger.info('done') 63 | model.to(device) 64 | 65 | no_decay = ['bias', 'LayerNorm.weight'] 66 | optimizer_grouped_parameters = [ 67 | {'params': [p for n, p in 
model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 68 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 69 | ] 70 | optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=config['adam_epsilon']) 71 | scheduler = get_cosine_schedule_with_warmup( 72 | optimizer, num_warmup_steps=config['warmup_steps'], 73 | num_training_steps=len(data_loader)*config['nb_epoch']) # do not forget to modify the number when dataset is changed 74 | 75 | print('---model parameters---') 76 | num_params = 0 77 | for param in model.parameters(): 78 | num_params += param.numel() 79 | print(num_params / 1e6) 80 | 81 | n_iters = len(data_loader) 82 | itr_global = args.reload_from+1 83 | for epoch in range(int(args.reload_from)+1, config['nb_epoch']+1): 84 | itr_start_time = time.time() 85 | losses=[] 86 | for batch in data_loader: 87 | 88 | model.train() 89 | batch_gpu = [tensor for tensor in batch] 90 | loss = model(*batch_gpu) 91 | 92 | loss.backward() 93 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0) 94 | 95 | optimizer.step() 96 | scheduler.step() 97 | model.zero_grad() 98 | 99 | losses.append(loss.item()) 100 | 101 | if itr_global % args.log_every == 0: 102 | elapsed = time.time() - itr_start_time 103 | logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f'% 104 | (epoch, config['nb_epoch'], itr_global%n_iters, n_iters, elapsed, np.mean(losses))) 105 | 106 | losses=[] 107 | itr_start_time = time.time() 108 | itr_global = itr_global + 1 109 | 110 | # save every epoch 111 | if epoch >= 90: 112 | if epoch % 5 == 0: 113 | save_model(model, epoch) 114 | 115 | 116 | def parse_args(): 117 | parser = argparse.ArgumentParser("Train and Validate The Code Search (Embedding) Model") 118 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 119 | parser.add_argument('--model', type=str, default='MultiEmbeder', help='model name') 120 | parser.add_argument('--dataset', type=str, default='github', help='name of dataset.java, python') 121 | parser.add_argument('--reload_from', type=int, default=-1, help='epoch to reload from') 122 | 123 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 124 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 125 | # Training Arguments 126 | parser.add_argument('--log_every', type=int, default=50, help='interval to log autoencoder training results') 127 | parser.add_argument('--seed', type=int, default=1111, help='random seed') 128 | 129 | 130 | return parser.parse_args() 131 | 132 | if __name__ == '__main__': 133 | args = parse_args() 134 | 135 | # make output directory if it doesn't already exist 136 | os.makedirs(f'./output/{args.model}/{args.dataset}/models', exist_ok=True) 137 | os.makedirs(f'./output/{args.model}/{args.dataset}/tmp_results', exist_ok=True) 138 | 139 | torch.backends.cudnn.benchmark = True # speed up training by using cudnn 140 | torch.backends.cudnn.deterministic = True # fix the random seed in cudnn 141 | 142 | train(args) 143 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/.DS_Store 
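Note on the MMAN training objective: `train.py` above only calls `loss = model(*batch_gpu)`; the loss itself is defined inside `models/multiemb.py` (the `MultiEmbeder` model). Given the `config_MultiEmbeder` settings (`'margin': 0.6`, `'sim_measure': 'cos'`) and the (code, good description, random negative description) triples produced by the data loaders, the usual formulation for this kind of joint embedding is a cosine margin ranking loss. The sketch below only illustrates that formulation with dummy tensors; the function name `margin_ranking_loss` and the 512-dimensional embeddings are assumptions, and this is not the repository's implementation.
```
import torch
import torch.nn.functional as F

def margin_ranking_loss(code_repr, good_desc_repr, bad_desc_repr, margin=0.6):
    # Illustrative sketch (not the repository's code): a hinge loss that pushes
    # the paired description closer to the code embedding than a random negative.
    good_sim = F.cosine_similarity(code_repr, good_desc_repr)  # [batch]
    bad_sim = F.cosine_similarity(code_repr, bad_desc_repr)    # [batch]
    return (margin - good_sim + bad_sim).clamp(min=1e-6).mean()

# Toy usage with the configs' hidden size of 512 and batch size 32.
code = torch.randn(32, 512)
good_desc = torch.randn(32, 512)
bad_desc = torch.randn(32, 512)
print(margin_ranking_loss(code, good_desc, bad_desc).item())
```
At test time both encoders' outputs are L2-normalised (`normalize` in `test.py`), so ranking candidates by the plain dot product in `test()` is equivalent to ranking by cosine similarity.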
-------------------------------------------------------------------------------- /Baseline methods/Tok-Att/Tok-Att.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../CFG-Att" 5 | }, 6 | { 7 | "path": "../AST-Att" 8 | }, 9 | { 10 | "path": "." 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_TokenEmbeder(): 3 | conf = { 4 | # added_params 5 | 'gpu': 1, 6 | 'transform_every_modal': 0, # to make modal more complex? 7 | 'save_attn_weight': 0, 8 | 'use_tanh': 1, 9 | 'use_attn': 1, 10 | 11 | # data_params 12 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 13 | #training data 14 | 'train_tokens':'train.token.h5', 15 | 'train_desc':'train.desc.h5', 16 | #valid data 17 | 'valid_tokens':'valid.token.h5', 18 | 'valid_desc':'valid.desc.h5', 19 | # test data 20 | 'test_tokens':'test.token.h5', 21 | 'test_desc':'test.desc.h5', 22 | 23 | #parameters 24 | 'tokens_len':50, 25 | 'desc_len': 30, 26 | 'n_token_words': 20000, # len(vocabulary) + 1 27 | 'n_desc_words': 12000, # wait to decide 28 | #vocabulary info 29 | 'vocab_tokens':'vocab.token.json', 30 | 'vocab_desc':'vocab.desc.json', 31 | 32 | #training_params 33 | 'batch_size': 32, 34 | 'chunk_size': 200000, 35 | 'nb_epoch': 200, 36 | #'optimizer': 'adam', 37 | 'learning_rate':0.0003, # try 1e-4(paper) 38 | 'adam_epsilon':1e-8, 39 | 'warmup_steps':5000, 40 | 'fp16': False, 41 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
42 | #"See details at https://nvidia.github.io/apex/amp.html" 43 | 44 | # model_params 45 | 'emb_size': 300, 46 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 47 | # recurrent 48 | 'margin': 0.6, 49 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 50 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 51 | 'dropout':0.1 52 | } 53 | return conf 54 | 55 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | from utils import PAD_ID, UNK_ID, indexes2sent 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 14 | logging.basicConfig(level=logging.INFO, format="%(message)s") 15 | 16 | 17 | class CodeSearchDataset(data.Dataset): 18 | """ 19 | Dataset that has only positive samples. 20 | """ 21 | def __init__(self, data_dir, f_tokens, max_tok_len, f_descs=None, max_desc_len=None): 22 | self.max_tok_len = max_tok_len 23 | self.max_desc_len = max_desc_len 24 | # initialize file path or list of file names 25 | """read training data(list of int arrays) from a hdf5 file""" 26 | self.training = False 27 | print("loading data...") 28 | table_tokens = tables.open_file(data_dir+f_tokens) 29 | self.tokens = table_tokens.get_node('/phrases')[:].astype(np.long) 30 | self.idx_tokens = table_tokens.get_node('/indices')[:] 31 | if f_descs is not None: 32 | self.training=True 33 | table_desc = tables.open_file(data_dir+f_descs) 34 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 35 | self.idx_descs = table_desc.get_node('/indices')[:] 36 | 37 | if f_descs is not None: 38 | assert self.idx_tokens.shape[0]==self.idx_descs.shape[0] 39 | self.data_len = self.idx_tokens.shape[0] 40 | print("{} entries".format(self.data_len)) 41 | 42 | def pad_seq(self, seq, maxlen): 43 | if len(seq) < maxlen: 44 | # !!!!! numpy appending is slow. 
Try to optimize the padding 45 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq))) 46 | seq = seq[:maxlen] 47 | return seq 48 | 49 | def __getitem__(self, offset): 50 | len, pos = self.idx_tokens[offset][0], self.idx_tokens[offset][1] 51 | tok_len = min(int(len), self.max_tok_len) 52 | tokens = self.tokens[pos:pos+tok_len] 53 | tokens = self.pad_seq(tokens, self.max_tok_len) 54 | 55 | if self.training: 56 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1] 57 | good_desc_len = min(int(len), self.max_desc_len) 58 | good_desc = self.descs[pos:pos+good_desc_len] 59 | good_desc = self.pad_seq(good_desc, self.max_desc_len) 60 | 61 | rand_offset = random.randint(0, self.data_len-1) 62 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1] 63 | bad_desc_len=min(int(len), self.max_desc_len) 64 | bad_desc = self.descs[pos:pos+bad_desc_len] 65 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len) 66 | 67 | return tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len 68 | return tokens, tok_len, good_desc, good_desc_len 69 | 70 | def __len__(self): 71 | return self.data_len 72 | 73 | 74 | def load_dict(filename): 75 | return json.loads(open(filename, "r").readline()) 76 | #return pickle.load(open(filename, 'rb')) 77 | 78 | def load_vecs(fin): 79 | """read vectors (2D numpy array) from a hdf5 file""" 80 | h5f = tables.open_file(fin) 81 | h5vecs= h5f.root.vecs 82 | 83 | vecs=np.zeros(shape=h5vecs.shape,dtype=h5vecs.dtype) 84 | vecs[:]=h5vecs[:] 85 | h5f.close() 86 | return vecs 87 | 88 | def save_vecs(vecs, fout): 89 | fvec = tables.open_file(fout, 'w') 90 | atom = tables.Atom.from_dtype(vecs.dtype) 91 | filters = tables.Filters(complib='blosc', complevel=5) 92 | ds = fvec.create_carray(fvec.root,'vecs', atom, vecs.shape,filters=filters) 93 | ds[:] = vecs 94 | print('done') 95 | fvec.close() 96 | 97 | if __name__ == '__main__': 98 | input_dir = './data/github/' 99 | train_set = CodeSearchDataset(input_dir, 'train.token.h5', 60, 'train.desc.h5', 30) 100 | train_data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=32, shuffle=False, num_workers=1) 101 | logger.info('hello') 102 | #print(len(train_data_loader)) 103 | ''' 104 | use_set = CodeSearchDataset(input_dir, 'use.tokens.h5', 30) 105 | use_data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1, shuffle=False, num_workers=1) 106 | #print(len(use_data_loader)) 107 | vocab_tokens = load_dict(input_dir+'vocab.tokens.json') 108 | vocab_desc = load_dict(input_dir+'vocab.desc.json') 109 | ''' 110 | print('============ Train Data ================') 111 | k=0 112 | for batch in train_data_loader: 113 | print("batch[0].size(0): ", batch[0].size(0)) 114 | batch = tuple([t.numpy() for t in batch]) 115 | tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len = batch 116 | k+=1 117 | if k>0: break 118 | print('-------------------------------') 119 | #print(indexes2sent(tokens, vocab_tokens)) 120 | #print(indexes2sent(good_desc, vocab_desc)) 121 | 122 | 123 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenemb import TokenEmbeder -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 9 | from torch import optim 10 | import torch.nn.functional as F 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class SeqEncoder(nn.Module): 17 | def __init__(self, vocab_size, emb_size, hidden_size, n_layers=1): 18 | super(SeqEncoder, self).__init__() 19 | self.emb_size = emb_size 20 | self.hidden_size = hidden_size 21 | self.n_layers = n_layers 22 | 23 | self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0) 24 | 25 | self.init_xavier_linear(self.embedding, init_bias=False) 26 | 27 | self.lstm = nn.LSTM(emb_size, hidden_size, dropout=0.1, batch_first=True, bidirectional=False) 28 | 29 | def init_xavier_linear(self, linear, init_bias=True, gain=1, init_normal_std=1e-4): 30 | torch.nn.init.xavier_uniform_(linear.weight, gain) 31 | if init_bias: 32 | if linear.bias is not None: 33 | linear.bias.data.normal_(std=init_normal_std) 34 | 35 | def init_hidden(self, batch_size): 36 | weight = next(self.parameters()).data 37 | return (weight.new(self.n_layers, batch_size, self.hidden_size).zero_().requires_grad_(), # rnn_type == 'LSTM' 38 | weight.new(self.n_layers, batch_size, self.hidden_size).zero_().requires_grad_()) 39 | 40 | 41 | def forward(self, inputs, input_lens=None, hidden=None): 42 | batch_size, seq_len = inputs.size() 43 | inputs = self.embedding(inputs) # input: [batch_sz x seq_len] embedded: [batch_sz x seq_len x emb_sz] 44 | #inputs = F.dropout(inputs, 0.1, self.training) # mark. 45 | 46 | if input_lens is not None:# sort and pack sequence 47 | input_lens_sorted, indices = input_lens.sort(descending=True) 48 | inputs_sorted = inputs.index_select(0, indices) 49 | inputs = pack_padded_sequence(inputs_sorted, input_lens_sorted.data.tolist(), batch_first=True) 50 | 51 | hids, (h_n, c_n) = self.lstm(inputs, hidden) # hids:[b x seq x hid_sz*2](biRNN) 52 | 53 | if input_lens is not None: # reorder and pad 54 | _, inv_indices = indices.sort() 55 | hids, lens = pad_packed_sequence(hids, batch_first=True) 56 | #hids = F.dropout(hids, p=0.1, training=self.training) # mark. 
57 | hids = hids.index_select(0, inv_indices) # [batch_sz x seq_len x hid_sz] 58 | h_n = h_n.index_select(1, inv_indices) 59 | c_n = c_n.index_select(1, inv_indices) 60 | 61 | h_n = h_n[0] # [batch_sz x hid_sz] n_layers==1 and n_dirs==1 62 | c_n = c_n[0] 63 | 64 | return hids, (h_n, c_n) 65 | 66 | 67 | from torch.optim.lr_scheduler import LambdaLR 68 | 69 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): 70 | """ Create a schedule with a learning rate that decreases following the 71 | values of the cosine function between 0 and `pi * cycles` after a warmup 72 | period during which it increases linearly between 0 and 1. 73 | """ 74 | def lr_lambda(current_step): 75 | if current_step < num_warmup_steps: 76 | return float(current_step) / float(max(1, num_warmup_steps)) 77 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 78 | return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress))) 79 | 80 | return LambdaLR(optimizer, lr_lambda, last_epoch) 81 | 82 | 83 | def get_word_weights(vocab_size, padding_idx=0): 84 | '''contruct a word weighting table ''' 85 | def cal_weight(word_idx): 86 | return 1-math.exp(-word_idx) 87 | weight_table = np.array([cal_weight(w) for w in range(vocab_size)]) 88 | if padding_idx is not None: 89 | weight_table[padding_idx] = 0. # zero vector for padding dimension 90 | return torch.FloatTensor(weight_table) 91 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/output/TokenEmbeder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/output/TokenEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['KMP_DUPLICATE_LIB_OK']='True' 3 | import sys 4 | import traceback 5 | import numpy as np 6 | import argparse 7 | import threading 8 | import codecs 9 | import logging 10 | from tqdm import tqdm 11 | logger = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.INFO, format="%(message)s") 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('test begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(data_path, 31 | 
config['test_tokens'], config['tokens_len'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | # encode tokens and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | code_batch = [tensor.to(device) for tensor in batch[:2]] 40 | desc_batch = [tensor.to(device) for tensor in batch[2:4]] 41 | with torch.no_grad(): 42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 43 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 44 | # normalize when sim_measure=='cos' 45 | code_repr = normalize(code_repr) 46 | desc_repr = normalize(desc_repr) 47 | code_reprs.append(code_repr) 48 | desc_reprs.append(desc_repr) 49 | n_processed += batch[0].size(0) # +batch_size 50 | # code_reprs: [n_processed x n_hidden] 51 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 52 | 53 | # calculate similarity 54 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 55 | test_sim_result, test_rank_result = [], [] 56 | for i in tqdm(range(0, n_processed)): 57 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 58 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 59 | negsims = np.negative(sims) 60 | predict = np.argsort(negsims) 61 | 62 | # SuccessRate@k 63 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 64 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 65 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 66 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 67 | # MRR 68 | predict_list = predict.tolist() 69 | rank = predict_list.index(i) 70 | sum_mrr.append(1/float(rank+1)) 71 | 72 | # results need to be saved 73 | predict_20 = [int(k) for k in predict[0:20]] 74 | sim_20 = [sims[k] for k in predict_20] 75 | test_sim_result.append(zip(predict_20, sim_20)) 76 | test_rank_result.append(rank+1) 77 | 78 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 79 | save_path = args.data_path + 'result/' 80 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 81 | np.save(save_path+sim_result_filename, test_sim_result) 82 | np.save(save_path+rank_result_filename, test_rank_result) 83 | 84 | 85 | def parse_args(): 86 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 87 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 88 | parser.add_argument('--model', type=str, default='TokenEmbeder', help='model name') 89 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 90 | parser.add_argument('--reload_from', type=int, default=200, help='epoch to reload from') 91 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. 
'\ 92 | 'Note: should be consistent with the same argument in the repr_code.py') 93 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 94 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 95 | return parser.parse_args() 96 | 97 | 98 | if __name__ == '__main__': 99 | args = parse_args() 100 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 101 | config = getattr(configs, 'config_'+args.model)() 102 | 103 | ##### Define model ###### 104 | logger.info('Constructing Model..') 105 | model = getattr(models, args.model)(config) # initialize the model 106 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 107 | model.load_state_dict(torch.load(ckpt, map_location=device)) 108 | 109 | test(config, model, device) 110 | 111 | 112 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def make_shuffle_index(args): 10 | dir_path = args.data_path + args.dataset 11 | all_desc_file_path = dir_path + args.all_desc_file 12 | with open(all_desc_file_path, 'r') as all_desc_file: 13 | lines = all_desc_file.readlines() 14 | all_num = int(len(lines)/2) 15 | 16 | index = np.arange(all_num) 17 | np.random.seed(16) 18 | np.random.shuffle(index) 19 | print(len(index)) 20 | np.save(args.shuffle_index_file, index) 21 | 22 | def split_desc_data(args): 23 | index = np.load(args.shuffle_index_file) 24 | 25 | dir_path = args.data_path + args.dataset 26 | all_desc_file_path = dir_path + args.all_desc_file 27 | train_desc_file_path = dir_path + args.train_desc_file 28 | test_desc_file_path = dir_path + args.test_desc_file 29 | 30 | input_desc = [] 31 | with open(all_desc_file_path, 'r') as all_desc_file: 32 | lines = all_desc_file.readlines() 33 | for line in lines: 34 | if (line[0:10] != 'BeginFunc:'): 35 | input_desc.append(line) 36 | print('number of input desc:\n', len(input_desc)) 37 | 38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 39 | for i in range(0, args.trainset_num): 40 | train_desc_file.write(input_desc[index[i]]) 41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 42 | test_desc_file.write(input_desc[index[i]]) 43 | 44 | 45 | def create_desc_dict_file(args): 46 | dir_path = args.data_path + args.dataset 47 | desc_file_path = dir_path + args.train_desc_file 48 | 49 | input_desc = [] 50 | with open(desc_file_path, 'r') as desc_file: 51 | input_desc = desc_file.readlines() 52 | desc_words = [] 53 | for i in range(0, len(input_desc)): 54 | input_desc[i] = input_desc[i].rstrip('\n') 55 | desc_word_list = input_desc[i].split() 56 | for desc_word in desc_word_list: 57 | desc_words.append(desc_word) 58 | vocab_desc_info = Counter(desc_words) 59 | print(len(vocab_desc_info)) 60 | 61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 62 | vocab_desc_index = {'':0, '':1} 63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 64 | 65 | 66 | vocab_desc_file_path = dir_path + args.vocab_desc_file 67 | desc_dic_str = json.dumps(vocab_desc_index) 68 | with open(vocab_desc_file_path, 'w') as 
vocab_desc_file: 69 | vocab_desc_file.write(desc_dic_str) 70 | 71 | 72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 73 | phrases, indices = [], [] 74 | with open(sent_file_path, 'r') as sent_file: 75 | sents = sent_file.readlines() 76 | vocab = json.loads(open(vocab_file_path, "r").readline()) 77 | start_index = 0 78 | for i in range(0, len(sents)): 79 | sent = sents[i].rstrip('\n') 80 | word_list = sent.split() 81 | sent_len = min(len(word_list), maxlen) 82 | indices.append((sent_len, start_index)) 83 | for j in range(0, sent_len): 84 | word = word_list[j] 85 | phrases.append(vocab.get(word, UNK_ID)) 86 | start_index += sent_len 87 | output_file_path = sent_file_path[0:-3] + 'h5' 88 | output_file = h5py.File(output_file_path, 'w') 89 | output_file['phrases'] = phrases 90 | output_file['indices'] = indices 91 | output_file.close() 92 | 93 | 94 | def parse_args(): 95 | parser = argparse.ArgumentParser("Parse Description data for TokenEmbedder") 96 | 97 | parser.add_argument('--data_path', type=str, default='./data/') 98 | parser.add_argument('--dataset', type=str, default='github11/') 99 | 100 | parser.add_argument('--origin_desc_file', type=str, default='origin.desc.txt') 101 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 102 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 103 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 104 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 105 | 106 | parser.add_argument('--trainset_num', type=int, default=33845) 107 | parser.add_argument('--testset_num', type=int, default=2000) 108 | parser.add_argument('--desc_word_num', type=int, default=10000) 109 | parser.add_argument('--desc_maxlen', type=int, default=30) 110 | parser.add_argument('--testset_start_index', type=int, default=33845) 111 | 112 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 113 | 114 | return parser.parse_args() 115 | 116 | if __name__ == '__main__': 117 | 118 | args = parse_args() 119 | 120 | #make_shuffle_index(args) 121 | ''' 122 | split_desc_data(args) 123 | create_desc_dict_file(args) 124 | 125 | dir_path = args.data_path + args.dataset 126 | # train.desc.txt -> train.desc.h5(and test...) 
127 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 128 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 129 | ''' 130 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 59 | phrases, indices = [], [] 60 | with open(sent_file_path, 'r') as sent_file: 61 | sents = sent_file.readlines() 62 | vocab = json.loads(open(vocab_file_path, "r").readline()) 63 | start_index = 0 64 | for i in range(0, len(sents)): 65 | sent = sents[i].rstrip('\n') 66 | word_list = sent.split() 67 | sent_len = min(len(word_list), maxlen) 68 | indices.append((sent_len, start_index)) 69 | for j in range(0, sent_len): 70 | word = word_list[j] 71 | phrases.append(vocab.get(word, UNK_ID)) 72 | start_index += sent_len 73 | output_file_path = sent_file_path[0:-3] + 'h5' 74 | output_file = h5py.File(output_file_path, 'w') 75 | output_file['phrases'] = phrases 76 | output_file['indices'] = indices 77 | output_file.close() 78 | 79 | def parse_args(): 80 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder") 81 | 82 | parser.add_argument('--data_path', type=str, default='./data/') 83 | 
parser.add_argument('--dataset', type=str, default='github11/') 84 | 85 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 86 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 87 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 88 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 89 | 90 | parser.add_argument('--trainset_num', type=int, default=39152) 91 | parser.add_argument('--testset_num', type=int, default=2000) 92 | parser.add_argument('--token_word_num', type=int, default=25000) 93 | parser.add_argument('--token_maxlen', type=int, default=100) 94 | parser.add_argument('--testset_start_index', type=int, default=39152) 95 | 96 | 97 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 98 | 99 | return parser.parse_args() 100 | 101 | if __name__ == '__main__': 102 | args = parse_args() 103 | ''' 104 | dir_path = args.data_path + args.dataset 105 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file: 106 | lines = in_file.readlines() 107 | for i in range(0, len(lines)): 108 | if lines[i][0:10] != 'BeginFunc:': 109 | out_file.write(lines[i]) 110 | ''' 111 | 112 | split_token_data(args) 113 | create_token_dict_file(args) 114 | 115 | dir_path = args.data_path + args.dataset 116 | # train.token.txt -> train.token.h5(and test...) 117 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 118 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 119 | 120 | 121 | ''' 122 | dir_path = args.data_path + args.dataset 123 | all_token_file_path = dir_path + args.all_token_file 124 | with open(all_token_file_path, 'r') as all_token_file: 125 | lines = all_token_file.readlines() 126 | print(len(lines)) 127 | for i in range(0, len(lines)): 128 | line = lines[i] 129 | if line[0:10] != 'BeginFunc:': 130 | words = line.split() 131 | if len(words) == 0: 132 | print(lines[i-1]) 133 | #print(lines[i]) 134 | ''' 135 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = 
normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | ''' 64 | import nltk 65 | try: nltk.word_tokenize("hello world") 66 | except LookupError: nltk.download('punkt') 67 | 68 | def sent2indexes(sentence, vocab, maxlen): 69 | 70 | def convert_sent(sent, vocab, maxlen): 71 | idxes = np.zeros(maxlen, dtype=np.int64) 72 | idxes.fill(PAD_ID) 73 | tokens = nltk.word_tokenize(sent.strip()) 74 | idx_len = min(len(tokens), maxlen) 75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 76 | return idxes, idx_len 77 | if type(sentence) is list: 78 | inds, lens = [], [] 79 | for sent in sentence: 80 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 82 | inds.append(idxes) 83 | lens.append(idx_len) 84 | return np.vstack(inds), np.vstack(lens) 85 | else: 86 | inds, lens = sent2indexes([sentence], vocab, maxlen) 87 | return inds[0], lens[0] 88 | ''' 89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 90 | '''indexes: numpy array''' 91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 92 | indexes=filter(lambda i: i!=ignore_tok, indexes) 93 | toks, length = [], 0 94 | for idx in indexes: 95 | toks.append(ivocab.get(idx, '')) 96 | length+=1 97 | return ' '.join(toks), length 98 | 99 | ivocab = {v: k for k, v in vocab.items()} 100 | if indexes.ndim==1:# one sentence 101 | return revert_sent(indexes, ivocab, ignore_tok) 102 | else:# dim>1 103 | sentences, lens =[], [] # a batch of sentences 104 | for inds in indexes: 105 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 106 | sentences.append(sentence) 107 | lens.append(length) 108 | return sentences, lens 109 | 110 | ######################################################################## 111 | -------------------------------------------------------------------------------- /Baseline methods/readme.md: -------------------------------------------------------------------------------- 1 | # Baseline methods 2 | ### DeepCS 3 | ``` 4 | @inproceedings{gu2018deep, 5 | title={Deep code search}, 6 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun}, 7 | booktitle={2018 IEEE/ACM 40th International Conference on Software Engineering (ICSE)}, 8 | pages={933--944}, 9 | year={2018}, 10 | organization={IEEE} 11 | } 12 | ``` 13 | ### MMAN 14 | ``` 15 | @inproceedings{wan2019multi, 16 | title={Multi-modal attention network learning for semantic source code retrieval}, 17 | author={Wan, Yao and Shu, Jingdong and Sui, Yulei and Xu, Guandong and Zhao, 
Zhou and Wu, Jian and Yu, Philip}, 18 | booktitle={2019 34th IEEE/ACM International Conference on Automated Software Engineering (ASE)}, 19 | pages={13--25}, 20 | year={2019}, 21 | organization={IEEE} 22 | } 23 | ``` 24 | ### MMAN(TDC) 25 | Exploits Token + Variable-based data flow + Variable-based control flow to perform code search tasks 26 | ### AST-Att 27 | Exploits the AST and the attention mechanism to perform code search tasks 28 | ### Tok-Att 29 | Exploits Token and the attention mechanism to perform code search tasks 30 | ### CFG-Att 31 | Exploits the CFG and the attention mechanism to perform code search tasks 32 | -------------------------------------------------------------------------------- /IR2graph/readme.md: -------------------------------------------------------------------------------- 1 | # Generate our variable-based flow graph from the input IR 2 | Given an input ".ll" file generated by LLVM from the original ".c" file, the output is the set of nodes and edges of our constructed variable-based flow graph. 3 | 4 | ## Generate the VFG 5 | ``` 6 | python vfg_construct.py 7 | ``` 8 |
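A minimal helper sketch for producing the ".ll" input before running the script above, assuming clang is installed and on PATH (this helper is an illustrative assumption and is not part of vfg_construct.py):
```
# Hypothetical helper: compile a C file to textual LLVM IR (.ll) so it can be fed
# to vfg_construct.py. Assumes clang is available on PATH.
import subprocess
import sys

def c_to_ll(c_path, ll_path):
    # -S -emit-llvm asks clang to emit human-readable LLVM IR instead of object code
    subprocess.run(["clang", "-S", "-emit-llvm", c_path, "-o", ll_path], check=True)

if __name__ == "__main__":
    # e.g. python c_to_ll.py example.c example.ll
    c_to_ll(sys.argv[1], sys.argv[2])
```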
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeGraphCS 2 | 3 | 4 | # Project Overview 5 | This project provides the datasets and source code used in our DeGraphCS model. The contents of the project are as follows: 6 | 7 | 1. Dataset 8 | 9 | 2. DeGraphCS Source Code 10 | 11 | 3. Variable-based Flow Graph Construction 12 | 13 | 4. Baseline methods 14 | 15 | 5. User Study 16 | 17 | 6. Appendix 18 | 19 | ## Dataset 20 | To help people reproduce our work, we provide raw datasets which consist of **C code snippets**, the corresponding **code comments** and the **generated IR**. 21 | 22 | The raw datasets can be accessed in [Google Drive](https://drive.google.com/file/d/1PZ9TAfsrSlXLDpOCp6-0aZQxrzlP4kBA/view?usp=sharing) 23 | 24 | To feed our model, we first generate the Variable-based Flow Graphs of 41152 methods and extract the corresponding comments. Then we split the dataset into a training set of 39152 methods and a test set of 2000 methods. All of the data is put in the `dataset/` directory. 25 | 26 | ## DeGraphCS Source Code 27 | We provide the DeGraphCS model code in the `src/` directory. 28 | 29 | ## Variable-based Flow Graph Construction 30 | To construct the Variable-based Flow Graph from LLVM IR, we provide graph construction code in the `IR2graph/` directory to help users generate the graphs. 31 | 32 | ## Baseline Methods 33 | We have reproduced other code search approaches, which are put in the `Baseline methods/` directory. 34 | 35 | ## User Study 36 | We conduct a user study to evaluate our model. 37 | 38 | The 50 queries of the user study are listed in `user study/queries.txt`. For each of the four models (UNIF, MMAN, DeepCS and DeGraphCS), we obtain the corresponding search results, which are listed in the `user study/` directory. 39 | 40 | ## Appendix 41 | 42 | # Running Our Model 43 | ## Generate Datasets and Build Dictionary 44 | Run the following command to split the comment dataset into a training set and a test set, and to build the dictionary: 45 | ``` 46 | python src/util_desc.py 47 | ``` 48 | Run the following command to split the Variable-based Flow Graph dataset into a training set and a test set, and to build the dictionary: 49 | ``` 50 | python src/util_ir.py 51 | ``` 52 | ## Train the DeGraphCS Model 53 | ``` 54 | python src/train.py 55 | ``` 56 | ## Test the DeGraphCS Model 57 | ``` 58 | python src/test.py 59 | ``` 60 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | ## Process Raw Dataset 2 | 3 | To obtain a high-quality dataset, we process the raw dataset in `/raw_dataset` as follows: 4 | 5 | 1. After deleting duplicate methods, we obtain 74489 methods out of the original 151414 methods. 6 | 7 | 2. To generate a common dataset for all models (DeGraphCS, DeepCS, MMAN and UNIF), we delete the methods for which no AST or CFG can be generated. This leaves 59725 methods. 8 | 9 | 3. To make sure the dataset is high-quality, we constrain the comments' length and quality, as well as the number of nodes in the AST, CFG and VFG (Variable-based Flow Graph). 10 | After deleting the methods which do not meet these requirements, we obtain 41152 methods in `/preprocessed_dataset`. 11 | 12 | 13 | -------------------------------------------------------------------------------- /dataset/preprocessed_dataset/origin.ir.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f601269dd0365f12862fb1100d7538a957caa999c4767835a26a8e3781bd6d67 3 | size 160986007 4 | -------------------------------------------------------------------------------- /dataset/preprocessed_dataset/readme.md: -------------------------------------------------------------------------------- 1 | ## Directory Introduction 2 | 3 | **origin.desc.txt**: the full dataset of **41152** method descriptions. 4 | 5 | **origin.ir.txt**: the full dataset of **41152** Variable-based Flow Graph representations of methods. 6 | 7 | **train.desc.txt**: the training set of **39152** method descriptions. 8 | 9 | **train.ir.txt**: the training set of **39152** Variable-based Flow Graph representations of methods. 10 | 11 | **test.desc.txt**: the test set of **2000** method descriptions. 12 | 13 | **test.ir.txt**: the test set of **2000** Variable-based Flow Graph representations of methods. 
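A quick way to verify the split sizes listed above; a minimal sketch, assuming one description per line in the train/test `*.desc.txt` files (the format produced by the splitting scripts in this repository) and that any LFS-tracked files have been pulled:
```
# Hypothetical sanity check for the file counts listed above (not part of the pipeline).
from pathlib import Path

data_dir = Path("dataset/preprocessed_dataset")
expected = {"train.desc.txt": 39152, "test.desc.txt": 2000}
for name, n_expected in expected.items():
    with (data_dir / name).open("r", encoding="utf-8") as f:
        n_lines = sum(1 for _ in f)
    print(f"{name}: {n_lines} lines (expected {n_expected})")
```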
14 | -------------------------------------------------------------------------------- /dataset/preprocessed_dataset/train.ir.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6203c767bc9854e6addb905668ad89fd73a88fa542968884b681e14e3404fac1 3 | size 146141125 4 | -------------------------------------------------------------------------------- /dataset/raw_dataset/readme.md: -------------------------------------------------------------------------------- 1 | 2 | The raw datasets can be accessed in [Google Drive](https://drive.google.com/file/d/1PZ9TAfsrSlXLDpOCp6-0aZQxrzlP4kBA/view?usp=sharing) 3 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # Core parts of DeGraphCS 2 | configs.py -- Configure the hyper-parameters of DeGraphCS 3 | 4 | data_loader.py -- Load the data in batches 5 | 6 | util_ir.py -- Preprocess the original IR to generate graphs that can be processed by the graph neural network 7 | 8 | util_desc.py -- Preprocess the comments 9 | 10 | generate_interface.py -- Generate the interfaces for the third-party libraries 11 | 12 | # Generate the interfaces to solve the compilation problem 13 | ## An example to show how the compilation problem of the IR can be solved 14 | ### Initial code snippet crawled from GitHub 15 | ``` 16 | public void range(IHypercube space, IvisitKDNode visitor){ 17 | if(root == null) return; 18 | root.getRange(space, visitor); 19 | } 20 | ``` 21 | The code above cannot be compiled for the following reasons: 22 | 1. The third-party library classes IHypercube and IvisitKDNode are missing. 23 | 2. The object root and its method getRange are missing. 24 | 25 | The missing third-party library problem can be solved by adding some empty interfaces (a Root class with a getRange method, an IHypercube class and an IvisitKDNode class), since the implementation details of the methods are not necessary. 
26 | ### After adding the interface, the example source code can be successfully compiled: 27 | ``` 28 | public class Range{ 29 | private Root root; 30 | public void range(IHypercube space, IvisitKDNode visitor){ 31 | if(root == null) return; 32 | root.getRange(space, visitor); 33 | } 34 | } 35 | class Root{ 36 | public void getRange(IHypercube space, IvisitKDNode visitor){ 37 | return; 38 | } 39 | } 40 | class IHypercube{} 41 | class IvisitKDNode{} 42 | ``` 43 | -------------------------------------------------------------------------------- /src/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_IREmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_every_modal': 0, 6 | 'use_attn': 0, 7 | 'use_tanh': 1, 8 | 'save_attn_weight': 0, 9 | 10 | # GGNN 11 | 'state_dim': 512, # GGNN hidden state size 12 | 'annotation_dim': 300, 13 | 'n_edge_types': 2, 14 | 'n_node': 160, # maximum nodenum 15 | 'n_steps': 5, # propogation steps number of GGNN 16 | 'output_type': 'no_reduce', 17 | 'batch_size': 32, 18 | 'n_layers': 1, 19 | 'n_hidden': 512, 20 | 'ir_attn_mode': 'sigmoid_scalar', 21 | 'word_split': True, 22 | 'pooling_type': 'max_pooling', # ave_pooling 23 | 'max_word_num': 5, 24 | 25 | # data_params 26 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 27 | # training data 28 | 'train_ir':'train.ir.json', 29 | 'train_desc':'train.desc.h5', 30 | # test data 31 | 'test_ir':'test.ir.json', 32 | 'test_desc':'test.desc.h5', 33 | 34 | # parameters 35 | 'desc_len': 30, 36 | 'n_desc_words': 10000, 37 | 'n_ir_words': 15000, 38 | # vocabulary info 39 | 'vocab_ir':'vocab.ir.json', 40 | 'vocab_desc':'vocab.desc.json', 41 | 42 | #training_params 43 | 'nb_epoch': 100, 44 | #'optimizer': 'adam', 45 | 'learning_rate':0.0003, # try 1e-4(paper) 46 | 'adam_epsilon':1e-8, 47 | 'warmup_steps':5000, 48 | 'fp16': False, 49 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 50 | 51 | # model_params 52 | 'emb_size': 300, 53 | # recurrent 54 | 'margin': 0.6, 55 | 'sim_measure':'cos', 56 | 'dropout': 0 57 | } 58 | return conf 59 | 60 | -------------------------------------------------------------------------------- /src/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | 11 | from utils import PAD_ID, UNK_ID, indexes2sent 12 | import configs 13 | from util_ir import get_one_ir_npy_info 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | logging.basicConfig(level=logging.INFO, format="%(message)s") 18 | 19 | 20 | class CodeSearchDataset(data.Dataset): 21 | """ 22 | Dataset that has only positive samples. 
23 | """ 24 | def __init__(self, config, data_dir, f_irs, max_node_num, f_descs=None, max_desc_len=None): 25 | 26 | self.max_node_num = max_node_num 27 | self.max_desc_len = max_desc_len 28 | 29 | self.n_edge_types = config['n_edge_types'] 30 | self.state_dim = config['state_dim'] 31 | self.max_word_num = config['max_word_num'] 32 | 33 | print("Loading Data...") 34 | 35 | self.graph_dict = json.loads(open(data_dir+f_irs, 'r').readline()) 36 | 37 | table_desc = tables.open_file(data_dir+f_descs) 38 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 39 | self.idx_descs = table_desc.get_node('/indices')[:] 40 | 41 | assert len(self.graph_dict)==self.idx_descs.shape[0] 42 | self.data_len = self.idx_descs.shape[0] 43 | print("{} entries".format(self.data_len)) 44 | 45 | def pad_seq(self, seq, maxlen): 46 | if len(seq) < maxlen: 47 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq))) 48 | seq = seq[:maxlen] 49 | return seq 50 | 51 | def __getitem__(self, offset): 52 | # anno:[n_node], adjmat:[n_node x (n_node*n_edge_types*2)], node_mask:[n_node] 53 | # node_num:[1], word_num: [n_node] 54 | anno, adjmat, node_mask= get_one_ir_npy_info(self.graph_dict[str(offset)], 55 | self.max_node_num, self.n_edge_types, self.max_word_num) 56 | 57 | anno = torch.from_numpy(anno).type(torch.LongTensor) 58 | adjmat = torch.from_numpy(adjmat).type(torch.FloatTensor) 59 | node_mask = torch.Tensor(node_mask) 60 | 61 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1] 62 | good_desc_len = min(int(len), self.max_desc_len) 63 | good_desc = self.descs[pos: pos+good_desc_len] 64 | good_desc = self.pad_seq(good_desc, self.max_desc_len) 65 | 66 | rand_offset = random.randint(0, self.data_len-1) 67 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1] 68 | bad_desc_len = min(int(len), self.max_desc_len) 69 | bad_desc = self.descs[pos: pos+bad_desc_len] 70 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len) 71 | 72 | return anno, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len 73 | 74 | def __len__(self): 75 | return self.data_len 76 | 77 | def load_dict(filename): 78 | return json.loads(open(filename, "r").readline()) 79 | #return pickle.load(open(filename, 'rb')) 80 | 81 | 82 | if __name__ == '__main__': 83 | device = 'cpu' 84 | config = getattr(configs, 'config_IREmbeder')() 85 | input_dir = './data/github1/' 86 | 87 | test_set = CodeSearchDataset(config, input_dir, 'test.ir.json', 160, 'test.desc.h5', 30) 88 | test_data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False, drop_last=False, num_workers=1) 89 | print('number of batch:\n', len(test_data_loader)) 90 | print('============ Train Data ================') 91 | k = 0 92 | 93 | for batch in test_data_loader: 94 | #print(batch) 95 | anno, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len = [tensor.to(device) for tensor in batch] 96 | #print(anno) 97 | print(adjmat) 98 | for i in range(0, 160): 99 | for j in range(0, 320): 100 | if adjmat[0][i][j] == 1: 101 | print(i, j) 102 | #print(node_num) 103 | #print(word_num) 104 | k+=1 105 | if k>0: break 106 | 107 | 108 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .iremb import IREmbeder -------------------------------------------------------------------------------- /src/models/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/iremb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/iremb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/iremb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | import torch.nn.functional as F 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | parentPath = os.path.abspath("..") 13 | sys.path.insert(0, parentPath) # add parent folder to path so as to import common modules 14 | from modules import GGNN, SeqEncoder 15 | 16 | class IREmbeder(nn.Module): 17 | def __init__(self, config): 18 | super(IREmbeder, self).__init__() 19 | 20 | self.conf = config 21 | 22 | self.margin = config['margin'] 23 | self.dropout = config['dropout'] 24 | 25 | self.n_desc_words = config['n_desc_words'] 26 | self.emb_size = config['emb_size'] 27 | self.n_hidden = config['n_hidden'] 28 | self.ir_attn_mode = config['ir_attn_mode'] 29 | 30 | self.ir_encoder = GGNN(self.conf) 31 | self.desc_encoder = SeqEncoder(self.n_desc_words, self.emb_size, self.n_hidden) 32 | 33 | self.linear_attn_out = nn.Sequential(nn.Linear(self.n_hidden, self.n_hidden), 34 | nn.Tanh(), 35 | nn.Linear(self.n_hidden, self.n_hidden)) 36 | 37 | if self.conf['transform_every_modal']: 38 | self.linear_single_modal = nn.Sequential(nn.Linear(self.n_hidden, self.n_hidden), 39 | nn.Tanh(), 40 | nn.Linear(self.n_hidden, self.n_hidden)) 41 | 42 | if self.conf['save_attn_weight']: 43 | self.attn_weight_torch = [] 44 | self.node_mask_torch = [] 45 | 46 | self.self_attn = nn.Linear(self.n_hidden, self.n_hidden) 47 | self.self_attn_scalar = nn.Linear(self.n_hidden, 1) 48 | 49 | 50 | def code_encoding(self, ir_init_input_batch, ir_adjmat_batch, ir_node_mask): 51 | batch_size = ir_node_mask.size()[0] 52 | 53 | # code_feat: [batch_size x n_node x state_dim] 54 | code_feat = 
self.ir_encoder(ir_init_input_batch, ir_adjmat_batch) # forward(annotation, A) 55 | 56 | node_num = code_feat.size()[1] # n_node 57 | code_feat = code_feat.reshape(-1, node_num, self.n_hidden) 58 | # mask_1forgt0: [batch_size x n_node] 59 | mask_1forgt0 = ir_node_mask.bool().reshape(-1, node_num) 60 | 61 | if self.conf['transform_every_modal']: 62 | code_feat = torch.tanh( 63 | self.linear_single_modal(F.dropout(code_feat, self.dropout, training=self.training))) 64 | 65 | code_sa_tanh = torch.tanh(self.self_attn(code_feat.reshape(-1, self.n_hidden))) # [(batch_size * n_node) x n_hidden] 66 | code_sa_tanh = F.dropout(code_sa_tanh, self.dropout, training=self.training) 67 | # code_sa_tanh: [batch_size x n_node] 68 | code_sa_tanh = self.self_attn_scalar(code_sa_tanh).reshape(-1, node_num) 69 | 70 | code_feat = code_feat.reshape(-1, node_num, self.n_hidden) 71 | batch_size = code_feat.size()[0] 72 | 73 | self_attn_code_feat = None 74 | for _i in range(batch_size): 75 | # code_sa_tanh_one: [1 x real_node_num] 76 | code_sa_tanh_one = torch.masked_select(code_sa_tanh[_i, :], mask_1forgt0[_i, :]).reshape(1, -1) 77 | 78 | if self.ir_attn_mode == 'sigmoid_scalar': 79 | # attn_w_one: [1 x 1 x real_node_num] 80 | attn_w_one = torch.sigmoid(code_sa_tanh_one).reshape(1, 1, -1) 81 | else: 82 | attn_w_one = F.softmax(code_sa_tanh_one, dim=1).reshape(1, 1, -1) 83 | 84 | if self.conf['save_attn_weight']: 85 | self.attn_weight_torch.append(attn_w_one.detach().reshape(1, -1).cpu()) 86 | self.node_mask_torch.append(mask_1forgt0[_i, :].detach().reshape(1, -1).cpu()) 87 | 88 | # attn_feat_one: [1 x real_node_num x n_hidden] 89 | attn_feat_one = torch.masked_select(code_feat[_i, :, :].reshape(1, node_num, self.n_hidden), 90 | mask_1forgt0[_i, :].reshape(1, node_num, 1)).reshape(1, -1, self.n_hidden) 91 | # out_to_cat: [1 x n_hidden] 92 | out_to_cat = torch.bmm(attn_w_one, attn_feat_one).reshape(1, self.n_hidden) 93 | # self_attn_code_feat: [batch_size x n_hidden] 94 | self_attn_code_feat = out_to_cat if self_attn_code_feat is None else torch.cat( 95 | (self_attn_code_feat, out_to_cat), 0) 96 | 97 | if self.conf['use_attn']: 98 | self_attn_code_feat = torch.tanh( 99 | self.linear_attn_out( 100 | F.dropout(self_attn_code_feat, self.dropout, training=self.training)) 101 | ) 102 | elif self.conf['use_tanh']: 103 | self_attn_code_feat = torch.tanh(self_attn_code_feat) 104 | 105 | # self_attn_code_feat: [batch_size x n_hidden] 106 | return self_attn_code_feat 107 | 108 | def desc_encoding(self, desc, desc_len): 109 | batch_size = desc.size()[0] 110 | desc_enc_hidden = self.desc_encoder.init_hidden(batch_size) 111 | # desc_enc_hidden: [2 x batch_size x n_hidden] 112 | _, desc_enc_hidden = self.desc_encoder(desc, desc_len) 113 | # desc_feat: [batch_size x n_hidden] 114 | desc_feat = desc_enc_hidden[0].reshape(batch_size, self.n_hidden) 115 | 116 | if self.conf['transform_every_modal']: 117 | desc_feat = torch.tanh( 118 | self.linear_single_modal( 119 | F.dropout(desc_feat, self.dropout, training=self.training) 120 | ) 121 | ) 122 | elif self.conf['use_tanh']: 123 | desc_feat = torch.tanh(desc_feat) 124 | 125 | # desc_feat: [batch_size x n_hidden] 126 | return desc_feat 127 | 128 | 129 | def forward(self, ir_anno, ir_adjmat, ir_node_mask, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len): 130 | # code_repr: [batch_size x n_hidden] 131 | ir_repr = self.code_encoding(ir_anno, ir_adjmat, ir_node_mask) 132 | # desc_repr: [batch_size x n_hidden] 133 | desc_anchor_repr = self.desc_encoding(desc_anchor, desc_anchor_len) 134 
| desc_neg_repr = self.desc_encoding(desc_neg, desc_neg_len) 135 | 136 | # sim: [batch_sz] 137 | anchor_sim = F.cosine_similarity(ir_repr, desc_anchor_repr) 138 | neg_sim = F.cosine_similarity(ir_repr, desc_neg_repr) 139 | 140 | loss = (self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean() 141 | 142 | return loss 143 | -------------------------------------------------------------------------------- /src/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | 13 | import torch 14 | 15 | import models, configs, data_loader 16 | from modules import get_cosine_schedule_with_warmup 17 | from utils import similarity, normalize 18 | from data_loader import * 19 | 20 | 21 | def test(config, model, device): 22 | logger.info('Test Begin...') 23 | 24 | model.eval() 25 | model.to(device) 26 | 27 | # load data 28 | data_path = args.data_path+args.dataset+'/' 29 | test_set = eval(config['dataset_name'])(config, data_path, 30 | config['test_ir'], config['n_node'], 31 | config['test_desc'], config['desc_len']) 32 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 33 | shuffle=False, drop_last=False, num_workers=1) 34 | # encode tokens and descs 35 | code_reprs, desc_reprs = [], [] 36 | n_processed = 0 37 | for batch in data_loader: 38 | # batch[0:3]: init_input, adjmat, node_mask 39 | code_batch = [tensor.to(device) for tensor in batch[:3]] 40 | # batch[3:5]: good_desc, good_desc_len 41 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 42 | with torch.no_grad(): 43 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 44 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 45 | # normalize when sim_measure=='cos' 46 | code_repr = normalize(code_repr) 47 | desc_repr = normalize(desc_repr) 48 | code_reprs.append(code_repr) 49 | desc_reprs.append(desc_repr) 50 | n_processed += batch[0].size(0) # +batch_size 51 | # code_reprs: [n_processed x n_hidden] 52 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 53 | 54 | # calculate similarity 55 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 56 | test_sim_result, test_rank_result = [], [] 57 | for i in tqdm(range(0, n_processed)): 58 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 59 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 60 | negsims = np.negative(sims) 61 | predict = np.argsort(negsims) 62 | 63 | # SuccessRate@k 64 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 65 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 66 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 67 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 68 | # MRR 69 | predict_list = predict.tolist() 70 | rank = predict_list.index(i) 71 | sum_mrr.append(1/float(rank+1)) 72 | 73 | # results need to be saved 74 | predict_20 = [int(k) for k in predict[0:20]] 75 | sim_20 = [sims[k] for k in predict_20] 76 | test_sim_result.append(zip(predict_20, sim_20)) 77 | test_rank_result.append(rank+1) 78 | 79 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, 
MRR={np.mean(sum_mrr)}') 80 | save_path = args.data_path + 'result/' 81 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 82 | np.save(save_path+sim_result_filename, test_sim_result) 83 | np.save(save_path+rank_result_filename, test_rank_result) 84 | 85 | 86 | def parse_args(): 87 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 88 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 89 | parser.add_argument('--model', type=str, default='IREmbeder', help='model name') 90 | parser.add_argument('-d', '--dataset', type=str, default='github1', help='name of dataset.java, python') 91 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 92 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 93 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 94 | return parser.parse_args() 95 | 96 | 97 | if __name__ == '__main__': 98 | args = parse_args() 99 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 100 | config = getattr(configs, 'config_'+args.model)() 101 | 102 | ##### Define model ###### 103 | logger.info('Constructing Model..') 104 | model = getattr(models, args.model)(config) # initialize the model 105 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 106 | model.load_state_dict(torch.load(ckpt, map_location=device)) 107 | 108 | test(config, model, device) 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 
'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | import nltk 64 | try: nltk.word_tokenize("hello world") 65 | except LookupError: nltk.download('punkt') 66 | 67 | def sent2indexes(sentence, vocab, maxlen): 68 | '''sentence: a string or list of string 69 | return: a numpy array of word indices 70 | ''' 71 | def convert_sent(sent, vocab, maxlen): 72 | idxes = np.zeros(maxlen, dtype=np.int64) 73 | idxes.fill(PAD_ID) 74 | tokens = nltk.word_tokenize(sent.strip()) 75 | idx_len = min(len(tokens), maxlen) 76 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 77 | return idxes, idx_len 78 | if type(sentence) is list: 79 | inds, lens = [], [] 80 | for sent in sentence: 81 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 82 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 83 | inds.append(idxes) 84 | lens.append(idx_len) 85 | return np.vstack(inds), np.vstack(lens) 86 | else: 87 | inds, lens = sent2indexes([sentence], vocab, maxlen) 88 | return inds[0], lens[0] 89 | 90 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 91 | '''indexes: numpy array''' 92 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 93 | indexes=filter(lambda i: i!=ignore_tok, indexes) 94 | toks, length = [], 0 95 | for idx in indexes: 96 | toks.append(ivocab.get(idx, '')) 97 | length+=1 98 | return ' '.join(toks), length 99 | 100 | ivocab = {v: k for k, v in vocab.items()} 101 | if indexes.ndim==1:# one sentence 102 | return revert_sent(indexes, ivocab, ignore_tok) 103 | else:# dim>1 104 | sentences, lens =[], [] # a batch of sentences 105 | for inds in indexes: 106 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 107 | sentences.append(sentence) 108 | lens.append(length) 109 | return sentences, lens 110 | 111 | ######################################################################## 112 | -------------------------------------------------------------------------------- /user study/queries.txt: -------------------------------------------------------------------------------- 1 | write a byte to output buffer of device 2 | insert a new value into list 3 | check string is suffix of another string 4 | remove all elements in list 5 | stop logging messages to syslog 6 | render bignum into decimal 7 | look for match name in system dictionary 8 | remove an element from a row vector 9 | load a bignum from int 10 | grab a lock on a mutex 11 | advance a carray cursor to next row of output 12 | split string into args respecting backwhack and quote 13 | retrieve page from the pager cache 14 | round up to nearest integer 15 | create a new thread safe queue of size siz 16 | push integer to end of list 17 | restart timer from current point in time 18 | return file name in path 19 | get name of current executable 20 | read machine uptime 21 | store integer into register 22 | stop stream server 23 | get length of UCS2 string 24 | contrain maximum of a range 25 | pad given buffer with len padding characters 26 | write data in output buffers to client 27 | generate trace call to print 28 | get dimensions of given bmp file 29 | add extension to filename 30 | read 
one word from onboard RAM 31 | encode ucs2 string into utf8 string 32 | check pointer is in the heap 33 | search a file in directory recursively 34 | compress block of raw data 35 | allocate and clean buffer 36 | receive N byte from socket 37 | lookup key in a hash map 38 | fast integral power function 39 | check if directory is empty 40 | create a new tree node 41 | free a dirty page 42 | create message with given type 43 | encrypt byte sequence 44 | parse checksum file 45 | remove trailing blanks, tabs and newlines 46 | search last occurrence of char in string 47 | binary search in sorted array of size 48 | calculate checksum of checkpoint 49 | judge whether two strings are equal 50 | return random integer value between min and max -------------------------------------------------------------------------------- /user study/readme.md: -------------------------------------------------------------------------------- 1 | # The questionnaire used to select the participants 2 | ``` 3 | 1) What grade are you in? 4 | 2) Have you taken C language courses in the past few years? 5 | 3) Do you have C language programming experience? If so, how long is your programming experience? 6 | 4) Which C language projects have you participated in? Please introduce them. 7 | ``` 8 | # The contents of each file 9 | ``` 10 | queries.txt: the 50 queries randomly selected from our test set, with further filtering (e.g., removing clear technical keywords) 11 | code_search_DeGraphCS.txt: the top-10 search results returned by DeGraphCS 12 | code_search_DeepCS.txt: the top-10 search results returned by DeepCS 13 | code_search_MMAN.txt: the top-10 search results returned by MMAN 14 | code_search_UNIF.txt: the top-10 search results returned by UNIF 15 | ``` 16 | --------------------------------------------------------------------------------