├── Appendix ├── README.md ├── baseline_comparison.png ├── propagation model.png ├── subgraph_compare.png └── vfg_of_loop_recur.png ├── Baseline methods ├── AST-Att │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_ast.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── astemb.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── astemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── ASTEmbeder │ │ │ └── .DS_Store │ ├── test.py │ ├── train.py │ ├── util_ast.py │ ├── util_desc.py │ └── utils.py ├── CFG-Att │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_cfg.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cfgemb.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── cfgemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── CFGEmbeder │ │ │ └── .DS_Store │ ├── test.py │ ├── train.py │ ├── util_cfg.py │ ├── util_desc.py │ └── utils.py ├── DeepCS │ ├── .DS_Store │ ├── Tok-Att.code-workspace │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── data_prepare.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── jointemb.py │ ├── modules.py │ ├── output │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ ├── train.py │ ├── user_study.py │ ├── util_desc.py │ ├── util_name.py │ ├── util_tok.py │ └── utils.py ├── MMAN(TDC) │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_cfg.cpython-36.pyc │ │ ├── util_dfg.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cfgemb.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── cfgemb.py │ ├── modules.py │ ├── output │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ ├── train.py │ ├── util_cfg.py │ ├── util_desc.py │ ├── util_dfg.py │ ├── util_tok.py │ └── utils.py ├── MMAN │ ├── .DS_Store │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ ├── util_cfg.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── data_prepare │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── util_ast.cpython-36.pyc │ │ │ ├── util_cfg.cpython-36.pyc │ │ │ ├── util_desc.cpython-36.pyc │ │ │ ├── util_tok.cpython-36.pyc │ │ │ └── utils.cpython-36.pyc │ │ ├── util_ast.py │ │ ├── util_cfg.py │ │ ├── util_desc.py │ │ ├── util_tok.py │ │ └── utils.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cfgemb.cpython-36.pyc │ │ │ ├── 
jointemb.cpython-36.pyc │ │ │ ├── multiemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── multiemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── MultiEmbeder │ │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ └── train.py ├── Tok-Att │ ├── .DS_Store │ ├── Tok-Att.code-workspace │ ├── __pycache__ │ │ ├── configs.cpython-36.pyc │ │ ├── data_loader.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── configs.py │ ├── data │ │ └── .DS_Store │ ├── data_loader.py │ ├── data_prepare.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── jointemb.cpython-36.pyc │ │ │ └── tokenemb.cpython-36.pyc │ │ └── tokenemb.py │ ├── modules.py │ ├── output │ │ ├── .DS_Store │ │ └── TokenEmbeder │ │ │ └── .DS_Store │ ├── shuffle_index.npy │ ├── test.py │ ├── train.py │ ├── util_desc.py │ ├── util_tok.py │ └── utils.py └── readme.md ├── IR2graph ├── readme.md └── vfg_construct.py ├── README.md ├── dataset ├── README.md ├── preprocessed_dataset │ ├── origin.desc.txt │ ├── origin.ir.txt │ ├── readme.md │ ├── test.desc.txt │ ├── test.ir.txt │ ├── train.desc.txt │ └── train.ir.txt └── raw_dataset │ └── readme.md ├── src ├── README.md ├── configs.py ├── data_loader.py ├── generate_interface.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── cfgemb.cpython-36.pyc │ │ ├── iremb.cpython-36.pyc │ │ ├── jointemb.cpython-36.pyc │ │ └── tokenemb.cpython-36.pyc │ └── iremb.py ├── modules.py ├── test.py ├── train.py ├── util_desc.py ├── util_ir.py └── utils.py └── user study ├── code_search_DeGraphCS.txt ├── code_search_DeepCS.txt ├── code_search_MMAN.txt ├── code_search_UNIF.txt ├── queries.txt └── readme.md /Appendix/README.md: -------------------------------------------------------------------------------- 1 | # Online-Repo 2 | 3 | ## An example to show the recursive call and loop functions can be represented as close as possible from our variable-based flow graph 4 | #### An example "get sum" function realized by loop function 5 | ``` 6 | int get_sum(int N){ 7 | int sum = 0; 8 | while(N != 0){ 9 | sum += N; 10 | N-=1; 11 | } 12 | return sum; 13 | } 14 | ``` 15 | #### An example "get sum" function realized by recursive call 16 | ``` 17 | int get_sum(int N){ 18 | if(N == 0){return N;} 19 | else{ 20 | int sum; 21 | sum = N + get_sum(N-1); 22 | return sum; 23 | } 24 | } 25 | ``` 26 | #### The corresponding generated variable-based flow graphs are shown as below: 27 | Annotation:for recursive call "get_sum(N-1)" in (b), we link the result of "N-1" to input parameter "N", and we regard the return value "sum" of get_sum as the result of "get_sum(N-1)" so that we link return value "sum" to "add" operation): 28 | the constructed graph
29 | 30 | #### To better illustrate the common characteristics of the variable-based flow graphs constructed by deGraphCS from the two different realizations above, we extract the core part of each realization for comparison: 31 | ``` 32 | sum += N; N -= 1; // in the loop version 33 | sum = N + get_sum(N-1); // in the recursive version 34 | ``` 35 | #### The corresponding sub-graphs of the core part are shown below, and the common part can be clearly seen: 36 | the constructed graph
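To make the overlap concrete without the figure, the snippet below is a toy, hand-written sketch (plain Python sets, not the repository's LLVM-IR-based construction in IR2graph/vfg_construct.py): each core snippet is written down as directed VFG edges between variable and operation nodes, and the shared structure is simply the intersection of the two edge sets. The node names are illustrative assumptions, not real IR identifiers.
```
# Hypothetical, simplified edge lists; names are illustrative, not taken from real LLVM IR.
loop_core = {
    ("N", "add"), ("sum", "add"), ("add", "sum"),   # sum += N
    ("N", "sub"), ("1", "sub"), ("sub", "N"),       # N -= 1
}

recursive_core = {
    ("N", "add"), ("sum", "add"), ("add", "sum"),   # sum = N + get_sum(N-1); the callee's return value "sum" feeds "add"
    ("N", "sub"), ("1", "sub"), ("sub", "N"),       # N-1, whose result is linked back to the parameter "N"
}

common = loop_core & recursive_core
print(sorted(common))   # all six edges are shared: the two cores collapse to the same sub-graph
```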
37 | 38 | #### The corresponding ASTs and CFGs generated from the two realizations above are shown below; the difference between them is obvious: 39 | the constructed graph
40 | 41 | ## The details of the equations and algorithms in deGraphCS 42 | ### Implementation details of the attention mechanism over the whole graph and the comments 43 | ``` 44 | self_attn = nn.Linear(self.n_hidden, self.n_hidden) 45 | self_attn_scalar = nn.Linear(self.n_hidden, 1) 46 | ``` 47 | 48 | Here, the function f() in Equation (2) and Equation (4) denotes the first MLP layer, nn.Linear(self.n_hidden, self.n_hidden). 49 | 50 | u_vfg corresponds to the second MLP layer, nn.Linear(self.n_hidden, 1), and can be seen as a high-level representation of the VFG nodes. 51 | 52 | h_vfg is the final embedding of the whole graph, obtained as the weighted sum of each node's final embedding with the weights produced by self_attn_scalar. The same distinction between u and h applies to the corresponding quantities for the comments in Equation (4) and Equation (5). 53 | 54 | ### The aggregation function used in Equation (1) 55 | The aggregation function used in Equation (1) can be illustrated as follows: 56 | propagation model
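As a companion to the figure, the following is a minimal sketch of one such propagation step, written in the style of a gated graph neural network. It is an illustration under stated assumptions, not the repository's actual model code: the class name PropagationSketch, the assumed adjacency layout [batch, n_node, 2 * n_node * n_edge_types], and the single shared linear transform per direction are hypothetical; only the hyper-parameter names (state_dim, annotation_dim, n_edge_types, n_steps) follow the ones that appear in configs.py.
```
import torch
import torch.nn as nn

class PropagationSketch(nn.Module):
    """Illustrative GGNN-style propagation; a sketch, not the repository's exact module."""
    def __init__(self, state_dim, annotation_dim, n_edge_types, n_steps):
        super().__init__()
        self.state_dim, self.annotation_dim = state_dim, annotation_dim
        self.n_edge_types, self.n_steps = n_edge_types, n_steps
        # one linear transform per direction, producing one message block per edge type
        self.edge_in = nn.Linear(state_dim, state_dim * n_edge_types)
        self.edge_out = nn.Linear(state_dim, state_dim * n_edge_types)
        self.gru = nn.GRUCell(2 * state_dim, state_dim)

    def forward(self, annotation, adj):
        # annotation: [b, n, annotation_dim]; adj: [b, n, 2 * n * n_edge_types] (assumed layout)
        b, n, _ = annotation.size()
        # Eq. 1: copy the node annotations into the hidden state and pad the rest with zeros
        h = torch.cat([annotation,
                       annotation.new_zeros(b, n, self.state_dim - self.annotation_dim)], dim=-1)
        for _ in range(self.n_steps):
            # Eq. 2: pass messages along incoming/outgoing edges with edge-type-dependent parameters
            m_in = self.edge_in(h).view(b, n * self.n_edge_types, self.state_dim)
            m_out = self.edge_out(h).view(b, n * self.n_edge_types, self.state_dim)
            a_in = torch.bmm(adj[:, :, :n * self.n_edge_types], m_in)
            a_out = torch.bmm(adj[:, :, n * self.n_edge_types:], m_out)
            # GRU-like update: combine the aggregated messages with the previous hidden state
            a = torch.cat([a_in, a_out], dim=-1)
            h = self.gru(a.view(b * n, -1), h.view(b * n, -1)).view(b, n, -1)
        return h  # final node embeddings, later pooled by the attention read-out described above
```
Under the assumed layout, column j * n_edge_types + t of the first (incoming) half of adj, read at row i, marks an edge of type t from node j into node i. The overall shape matches the [n_node x (n_node * n_edge_types * 2)] adjacency described in the baselines' data_loader.py, although the exact column ordering used there may differ.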
57 | 58 | In the functions above, Eq. 1 is the initialization step, which copies node annotations into the first components 59 | of the hidden state and pads the rest with zeros. 60 | 61 | Eq. 2 is the step that passes information between 62 | different nodes of the graph via incoming and outgoing edges with parameters dependent on the edge 63 | type and direction. 64 | 65 | The remaining are GRU-like updates that incorporate information from the other nodes and from the previous timestep 66 | to update each node’s hidden state. 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Appendix/baseline_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/baseline_comparison.png -------------------------------------------------------------------------------- /Appendix/propagation model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/propagation model.png -------------------------------------------------------------------------------- /Appendix/subgraph_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/subgraph_compare.png -------------------------------------------------------------------------------- /Appendix/vfg_of_loop_recur.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/vfg_of_loop_recur.png -------------------------------------------------------------------------------- /Baseline methods/AST-Att/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/util_ast.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/util_ast.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_ASTEmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_attn_out': 0, 6 | 'transform_every_modal': 0, # to make modal more complex? 7 | 'save_attn_weight': 0, 8 | 'use_tanh': 1, 9 | 'use_attn': 1, 10 | 'use_desc_attn': 1, 11 | 12 | # tree lstm 13 | 'treelstm_cell_type': 'nary', # nary or childsum 14 | 15 | # data_params 16 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 17 | #training data 18 | 'train_ast':'train.ast.json', 19 | 'train_desc':'train.desc.h5', 20 | # test data 21 | 'test_ast':'test.ast.json', 22 | 'test_desc':'test.desc.h5', 23 | 24 | #parameters 25 | 'desc_len': 30, 26 | 'n_ast_words': 16000, # len(vocabulary) + 1 27 | 'n_desc_words': 10000, # wait to decide 28 | #vocabulary info 29 | 'vocab_ast':'vocab.ast.json', 30 | 'vocab_desc':'vocab.desc.json', 31 | 32 | #training_params 33 | 'batch_size': 32, 34 | 'nb_epoch': 200, 35 | #'optimizer': 'adam', 36 | 'learning_rate':0.0003, # try 1e-4(paper) 37 | 'adam_epsilon':1e-8, 38 | 'warmup_steps':5000, 39 | 'fp16': False, 40 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 41 | #"See details at https://nvidia.github.io/apex/amp.html" 42 | 43 | # model_params 44 | 'emb_size': 300, 45 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 46 | # recurrent 47 | 'margin': 0.6, 48 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 49 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 
50 | 'dropout':0.1 51 | } 52 | return conf 53 | 54 | -------------------------------------------------------------------------------- /Baseline methods/AST-Att/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .astemb import ASTEmbeder -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/astemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/astemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/AST-Att/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/output/ASTEmbeder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/output/ASTEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/AST-Att/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | import torch 15 | 
16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('test begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(data_path, 31 | config['test_ast'], config['vocab_ast'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | collate_fn=batcher(device), shuffle=False, drop_last=False, num_workers=0) 35 | # encode asts and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | code_batch = [tensor for tensor in batch[:2]] 40 | desc_batch = [tensor for tensor in batch[2:4]] 41 | with torch.no_grad(): 42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 43 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 44 | # normalize when sim_measure=='cos' 45 | code_repr = normalize(code_repr) 46 | desc_repr = normalize(desc_repr) 47 | code_reprs.append(code_repr) 48 | desc_reprs.append(desc_repr) 49 | n_processed += batch[2].size(0) # +batch_size 50 | # code_reprs: [n_processed x n_hidden] 51 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 52 | 53 | # calculate similarity 54 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 55 | test_sim_result, test_rank_result = [], [] 56 | for i in tqdm(range(0, n_processed)): 57 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 58 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 59 | negsims = np.negative(sims) 60 | predict = np.argsort(negsims) 61 | 62 | # SuccessRate@k 63 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 64 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 65 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 66 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 67 | # MRR 68 | predict_list = predict.tolist() 69 | rank = predict_list.index(i) 70 | sum_mrr.append(1/float(rank+1)) 71 | 72 | # results need to be saved 73 | predict_20 = [int(k) for k in predict[0:20]] 74 | sim_20 = [sims[k] for k in predict_20] 75 | test_sim_result.append(zip(predict_20, sim_20)) 76 | test_rank_result.append(rank+1) 77 | 78 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 79 | save_path = args.data_path + 'result/' 80 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 81 | #np.save(save_path+sim_result_filename, test_sim_result) 82 | #np.save(save_path+rank_result_filename, test_rank_result) 83 | 84 | 85 | def parse_args(): 86 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 87 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 88 | parser.add_argument('--model', type=str, default='ASTEmbeder', help='model name') 89 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 90 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 91 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. 
'\ 92 | 'Note: should be consistent with the same argument in the repr_code.py') 93 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 94 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 95 | return parser.parse_args() 96 | 97 | 98 | if __name__ == '__main__': 99 | args = parse_args() 100 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 101 | config = getattr(configs, 'config_'+args.model)() 102 | 103 | ##### Define model ###### 104 | logger.info('Constructing Model..') 105 | model = getattr(models, args.model)(config) # initialize the model 106 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 107 | model.load_state_dict(torch.load(ckpt, map_location=device)) 108 | 109 | test(config, model, device) 110 | 111 | 112 | -------------------------------------------------------------------------------- /Baseline methods/AST-Att/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | from utils import UNK_ID 7 | 8 | def make_shuffle_index(args): 9 | dir_path = args.data_path + args.dataset 10 | all_desc_file_path = dir_path + args.all_desc_file 11 | with open(all_desc_file_path, 'r') as all_desc_file: 12 | lines = all_desc_file.readlines() 13 | all_num = int(len(lines)/2) 14 | 15 | index = np.arange(all_num) 16 | np.random.seed(16) 17 | np.random.shuffle(index) 18 | #print(index) 19 | np.save(args.shuffle_index_file, index) 20 | 21 | def split_data(args): 22 | 23 | dir_path = args.data_path + args.dataset 24 | all_desc_file_path = dir_path + args.all_desc_file 25 | train_desc_file_path = dir_path + args.train_desc_file 26 | test_desc_file_path = dir_path + args.test_desc_file 27 | 28 | input_desc = [] 29 | with open(all_desc_file_path, 'r') as all_desc_file: 30 | lines = all_desc_file.readlines() 31 | for line in lines: 32 | if (line[0:10] != 'BeginFunc:'): 33 | input_desc.append(line) 34 | print('number of input desc:\n', len(input_desc)) 35 | 36 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 37 | for i in range(0, args.trainset_num): 38 | train_desc_file.write(input_desc[i]) 39 | for i in range(args.testset_start_ind, args.testset_start_ind+args.testset_num): 40 | test_desc_file.write(input_desc[i]) 41 | 42 | 43 | def create_dict_file(args): 44 | dir_path = args.data_path + args.dataset 45 | desc_file_path = dir_path + args.train_desc_file 46 | 47 | input_desc = [] 48 | with open(desc_file_path, 'r') as desc_file: 49 | input_desc = desc_file.readlines() 50 | desc_words = [] 51 | for i in range(0, len(input_desc)): 52 | input_desc[i] = input_desc[i].rstrip('\n') 53 | desc_word_list = input_desc[i].split() 54 | for desc_word in desc_word_list: 55 | desc_words.append(desc_word) 56 | vocab_desc_info = Counter(desc_words) 57 | print(len(vocab_desc_info)) 58 | 59 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 60 | vocab_desc_index = {'':0, '':1} 61 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 62 | 63 | 64 | vocab_desc_file_path = dir_path + args.vocab_desc_file 65 | desc_dic_str = json.dumps(vocab_desc_index) 66 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 67 | vocab_desc_file.write(desc_dic_str) 68 | 69 | 70 | def 
sents2indexes(sent_file_path, vocab_file_path, maxlen): 71 | phrases, indices = [], [] 72 | with open(sent_file_path, 'r') as sent_file: 73 | sents = sent_file.readlines() 74 | vocab = json.loads(open(vocab_file_path, "r").readline()) 75 | start_index = 0 76 | for i in range(0, len(sents)): 77 | sent = sents[i].rstrip('\n') 78 | word_list = sent.split() 79 | sent_len = min(len(word_list), maxlen) 80 | indices.append((sent_len, start_index)) 81 | for j in range(0, sent_len): 82 | word = word_list[j] 83 | phrases.append(vocab.get(word, UNK_ID)) 84 | start_index += sent_len 85 | output_file_path = sent_file_path[0:-3] + 'h5' 86 | output_file = h5py.File(output_file_path, 'w') 87 | output_file['phrases'] = phrases 88 | output_file['indices'] = indices 89 | output_file.close() 90 | 91 | 92 | def parse_args(): 93 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 94 | 95 | parser.add_argument('--data_path', type=str, default='./data/') 96 | parser.add_argument('--dataset', type=str, default='github/') 97 | 98 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 99 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 100 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 101 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 102 | 103 | parser.add_argument('--trainset_num', type=int, default=32000) 104 | parser.add_argument('--testset_num', type=int, default=1000) 105 | parser.add_argument('--testset_start_ind', type=int, default=32000) 106 | parser.add_argument('--desc_word_num', type=int, default=10000) 107 | parser.add_argument('--desc_maxlen', type=int, default=30) 108 | 109 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 110 | 111 | return parser.parse_args() 112 | 113 | if __name__ == '__main__': 114 | args = parse_args() 115 | 116 | split_data(args) 117 | create_dict_file(args) 118 | 119 | dir_path = args.data_path + args.dataset 120 | # train.desc.txt -> train.desc.h5(and test...) 
121 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 122 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 123 | 124 | -------------------------------------------------------------------------------- /Baseline methods/AST-Att/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | ''' 64 | import nltk 65 | try: nltk.word_tokenize("hello world") 66 | except LookupError: nltk.download('punkt') 67 | 68 | def sent2indexes(sentence, vocab, maxlen): 69 | 70 | def convert_sent(sent, vocab, maxlen): 71 | idxes = np.zeros(maxlen, dtype=np.int64) 72 | idxes.fill(PAD_ID) 73 | tokens = nltk.word_tokenize(sent.strip()) 74 | idx_len = min(len(tokens), maxlen) 75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 76 | return idxes, idx_len 77 | if type(sentence) is list: 78 | inds, lens = [], [] 79 | for sent in sentence: 80 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 82 | inds.append(idxes) 83 | lens.append(idx_len) 84 | return np.vstack(inds), np.vstack(lens) 85 | else: 86 | inds, lens = sent2indexes([sentence], vocab, maxlen) 87 | return inds[0], lens[0] 
88 | ''' 89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 90 | '''indexes: numpy array''' 91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 92 | indexes=filter(lambda i: i!=ignore_tok, indexes) 93 | toks, length = [], 0 94 | for idx in indexes: 95 | toks.append(ivocab.get(idx, '')) 96 | length+=1 97 | return ' '.join(toks), length 98 | 99 | ivocab = {v: k for k, v in vocab.items()} 100 | if indexes.ndim==1:# one sentence 101 | return revert_sent(indexes, ivocab, ignore_tok) 102 | else:# dim>1 103 | sentences, lens =[], [] # a batch of sentences 104 | for inds in indexes: 105 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 106 | sentences.append(sentence) 107 | lens.append(length) 108 | return sentences, lens 109 | 110 | ######################################################################## 111 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/util_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_CFGEmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_every_modal': 0, # to make modal more complex? 
6 | 'save_attn_weight': 0, 7 | 'use_tanh': 1, 8 | 'use_attn': 1, 9 | 10 | # GGNN 11 | 'state_dim': 512, # GGNN hidden state size 12 | 'annotation_dim': 5, 13 | 'n_edge_types': 2, 14 | 'n_node': 200, # could be less than 512, like the maximum nodenum 15 | 'n_steps': 5, # propogation steps number of GGNN 16 | 'output_type': 'no_reduce', 17 | 'batch_size': 32, 18 | 'n_layers': 1, 19 | 'n_hidden': 512, 20 | 'cfg_attn_mode': 'sigmoid_scalar', 21 | 22 | # data_params 23 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 24 | #training data 25 | 'train_cfg':'train.cfg.txt', 26 | 'train_desc':'train.desc.h5', 27 | # test data 28 | 'test_cfg':'test.cfg.txt', 29 | 'test_desc':'test.desc.h5', 30 | 31 | #parameters 32 | 'desc_len': 30, 33 | 'n_desc_words': 10000, # wait to decide 34 | #vocabulary info 35 | 'vocab_desc':'vocab.desc.json', 36 | 37 | #training_params 38 | 'chunk_size': 200000, 39 | 'nb_epoch': 200, 40 | #'optimizer': 'adam', 41 | 'learning_rate':0.0003, # try 1e-4(paper) 42 | 'adam_epsilon':1e-8, 43 | 'warmup_steps':5000, 44 | 'fp16': False, 45 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 46 | #"See details at https://nvidia.github.io/apex/amp.html" 47 | 48 | # model_params 49 | 'emb_size': 300, 50 | # recurrent 51 | 'margin': 0.6, 52 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 53 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 54 | 'dropout': 0.1 55 | } 56 | return conf 57 | 58 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | 11 | from utils import PAD_ID, UNK_ID, indexes2sent 12 | import configs 13 | from util_cfg import get_cfg_npy_info, get_one_cfg_npy_info 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | logging.basicConfig(level=logging.INFO, format="%(message)s") 18 | 19 | 20 | class CodeSearchDataset(data.Dataset): 21 | """ 22 | Dataset that has only positive samples. 
23 | """ 24 | def __init__(self, config, data_dir, f_cfgs, max_node_num, f_descs=None, max_desc_len=None): 25 | 26 | self.max_node_num = max_node_num 27 | self.max_desc_len = max_desc_len 28 | self.n_edge_types = config['n_edge_types'] 29 | self.state_dim = config['state_dim'] 30 | self.annotation_dim = config['annotation_dim'] 31 | 32 | # initialize file path or list of file names 33 | self.training = False 34 | print("Loading Data...") 35 | 36 | self.mark_list = [] 37 | start_index, end_index = [0, 0] 38 | with open(data_dir+f_cfgs, 'r') as cfg_file: 39 | self.cfg_lines = cfg_file.readlines() 40 | for i in range(0, len(self.cfg_lines)): 41 | self.cfg_lines[i] = self.cfg_lines[i].rstrip('\n') 42 | if self.cfg_lines[i][0:10] == 'BeginFunc:' and i != 0: 43 | end_index = i 44 | self.mark_list.append([start_index, end_index]) 45 | start_index = i 46 | self.mark_list.append([start_index, len(self.cfg_lines)]) 47 | 48 | ''' 49 | # cfg_adjmat: [all_num x n_node x (n_node * n_edge_types * 2)] 50 | # cfg_init_input: [all_num x n_node x state_dim] 51 | # cfg_node_mask: [all_num x n_node] 52 | self.cfg_adjmat, self.cfg_init_input, self.cfg_node_mask = get_cfg_npy_info(self.cfg_lines, 53 | self.max_node_num, self.n_edge_types, self.state_dim, self.annotation_dim) 54 | ''' 55 | 56 | if f_descs is not None: 57 | self.training = True 58 | table_desc = tables.open_file(data_dir+f_descs) 59 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 60 | self.idx_descs = table_desc.get_node('/indices')[:] 61 | ''' 62 | if f_descs is not None: 63 | assert len(self.cfg_adjmat)==self.idx_descs.shape[0] 64 | ''' 65 | self.data_len = self.idx_descs.shape[0] 66 | print("{} entries".format(self.data_len)) 67 | 68 | def pad_seq(self, seq, maxlen): 69 | if len(seq) < maxlen: 70 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq))) 71 | seq = seq[:maxlen] 72 | return seq 73 | 74 | def __getitem__(self, offset): 75 | #print('offset:\n', offset) 76 | #print('cfg start_index = {}, end_index = {}'.format(self.mark_list[offset][0], self.mark_list[offset][1])) 77 | 78 | input_cfg_lines = self.cfg_lines[self.mark_list[offset][0]: self.mark_list[offset][1]] 79 | adjmat, init_input, node_mask = get_one_cfg_npy_info(input_cfg_lines, 80 | self.max_node_num, self.n_edge_types, self.state_dim, self.annotation_dim) 81 | 82 | if self.training: 83 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1] 84 | good_desc_len = min(int(len), self.max_desc_len) 85 | good_desc = self.descs[pos: pos+good_desc_len] 86 | good_desc = self.pad_seq(good_desc, self.max_desc_len) 87 | 88 | rand_offset = random.randint(0, self.data_len-1) 89 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1] 90 | bad_desc_len = min(int(len), self.max_desc_len) 91 | bad_desc = self.descs[pos: pos+bad_desc_len] 92 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len) 93 | 94 | return torch.Tensor(init_input), torch.Tensor(adjmat), torch.Tensor(node_mask), good_desc, good_desc_len, bad_desc, bad_desc_len 95 | return torch.Tensor(init_input), torch.Tensor(adjmat), torch.Tensor(node_mask), good_desc, good_desc_len 96 | 97 | def __len__(self): 98 | return self.data_len 99 | 100 | def load_dict(filename): 101 | return json.loads(open(filename, "r").readline()) 102 | #return pickle.load(open(filename, 'rb')) 103 | 104 | 105 | if __name__ == '__main__': 106 | device = 'cpu' 107 | config = getattr(configs, 'config_CFGEmbeder')() 108 | input_dir = './data/github/' 109 | 110 | train_set = CodeSearchDataset(config, input_dir, 
'train.cfg.txt', 512, 'train.desc.h5', 30) 111 | train_data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=5, shuffle=False, drop_last=False, num_workers=1) 112 | print('number of batch:\n', len(train_data_loader)) 113 | ''' 114 | use_set = CodeSearchDataset(input_dir, 'use.tokens.h5', 30) 115 | use_data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1, shuffle=False, num_workers=1) 116 | #print(len(use_data_loader)) 117 | vocab_tokens = load_dict(input_dir+'vocab.tokens.json') 118 | vocab_desc = load_dict(input_dir+'vocab.desc.json') 119 | ''' 120 | vocab_desc = load_dict(input_dir+'vocab.desc.json') 121 | print('============ Train Data ================') 122 | k = 0 123 | for epo in range(0,3): 124 | for batch in train_data_loader: 125 | print("batch[1].size(): ", batch[1].size()) 126 | #batch = tuple([t.numpy() for t in batch]) 127 | init_input, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len = [tensor.to(device) for tensor in batch] 128 | print(adjmat.dtype) 129 | #print(batch) 130 | k+=1 131 | #if k>0: break 132 | print('-------------------------------') 133 | print(indexes2sent(good_desc, vocab_desc)) 134 | #print(indexes2sent(good_desc, vocab_desc)) 135 | 136 | 137 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .cfgemb import CFGEmbeder -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/output/CFGEmbeder/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/output/CFGEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('Test Begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(config, data_path, 31 | config['test_cfg'], config['n_node'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | # encode tokens and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | # batch[0:3]: init_input, adjmat, node_mask 40 | code_batch = [tensor.to(device) for tensor in batch[:3]] 41 | # batch[3:5]: good_desc, good_desc_len 42 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 43 | with torch.no_grad(): 44 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 45 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 46 | # normalize when sim_measure=='cos' 47 | code_repr = normalize(code_repr) 48 | desc_repr = normalize(desc_repr) 49 | code_reprs.append(code_repr) 50 | desc_reprs.append(desc_repr) 51 | n_processed += batch[0].size(0) # +batch_size 52 | # code_reprs: [n_processed x n_hidden] 53 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 54 | 55 | # calculate similarity 56 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 57 | test_sim_result, test_rank_result = [], [] 58 | for i in tqdm(range(0, n_processed)): 59 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 60 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 61 | negsims = np.negative(sims) 62 | predict = np.argsort(negsims) 63 | 64 | # SuccessRate@k 65 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 66 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 67 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 68 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 69 | # MRR 70 | predict_list = predict.tolist() 71 | rank = predict_list.index(i) 72 | sum_mrr.append(1/float(rank+1)) 73 | 74 | # results need to be saved 75 | predict_20 = [int(k) for k in predict[0:20]] 76 | sim_20 = [sims[k] for k in predict_20] 77 | test_sim_result.append(zip(predict_20, sim_20)) 78 | test_rank_result.append(rank+1) 79 | 80 | logger.info(f'R@1={np.mean(sum_1)}, 
R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 81 | save_path = args.data_path + 'result/' 82 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 83 | np.save(save_path+sim_result_filename, test_sim_result) 84 | np.save(save_path+rank_result_filename, test_rank_result) 85 | 86 | 87 | def parse_args(): 88 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 89 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 90 | parser.add_argument('--model', type=str, default='CFGEmbeder', help='model name') 91 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 92 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 93 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\ 94 | 'Note: should be consistent with the same argument in the repr_code.py') 95 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 96 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 97 | return parser.parse_args() 98 | 99 | 100 | if __name__ == '__main__': 101 | args = parse_args() 102 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 103 | config = getattr(configs, 'config_'+args.model)() 104 | 105 | ##### Define model ###### 106 | logger.info('Constructing Model..') 107 | model = getattr(models, args.model)(config) # initialize the model 108 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 109 | model.load_state_dict(torch.load(ckpt, map_location=device)) 110 | 111 | test(config, model, device) 112 | 113 | 114 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | from utils import UNK_ID 7 | 8 | def make_shuffle_index_num(args, all_num): 9 | index = np.arange(all_num) 10 | np.random.seed(16) 11 | np.random.shuffle(index) 12 | print('index:\n', index) 13 | np.save(args.shuffle_index_file, index) 14 | 15 | def make_shuffle_index(args): 16 | dir_path = args.data_path + args.dataset 17 | all_desc_file_path = dir_path + args.all_desc_file 18 | with open(all_desc_file_path, 'r') as all_desc_file: 19 | lines = all_desc_file.readlines() 20 | all_num = int(len(lines)/2) 21 | print('all_num of desc:\n', all_num) 22 | 23 | index = np.arange(all_num) 24 | np.random.seed(16) 25 | np.random.shuffle(index) 26 | print('index:\n', index) 27 | np.save(args.shuffle_index_file, index) 28 | 29 | def split_data(args): 30 | index = np.load(args.shuffle_index_file) 31 | 32 | dir_path = args.data_path + args.dataset 33 | all_desc_file_path = dir_path + args.all_desc_file 34 | train_desc_file_path = dir_path + args.train_desc_file 35 | test_desc_file_path = dir_path + args.test_desc_file 36 | 37 | input_desc = [] 38 | with open(all_desc_file_path, 'r') as all_desc_file: 39 | lines = all_desc_file.readlines() 40 | for line in lines: 41 | if (line[0:10] != 'BeginFunc:'): 42 | input_desc.append(line) 43 | print('number of input desc:\n', len(input_desc)) 44 | 45 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as 
test_desc_file: 46 | for i in range(0, args.trainset_num): 47 | train_desc_file.write(input_desc[index[i]]) 48 | for i in range(32000, 32000+args.testset_num): 49 | test_desc_file.write(input_desc[index[i]]) 50 | 51 | 52 | def create_dict_file(args): 53 | dir_path = args.data_path + args.dataset 54 | desc_file_path = dir_path + args.train_desc_file 55 | 56 | input_desc = [] 57 | with open(desc_file_path, 'r') as desc_file: 58 | input_desc = desc_file.readlines() 59 | desc_words = [] 60 | for i in range(0, len(input_desc)): 61 | input_desc[i] = input_desc[i].rstrip('\n') 62 | desc_word_list = input_desc[i].split() 63 | for desc_word in desc_word_list: 64 | desc_words.append(desc_word) 65 | vocab_desc_info = Counter(desc_words) 66 | print(len(vocab_desc_info)) 67 | 68 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 69 | vocab_desc_index = {'':0, '':1} 70 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 71 | 72 | 73 | vocab_desc_file_path = dir_path + args.vocab_desc_file 74 | desc_dic_str = json.dumps(vocab_desc_index) 75 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 76 | vocab_desc_file.write(desc_dic_str) 77 | 78 | 79 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 80 | phrases, indices = [], [] 81 | with open(sent_file_path, 'r') as sent_file: 82 | sents = sent_file.readlines() 83 | vocab = json.loads(open(vocab_file_path, "r").readline()) 84 | start_index = 0 85 | for i in range(0, len(sents)): 86 | sent = sents[i].rstrip('\n') 87 | word_list = sent.split() 88 | sent_len = min(len(word_list), maxlen) 89 | indices.append((sent_len, start_index)) 90 | for j in range(0, sent_len): 91 | word = word_list[j] 92 | phrases.append(vocab.get(word, UNK_ID)) 93 | start_index += sent_len 94 | output_file_path = sent_file_path[0:-3] + 'h5' 95 | output_file = h5py.File(output_file_path, 'w') 96 | output_file['phrases'] = phrases 97 | output_file['indices'] = indices 98 | output_file.close() 99 | 100 | 101 | def parse_args(): 102 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 103 | 104 | parser.add_argument('--data_path', type=str, default='./data/') 105 | parser.add_argument('--dataset', type=str, default='github/') 106 | 107 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 108 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 109 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 110 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 111 | 112 | parser.add_argument('--trainset_num', type=int, default=32000) 113 | parser.add_argument('--testset_num', type=int, default=1000) 114 | parser.add_argument('--desc_word_num', type=int, default=10000) 115 | parser.add_argument('--desc_maxlen', type=int, default=30) 116 | 117 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 118 | 119 | return parser.parse_args() 120 | 121 | if __name__ == '__main__': 122 | args = parse_args() 123 | 124 | #make_shuffle_index_num(args, 33000) 125 | #split_data(args) 126 | #create_dict_file(args) 127 | 128 | dir_path = args.data_path + args.dataset 129 | # train.desc.txt -> train.desc.h5(and test...) 
130 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 131 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 132 | 133 | -------------------------------------------------------------------------------- /Baseline methods/CFG-Att/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | 64 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 65 | '''indexes: numpy array''' 66 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 67 | indexes=filter(lambda i: i!=ignore_tok, indexes) 68 | toks, length = [], 0 69 | for idx in indexes: 70 | toks.append(ivocab.get(idx, '')) 71 | length+=1 72 | return ' '.join(toks), length 73 | 74 | ivocab = {v: k for k, v in vocab.items()} 75 | if indexes.ndim==1:# one sentence 76 | return revert_sent(indexes, ivocab, ignore_tok) 77 | else:# dim>1 78 | sentences, lens =[], [] # a batch of sentences 79 | for inds in indexes: 80 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 81 | sentences.append(sentence) 82 | lens.append(length) 83 | return sentences, lens 84 | 85 | ######################################################################## 86 | 
-------------------------------------------------------------------------------- /Baseline methods/DeepCS/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/DeepCS/Tok-Att.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../CFG-Att" 5 | }, 6 | { 7 | "path": "../AST-Att" 8 | }, 9 | { 10 | "path": "." 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_JointEmbeder(): 3 | conf = { 4 | # data_params 5 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 6 | #training data 7 | 'train_name':'train.name.h5', 8 | 'train_tokens':'train.token.h5', 9 | 'train_desc':'train.desc.h5', 10 | # test data 11 | 'test_name':'test.name.h5', 12 | 'test_tokens':'test.token.h5', 13 | 'test_desc':'test.desc.h5', 14 | # user study data 15 | 'all_name': 'all.name.h5', 16 | 'all_tokens': 'all.token.h5', 17 | 'query_desc': 'query.desc.h5', 18 | #parameters 19 | 'name_len': 6, 20 | 'tokens_len': 50, 21 | 'desc_len': 30, 22 | 'n_words': 10000, # len(vocabulary) + 1 23 | #vocabulary info 24 | 'vocab_name':'vocab.name.json', 25 | 'vocab_tokens':'vocab.token.json', 26 | 'vocab_desc':'vocab.desc.json', 27 | 28 | #training_params 29 | 'batch_size': 32, 30 | 'nb_epoch': 200, 31 | #'optimizer': 'adam', 32 | 'learning_rate':0.0003, # try 1e-4(paper) 33 | 'adam_epsilon':1e-8, 34 | 'warmup_steps':5000, 35 | 'fp16': False, 36 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
37 | #"See details at https://nvidia.github.io/apex/amp.html" 38 | 39 | # model_params 40 | 'use_desc_attn': 1, 41 | 'use_tanh': 1, 42 | 'emb_size': 512, 43 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 44 | 'lstm_dims': 256, 45 | # recurrent 46 | 'margin': 0.6, 47 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 48 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 49 | 'dropout':0, 50 | } 51 | return conf 52 | 53 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .jointemb import JointEmbeder -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/DeepCS/models/jointemb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | import torch.nn.functional as F 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | parentPath = os.path.abspath("..") 13 | sys.path.insert(0, parentPath)# add parent folder to path so as to import common modules 14 | from modules import SeqEncoder, BOWEncoder, SeqEncoder2 15 | 16 | class JointEmbeder(nn.Module): 17 | def __init__(self, config): 18 | super(JointEmbeder, self).__init__() 19 | self.conf = config 20 | self.margin = config['margin'] 21 | self.dropout = config['dropout'] 22 | self.n_hidden = config['n_hidden'] 23 | 24 | self.name_encoder = SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims']) 25 | self.tok_encoder = BOWEncoder(config['n_words'],config['emb_size'],config['n_hidden']) 26 | self.desc_encoder = SeqEncoder2(config['n_words'],config['emb_size'],config['n_hidden']) 27 | 28 | self.w_name = 
nn.Linear(2*config['lstm_dims'], config['n_hidden']) 29 | self.w_tok = nn.Linear(config['emb_size'], config['n_hidden']) 30 | #self.w_desc = nn.Linear(2*config['lstm_dims'], config['n_hidden']) 31 | self.fuse3 = nn.Linear(config['n_hidden'], config['n_hidden']) 32 | 33 | self.self_attn2 = nn.Linear(self.n_hidden, self.n_hidden) 34 | self.self_attn_scalar2 = nn.Linear(self.n_hidden, 1) 35 | 36 | self.init_weights() 37 | 38 | def init_weights(self):# Initialize Linear Weight 39 | for m in [self.w_name, self.w_tok, self.fuse3]: 40 | m.weight.data.uniform_(-0.1, 0.1) #nn.init.xavier_normal_(m.weight) 41 | nn.init.constant_(m.bias, 0.) 42 | 43 | def code_encoding(self, name, name_len, tokens, tok_len): 44 | name_repr = self.name_encoder(name, name_len) 45 | tok_repr = self.tok_encoder(tokens, tok_len) 46 | code_repr = self.fuse3(torch.tanh(self.w_name(name_repr)+self.w_tok(tok_repr))) 47 | return code_repr 48 | 49 | 50 | def desc_encoding(self, desc, desc_len): 51 | batch_size = desc.size()[0] 52 | desc_enc_hidden = self.desc_encoder.init_hidden(batch_size) 53 | # desc_enc_hidden: [2 x batch_size x n_hidden] 54 | desc_feat, desc_enc_hidden = self.desc_encoder(desc, desc_len, desc_enc_hidden) 55 | desc_enc_hidden = desc_enc_hidden[0] 56 | 57 | if self.conf['use_desc_attn']: 58 | seq_len = desc_feat.size()[1] 59 | 60 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 61 | unpack_len_list = desc_len.long().to(device) 62 | range_tensor = torch.arange(seq_len).to(device) 63 | mask_1forgt0 = range_tensor[None, :] < unpack_len_list[:, None] 64 | mask_1forgt0 = mask_1forgt0.reshape(-1, seq_len) 65 | 66 | desc_sa_tanh = torch.tanh(self.self_attn2(desc_feat.reshape(-1, self.n_hidden))) # [(batch_sz * seq_len) x n_hidden] 67 | desc_sa_tanh = F.dropout(desc_sa_tanh, self.dropout, training=self.training) 68 | desc_sa_tanh = self.self_attn_scalar2(desc_sa_tanh).reshape(-1, seq_len) # [batch_sz x seq_len] 69 | desc_feat = desc_feat.reshape(-1, seq_len, self.n_hidden) 70 | 71 | self_attn_desc_feat = None 72 | for _i in range(batch_size): 73 | desc_sa_tanh_one = torch.masked_select(desc_sa_tanh[_i, :], mask_1forgt0[_i, :]).reshape(1, -1) 74 | # attn_w_one: [1 x 1 x seq_len] 75 | attn_w_one = F.softmax(desc_sa_tanh_one, dim=1).reshape(1, 1, -1) 76 | 77 | # attn_feat_one: [1 x seq_len x n_hidden] 78 | attn_feat_one = torch.masked_select(desc_feat[_i, :, :].reshape(1, seq_len, self.n_hidden), 79 | mask_1forgt0[_i, :].reshape(1, seq_len, 1)).reshape(1, -1, self.n_hidden) 80 | # out_to_cat: [1 x n_hidden] 81 | out_to_cat = torch.bmm(attn_w_one, attn_feat_one).reshape(1, self.n_hidden) 82 | # self_attn_code_feat: [batch_sz x n_hidden] 83 | self_attn_desc_feat = out_to_cat if self_attn_desc_feat is None else torch.cat( 84 | (self_attn_desc_feat, out_to_cat), 0) 85 | 86 | else: 87 | self_attn_desc_feat = desc_enc_hidden.reshape(batch_size, self.n_hidden) 88 | 89 | if self.conf['use_tanh']: 90 | self_attn_desc_feat = torch.tanh(self_attn_desc_feat) 91 | 92 | # desc_feat: [batch_size x n_hidden] 93 | return self_attn_desc_feat 94 | 95 | 96 | def similarity(self, code_vec, desc_vec): 97 | assert self.conf['sim_measure'] in ['cos', 'poly', 'euc', 'sigmoid', 'gesd', 'aesd'], "invalid similarity measure" 98 | if self.conf['sim_measure']=='cos': 99 | return F.cosine_similarity(code_vec, desc_vec) 100 | elif self.conf['sim_measure']=='poly': 101 | return (0.5*torch.matmul(code_vec, desc_vec.t()).diag()+1)**2 102 | elif self.conf['sim_measure']=='sigmoid': 103 | return torch.tanh(torch.matmul(code_vec, 
desc_vec.t()).diag()+1) 104 | elif self.conf['sim_measure'] in ['euc', 'gesd', 'aesd']: 105 | euc_dist = torch.dist(code_vec, desc_vec, 2) # or torch.norm(code_vec-desc_vec,2) 106 | euc_sim = 1 / (1 + euc_dist) 107 | if self.conf['sim_measure']=='euc': return euc_sim 108 | sigmoid_sim = torch.sigmoid(torch.matmul(code_vec, desc_vec.t()).diag()+1) 109 | if self.conf['sim_measure']=='gesd': 110 | return euc_sim * sigmoid_sim 111 | elif self.conf['sim_measure']=='aesd': 112 | return 0.5*(euc_sim+sigmoid_sim) 113 | 114 | def forward(self, name, name_len, tokens, tok_len, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len): 115 | # code_repr: [batch_sz x n_hidden] 116 | code_repr = self.code_encoding(name, name_len, tokens, tok_len) 117 | # desc_repr: [batch_sz x n_hidden] 118 | desc_anchor_repr = self.desc_encoding(desc_anchor, desc_anchor_len) 119 | desc_neg_repr = self.desc_encoding(desc_neg, desc_neg_len) 120 | 121 | # sim: [batch_sz] 122 | anchor_sim = self.similarity(code_repr, desc_anchor_repr) 123 | neg_sim = self.similarity(code_repr, desc_neg_repr) 124 | 125 | loss = (self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean() 126 | 127 | return loss -------------------------------------------------------------------------------- /Baseline methods/DeepCS/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/DeepCS/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/DeepCS/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['KMP_DUPLICATE_LIB_OK']='True' 3 | import sys 4 | import traceback 5 | import numpy as np 6 | import argparse 7 | import threading 8 | import codecs 9 | import logging 10 | from tqdm import tqdm 11 | logger = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.INFO, format="%(message)s") 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('test begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(data_path, 31 | config['test_name'], config['name_len'], 32 | config['test_tokens'], config['tokens_len'], 33 | config['test_desc'], config['desc_len']) 34 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=2000, 35 | shuffle=False, drop_last=False, num_workers=1) 36 | # encode tokens and descs 37 | code_reprs, desc_reprs = [], [] 38 | n_processed = 0 39 | for batch in data_loader: 40 | code_batch = [tensor.to(device) for tensor in batch[:4]] 41 | desc_batch = [tensor.to(device) for tensor in batch[4:6]] 42 | with torch.no_grad(): 43 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 44 | desc_repr = 
model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 45 | # normalize when sim_measure=='cos' 46 | code_repr = normalize(code_repr) 47 | desc_repr = normalize(desc_repr) 48 | code_reprs.append(code_repr) 49 | desc_reprs.append(desc_repr) 50 | n_processed += batch[0].size(0) # +batch_size 51 | # code_reprs: [n_processed x n_hidden] 52 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 53 | 54 | # calculate similarity 55 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 56 | test_sim_result, test_rank_result = [], [] 57 | for i in tqdm(range(0, n_processed)): 58 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 59 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 60 | negsims = np.negative(sims) 61 | predict = np.argsort(negsims) 62 | 63 | # SuccessRate@k 64 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 65 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 66 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 67 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 68 | # MRR 69 | predict_list = predict.tolist() 70 | rank = predict_list.index(i) 71 | sum_mrr.append(1/float(rank+1)) 72 | 73 | # results need to be saved 74 | predict_20 = [int(k) for k in predict[0:20]] 75 | sim_20 = [sims[k] for k in predict_20] 76 | test_sim_result.append(zip(predict_20, sim_20)) 77 | test_rank_result.append(rank+1) 78 | 79 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 80 | save_path = args.data_path + 'result/' 81 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 82 | np.save(save_path+sim_result_filename, test_sim_result) 83 | np.save(save_path+rank_result_filename, test_rank_result) 84 | 85 | 86 | def parse_args(): 87 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 88 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 89 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name') 90 | parser.add_argument('-d', '--dataset', type=str, default='github11', help='name of dataset.java, python') 91 | parser.add_argument('--reload_from', type=int, default=200, help='epoch to reload from') 92 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 93 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 94 | return parser.parse_args() 95 | 96 | 97 | if __name__ == '__main__': 98 | args = parse_args() 99 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 100 | config = getattr(configs, 'config_'+args.model)() 101 | 102 | ##### Define model ###### 103 | logger.info('Constructing Model..') 104 | model = getattr(models, args.model)(config) # initialize the model 105 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 106 | model.load_state_dict(torch.load(ckpt, map_location=device)) 107 | 108 | test(config, model, device) 109 | 110 | 111 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/user_study.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | 
from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | 13 | import torch 14 | 15 | import models, configs, data_loader 16 | from modules import get_cosine_schedule_with_warmup 17 | from utils import similarity, normalize 18 | from data_loader import * 19 | 20 | 21 | def test(config, model, device): 22 | logger.info('Test Begin...') 23 | 24 | model.eval() 25 | model.to(device) 26 | 27 | 28 | data_path = args.data_path+args.dataset+'/' 29 | 30 | code_base_set = eval(config['dataset_name'])(data_path, 31 | config['all_name'], config['name_len'], 32 | config['all_tokens'], config['tokens_len']) 33 | code_data_loader = torch.utils.data.DataLoader(dataset=code_base_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | 36 | code_reprs = [] 37 | code_processed = 0 38 | for batch in code_data_loader: 39 | # batch[0:4]: name, name_len, token, token_len 40 | code_batch = [tensor.to(device) for tensor in batch[:4]] 41 | with torch.no_grad(): 42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 43 | code_repr = normalize(code_repr) 44 | code_reprs.append(code_repr) 45 | code_processed += batch[0].size(0) # +batch_size 46 | # code_reprs: [code_processed x n_hidden] 47 | code_reprs = np.vstack(code_reprs) 48 | print('processed code num: ', code_processed) 49 | 50 | 51 | query_desc_set = eval(config['dataset_name'])(data_path, 52 | f_descs=config['query_desc'], max_desc_len=config['desc_len']) 53 | desc_data_loader = torch.utils.data.DataLoader(dataset=query_desc_set, batch_size=32, 54 | shuffle=False, drop_last=False, num_workers=1) 55 | 56 | desc_reprs = [] 57 | desc_processed = 0 58 | for batch in desc_data_loader: 59 | # batch[0:2]: good_desc, good_desc_len 60 | desc_batch = [tensor.to(device) for tensor in batch[0:2]] 61 | with torch.no_grad(): 62 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hidden_size] 63 | desc_repr = normalize(desc_repr) 64 | desc_reprs.append(desc_repr) 65 | desc_processed += batch[0].size(0) # +batch_size 66 | # desc_reprs: [desc_processed x n_hidden] 67 | desc_reprs = np.vstack(desc_reprs) 68 | print('processed desc num: ', desc_processed) 69 | 70 | 71 | query_desc_index_file_path = data_path + args.query_desc_index_file 72 | desc_index = [] 73 | with open(query_desc_index_file_path, 'r') as query_desc_index_file: 74 | lines = query_desc_index_file.readlines() 75 | for i in range(0, len(lines)): 76 | line = lines[i].strip() 77 | desc_index.append(int(line)) 78 | print('desc_index: ', desc_index) 79 | 80 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 81 | test_sim_result, test_rank_result = [], [] 82 | for i in tqdm(range(0, desc_processed)): 83 | ind = desc_index[i] 84 | 85 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 86 | sims = np.dot(code_reprs, desc_vec.T)[:, 0] # [code_processed] 87 | negsims = np.negative(sims) 88 | predict = np.argsort(negsims) 89 | 90 | # SuccessRate@k 91 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 92 | sum_1.append(1.0) if ind in predict_1 else sum_1.append(0.0) 93 | sum_5.append(1.0) if ind in predict_5 else sum_5.append(0.0) 94 | sum_10.append(1.0) if ind in predict_10 else sum_10.append(0.0) 95 | # MRR 96 | predict_list = predict.tolist() 97 | rank = predict_list.index(ind) 98 | sum_mrr.append(1/float(rank+1)) 99 | 100 | # results need to be saved 101 | 
predict_20 = [int(k) for k in predict[0:20]] 102 | sim_20 = [sims[k] for k in predict_20] 103 | test_sim_result.append(zip(predict_20, sim_20)) 104 | test_rank_result.append(rank+1) 105 | 106 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 107 | save_path = args.data_path + 'result/' 108 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 109 | np.save(save_path+sim_result_filename, test_sim_result) 110 | np.save(save_path+rank_result_filename, test_rank_result) 111 | 112 | 113 | def parse_args(): 114 | parser = argparse.ArgumentParser("Test Code Search(Embedding) Model For User Study") 115 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 116 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name') 117 | parser.add_argument('-d', '--dataset', type=str, default='github_user_3', help='name of dataset.java, python') 118 | parser.add_argument('--query_desc_index_file', type=str, default='query.desc.index.txt') 119 | parser.add_argument('--reload_from', type=int, default=185, help='epoch to reload from') 120 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 121 | return parser.parse_args() 122 | 123 | 124 | if __name__ == '__main__': 125 | args = parse_args() 126 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 127 | config = getattr(configs, 'config_'+args.model)() 128 | 129 | ##### Define model ###### 130 | logger.info('Constructing Model..') 131 | 132 | model = getattr(models, args.model)(config) # initialize the model 133 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 134 | model.load_state_dict(torch.load(ckpt, map_location=device)) 135 | 136 | test(config, model, device) 137 | 138 | 139 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def make_shuffle_index(args): 10 | dir_path = args.data_path + args.dataset 11 | all_desc_file_path = dir_path + args.all_desc_file 12 | with open(all_desc_file_path, 'r') as all_desc_file: 13 | lines = all_desc_file.readlines() 14 | all_num = int(len(lines)/2) 15 | 16 | index = np.arange(all_num) 17 | np.random.seed(16) 18 | np.random.shuffle(index) 19 | print(len(index)) 20 | np.save(args.shuffle_index_file, index) 21 | 22 | def split_desc_data(args): 23 | index = np.load(args.shuffle_index_file) 24 | 25 | dir_path = args.data_path + args.dataset 26 | all_desc_file_path = dir_path + args.all_desc_file 27 | train_desc_file_path = dir_path + args.train_desc_file 28 | test_desc_file_path = dir_path + args.test_desc_file 29 | 30 | input_desc = [] 31 | with open(all_desc_file_path, 'r') as all_desc_file: 32 | lines = all_desc_file.readlines() 33 | for line in lines: 34 | if (line[0:10] != 'BeginFunc:'): 35 | input_desc.append(line) 36 | print('number of input desc:\n', len(input_desc)) 37 | 38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 39 | for i in range(0, args.trainset_num): 40 | train_desc_file.write(input_desc[index[i]]) 41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 42 | 
test_desc_file.write(input_desc[index[i]]) 43 | 44 | 45 | def create_desc_dict_file(args): 46 | dir_path = args.data_path + args.dataset 47 | desc_file_path = dir_path + args.train_desc_file 48 | 49 | input_desc = [] 50 | with open(desc_file_path, 'r') as desc_file: 51 | input_desc = desc_file.readlines() 52 | desc_words = [] 53 | for i in range(0, len(input_desc)): 54 | input_desc[i] = input_desc[i].rstrip('\n') 55 | desc_word_list = input_desc[i].split() 56 | for desc_word in desc_word_list: 57 | desc_words.append(desc_word) 58 | vocab_desc_info = Counter(desc_words) 59 | print(len(vocab_desc_info)) 60 | 61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 62 | vocab_desc_index = {'':0, '':1} 63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 64 | 65 | 66 | vocab_desc_file_path = dir_path + args.vocab_desc_file 67 | desc_dic_str = json.dumps(vocab_desc_index) 68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 69 | vocab_desc_file.write(desc_dic_str) 70 | 71 | 72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 73 | phrases, indices = [], [] 74 | with open(sent_file_path, 'r') as sent_file: 75 | sents = sent_file.readlines() 76 | vocab = json.loads(open(vocab_file_path, "r").readline()) 77 | start_index = 0 78 | for i in range(0, len(sents)): 79 | sent = sents[i].rstrip('\n') 80 | word_list = sent.split() 81 | sent_len = min(len(word_list), maxlen) 82 | indices.append((sent_len, start_index)) 83 | for j in range(0, sent_len): 84 | word = word_list[j] 85 | phrases.append(vocab.get(word, UNK_ID)) 86 | start_index += sent_len 87 | output_file_path = sent_file_path[0:-3] + 'h5' 88 | output_file = h5py.File(output_file_path, 'w') 89 | output_file['phrases'] = phrases 90 | output_file['indices'] = indices 91 | output_file.close() 92 | 93 | 94 | def parse_args(): 95 | parser = argparse.ArgumentParser("Parse Description data for TokenEmbedder") 96 | 97 | parser.add_argument('--data_path', type=str, default='./data/') 98 | parser.add_argument('--dataset', type=str, default='github11/') 99 | 100 | parser.add_argument('--origin_desc_file', type=str, default='origin.desc.txt') 101 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 102 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 103 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 104 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 105 | 106 | parser.add_argument('--trainset_num', type=int, default=39152) 107 | parser.add_argument('--testset_num', type=int, default=2000) 108 | parser.add_argument('--desc_word_num', type=int, default=10000) 109 | parser.add_argument('--desc_maxlen', type=int, default=30) 110 | parser.add_argument('--testset_start_index', type=int, default=39152) 111 | 112 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 113 | 114 | return parser.parse_args() 115 | 116 | if __name__ == '__main__': 117 | 118 | args = parse_args() 119 | 120 | #make_shuffle_index(args) 121 | 122 | #split_desc_data(args) 123 | 124 | create_desc_dict_file(args) 125 | 126 | dir_path = args.data_path + args.dataset 127 | # train.desc.txt -> train.desc.h5(and test...) 
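# Alignment note: the shuffle_index.npy written by make_shuffle_index above is the
# same file loaded by split_token_data in util_tok.py, so after shuffling the i-th
# line of train.desc.txt still describes the i-th line of train.token.txt;
# regenerating the index for only one modality would silently break the
# code/description pairing.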
128 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 129 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 130 | 131 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 59 | phrases, indices = [], [] 60 | with open(sent_file_path, 'r') as sent_file: 61 | sents = sent_file.readlines() 62 | vocab = json.loads(open(vocab_file_path, "r").readline()) 63 | start_index = 0 64 | for i in range(0, len(sents)): 65 | sent = sents[i].rstrip('\n') 66 | word_list = sent.split() 67 | sent_len = min(len(word_list), maxlen) 68 | indices.append((sent_len, start_index)) 69 | for j in range(0, sent_len): 70 | word = word_list[j] 71 | phrases.append(vocab.get(word, UNK_ID)) 72 | start_index += sent_len 73 | output_file_path = sent_file_path[0:-3] + 'h5' 74 | output_file = h5py.File(output_file_path, 'w') 75 | output_file['phrases'] = phrases 76 | output_file['indices'] = indices 77 | output_file.close() 78 | 79 | def remove_dup_tokens(args): 80 | dir_path = args.data_path + args.dataset 81 | origin_token_file_path = dir_path + args.origin_token_file 82 | all_token_file_path = dir_path + args.all_token_file 83 
| 84 | with open(origin_token_file_path, 'r') as origin_token_file, open(all_token_file_path, 'w') as all_token_file: 85 | lines = origin_token_file.readlines() 86 | for i in range(0, len(lines)): 87 | if lines[i][0:10] != 'BeginFunc:': 88 | line = lines[i].strip() 89 | words = line.split() 90 | new_words = list(set(words)) 91 | new_line = ' '.join(new_words) 92 | all_token_file.write(new_line + '\n') 93 | 94 | 95 | def parse_args(): 96 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder") 97 | 98 | parser.add_argument('--data_path', type=str, default='./data/') 99 | parser.add_argument('--dataset', type=str, default='github_user_3/') 100 | 101 | parser.add_argument('--origin_token_file', type=str, default='origin.token.txt') 102 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 103 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 104 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 105 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 106 | 107 | parser.add_argument('--trainset_num', type=int, default=39152) 108 | parser.add_argument('--testset_num', type=int, default=2000) 109 | parser.add_argument('--token_word_num', type=int, default=10000) 110 | parser.add_argument('--token_maxlen', type=int, default=50) 111 | parser.add_argument('--testset_start_index', type=int, default=39152) 112 | 113 | 114 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 115 | 116 | return parser.parse_args() 117 | 118 | if __name__ == '__main__': 119 | args = parse_args() 120 | ''' 121 | dir_path = args.data_path + args.dataset 122 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file: 123 | lines = in_file.readlines() 124 | for i in range(0, len(lines)): 125 | if lines[i][0:10] != 'BeginFunc:': 126 | out_file.write(lines[i]) 127 | ''' 128 | remove_dup_tokens(args) 129 | 130 | #split_token_data(args) 131 | #create_token_dict_file(args) 132 | 133 | dir_path = args.data_path + args.dataset 134 | sents2indexes(dir_path+args.all_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 135 | 136 | ''' 137 | dir_path = args.data_path + args.dataset 138 | # train.token.txt -> train.token.h5(and test...) 
139 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 140 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 141 | ''' 142 | 143 | ''' 144 | dir_path = args.data_path + args.dataset 145 | all_token_file_path = dir_path + args.all_token_file 146 | with open(all_token_file_path, 'r') as all_token_file: 147 | lines = all_token_file.readlines() 148 | print(len(lines)) 149 | for i in range(0, len(lines)): 150 | line = lines[i] 151 | if line[0:10] != 'BeginFunc:': 152 | words = line.split() 153 | if len(words) == 0: 154 | print(lines[i-1]) 155 | #print(lines[i]) 156 | ''' 157 | -------------------------------------------------------------------------------- /Baseline methods/DeepCS/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | ''' 64 | import nltk 65 | try: nltk.word_tokenize("hello world") 66 | except LookupError: nltk.download('punkt') 67 | 68 | def sent2indexes(sentence, vocab, maxlen): 69 | 70 | def convert_sent(sent, vocab, maxlen): 71 | idxes = np.zeros(maxlen, dtype=np.int64) 72 | idxes.fill(PAD_ID) 73 | tokens = nltk.word_tokenize(sent.strip()) 74 | idx_len = min(len(tokens), maxlen) 
75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 76 | return idxes, idx_len 77 | if type(sentence) is list: 78 | inds, lens = [], [] 79 | for sent in sentence: 80 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 82 | inds.append(idxes) 83 | lens.append(idx_len) 84 | return np.vstack(inds), np.vstack(lens) 85 | else: 86 | inds, lens = sent2indexes([sentence], vocab, maxlen) 87 | return inds[0], lens[0] 88 | ''' 89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 90 | '''indexes: numpy array''' 91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 92 | indexes=filter(lambda i: i!=ignore_tok, indexes) 93 | toks, length = [], 0 94 | for idx in indexes: 95 | toks.append(ivocab.get(idx, '')) 96 | length+=1 97 | return ' '.join(toks), length 98 | 99 | ivocab = {v: k for k, v in vocab.items()} 100 | if indexes.ndim==1:# one sentence 101 | return revert_sent(indexes, ivocab, ignore_tok) 102 | else:# dim>1 103 | sentences, lens =[], [] # a batch of sentences 104 | for inds in indexes: 105 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 106 | sentences.append(sentence) 107 | lens.append(length) 108 | return sentences, lens 109 | 110 | ######################################################################## 111 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/util_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/util_dfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline 
methods/MMAN(TDC)/__pycache__/util_dfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_CFGEmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_every_modal': 0, # to make modal more complex? 6 | 'save_attn_weight': 0, 7 | 'use_tanh': 1, 8 | 'use_attn': 1, 9 | 'use_desc_attn': 1, 10 | 11 | # GGNN 12 | 'state_dim': 512, # GGNN hidden state size 13 | 'annotation_dim': 5, 14 | 'n_edge_types': 1, 15 | 'n_node': 150, # could be less than 512, like the maximum nodenum 16 | 'n_steps': 5, # propogation steps number of GGNN 17 | 'output_type': 'no_reduce', 18 | 'batch_size': 32, 19 | 'n_layers': 1, 20 | 'n_hidden': 512, 21 | 'cfg_attn_mode': 'sigmoid_scalar', 22 | 23 | # data_params 24 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 25 | #training data 26 | 'train_token':'train.token.h5', 27 | 'train_dfg':'train.dfg.txt', 28 | 'train_cfg':'train.cfg.txt', 29 | 'train_desc':'train.desc.h5', 30 | # test data 31 | 'test_token':'test.token.h5', 32 | 'test_dfg':'test.dfg.txt', 33 | 'test_cfg':'test.cfg.txt', 34 | 'test_desc':'test.desc.h5', 35 | #vocabulary info 36 | 'vocab_desc':'vocab.desc.json', 37 | 'vocab_token':'vocab.token.json', 38 | 39 | #parameters 40 | 'desc_len': 30, 41 | 'tok_len': 100, 42 | 'n_desc_words': 10000, # wait to decide 43 | 'n_token_words': 20000, 44 | 45 | #training_params 46 | 'nb_epoch': 200, 47 | #'optimizer': 'adam', 48 | 'learning_rate':0.0003, # try 1e-4(paper) 49 | 'adam_epsilon':1e-8, 50 | 'warmup_steps':5000, 51 | 'fp16': False, 52 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 53 | #"See details at https://nvidia.github.io/apex/amp.html" 54 | 55 | # model_params 56 | 'emb_size': 300, 57 | # recurrent 58 | 'margin': 0.6, 59 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 60 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 
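        # 'margin' and 'sim_measure' presumably parameterize the same pairwise ranking
        # objective as in DeepCS/models/jointemb.py above, i.e.
        # loss = (margin - sim(code, desc_pos) + sim(code, desc_neg)).clamp(min=1e-6).mean()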
61 | 'dropout': 0.1 62 | } 63 | return conf 64 | 65 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .cfgemb import CFGEmbeder -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | 
import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('Test Begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(config, data_path, 31 | config['test_cfg'], config['n_node'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | # encode tokens and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | # batch[0:3]: init_input, adjmat, node_mask 40 | code_batch = [tensor.to(device) for tensor in batch[:3]] 41 | # batch[3:5]: good_desc, good_desc_len 42 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 43 | with torch.no_grad(): 44 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 45 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 46 | # normalize when sim_measure=='cos' 47 | code_repr = normalize(code_repr) 48 | desc_repr = normalize(desc_repr) 49 | code_reprs.append(code_repr) 50 | desc_reprs.append(desc_repr) 51 | n_processed += batch[0].size(0) # +batch_size 52 | # code_reprs: [n_processed x n_hidden] 53 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 54 | 55 | # calculate similarity 56 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 57 | test_sim_result, test_rank_result = [], [] 58 | for i in tqdm(range(0, n_processed)): 59 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 60 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 61 | negsims = np.negative(sims) 62 | predict = np.argsort(negsims) 63 | 64 | # SuccessRate@k 65 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 66 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 67 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 68 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 69 | # MRR 70 | predict_list = predict.tolist() 71 | rank = predict_list.index(i) 72 | sum_mrr.append(1/float(rank+1)) 73 | 74 | # results need to be saved 75 | predict_20 = [int(k) for k in predict[0:20]] 76 | sim_20 = [sims[k] for k in predict_20] 77 | test_sim_result.append(zip(predict_20, sim_20)) 78 | test_rank_result.append(rank+1) 79 | 80 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 81 | save_path = args.data_path + 'result/' 82 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 83 | np.save(save_path+sim_result_filename, test_sim_result) 84 | np.save(save_path+rank_result_filename, test_rank_result) 85 | 86 | 87 | def parse_args(): 88 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 89 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 90 | parser.add_argument('--model', type=str, default='CFGEmbeder', help='model name') 91 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 92 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 93 | 
parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\ 94 | 'Note: should be consistent with the same argument in the repr_code.py') 95 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 96 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 97 | return parser.parse_args() 98 | 99 | 100 | if __name__ == '__main__': 101 | args = parse_args() 102 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 103 | config = getattr(configs, 'config_'+args.model)() 104 | 105 | ##### Define model ###### 106 | logger.info('Constructing Model..') 107 | model = getattr(models, args.model)(config) # initialize the model 108 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 109 | model.load_state_dict(torch.load(ckpt, map_location=device)) 110 | 111 | test(config, model, device) 112 | 113 | 114 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | from utils import UNK_ID 7 | 8 | def make_shuffle_index_num(args, all_num): 9 | index = np.arange(all_num) 10 | np.random.seed(16) 11 | np.random.shuffle(index) 12 | print('index:\n', index) 13 | np.save(args.shuffle_index_file, index) 14 | 15 | def make_shuffle_index(args): 16 | dir_path = args.data_path + args.dataset 17 | all_desc_file_path = dir_path + args.all_desc_file 18 | with open(all_desc_file_path, 'r') as all_desc_file: 19 | lines = all_desc_file.readlines() 20 | all_num = int(len(lines)/2) 21 | print('all_num of desc:\n', all_num) 22 | 23 | index = np.arange(all_num) 24 | np.random.seed(16) 25 | np.random.shuffle(index) 26 | print('index:\n', index) 27 | np.save(args.shuffle_index_file, index) 28 | 29 | def split_data(args): 30 | index = np.load(args.shuffle_index_file) 31 | 32 | dir_path = args.data_path + args.dataset 33 | all_desc_file_path = dir_path + args.all_desc_file 34 | train_desc_file_path = dir_path + args.train_desc_file 35 | test_desc_file_path = dir_path + args.test_desc_file 36 | 37 | input_desc = [] 38 | with open(all_desc_file_path, 'r') as all_desc_file: 39 | lines = all_desc_file.readlines() 40 | for line in lines: 41 | if (line[0:10] != 'BeginFunc:'): 42 | input_desc.append(line) 43 | print('number of input desc:\n', len(input_desc)) 44 | 45 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 46 | for i in range(0, args.trainset_num): 47 | train_desc_file.write(input_desc[index[i]]) 48 | for i in range(32000, 32000+args.testset_num): 49 | test_desc_file.write(input_desc[index[i]]) 50 | 51 | 52 | def create_dict_file(args): 53 | dir_path = args.data_path + args.dataset 54 | desc_file_path = dir_path + args.train_desc_file 55 | 56 | input_desc = [] 57 | with open(desc_file_path, 'r') as desc_file: 58 | input_desc = desc_file.readlines() 59 | desc_words = [] 60 | for i in range(0, len(input_desc)): 61 | input_desc[i] = input_desc[i].rstrip('\n') 62 | desc_word_list = input_desc[i].split() 63 | for desc_word in desc_word_list: 64 | desc_words.append(desc_word) 65 | vocab_desc_info = Counter(desc_words) 66 | print(len(vocab_desc_info)) 67 | 68 | vocab_desc = [item[0] for item in 
vocab_desc_info.most_common()[:args.desc_word_num-2]] 69 | vocab_desc_index = {'':0, '':1} 70 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 71 | 72 | 73 | vocab_desc_file_path = dir_path + args.vocab_desc_file 74 | desc_dic_str = json.dumps(vocab_desc_index) 75 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 76 | vocab_desc_file.write(desc_dic_str) 77 | 78 | 79 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 80 | phrases, indices = [], [] 81 | with open(sent_file_path, 'r') as sent_file: 82 | sents = sent_file.readlines() 83 | vocab = json.loads(open(vocab_file_path, "r").readline()) 84 | start_index = 0 85 | for i in range(0, len(sents)): 86 | sent = sents[i].rstrip('\n') 87 | word_list = sent.split() 88 | sent_len = min(len(word_list), maxlen) 89 | indices.append((sent_len, start_index)) 90 | for j in range(0, sent_len): 91 | word = word_list[j] 92 | phrases.append(vocab.get(word, UNK_ID)) 93 | start_index += sent_len 94 | output_file_path = sent_file_path[0:-3] + 'h5' 95 | output_file = h5py.File(output_file_path, 'w') 96 | output_file['phrases'] = phrases 97 | output_file['indices'] = indices 98 | output_file.close() 99 | 100 | 101 | def parse_args(): 102 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 103 | 104 | parser.add_argument('--data_path', type=str, default='./data/') 105 | parser.add_argument('--dataset', type=str, default='github/') 106 | 107 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 108 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 109 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 110 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 111 | 112 | parser.add_argument('--trainset_num', type=int, default=32000) 113 | parser.add_argument('--testset_num', type=int, default=1000) 114 | parser.add_argument('--desc_word_num', type=int, default=10000) 115 | parser.add_argument('--desc_maxlen', type=int, default=30) 116 | 117 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 118 | 119 | return parser.parse_args() 120 | 121 | if __name__ == '__main__': 122 | args = parse_args() 123 | 124 | #make_shuffle_index_num(args, 33000) 125 | #split_data(args) 126 | #create_dict_file(args) 127 | 128 | dir_path = args.data_path + args.dataset 129 | # train.desc.txt -> train.desc.h5(and test...) 
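# Input format note: all.desc.txt appears to interleave one 'BeginFunc:' marker line
# with one description line per function, which is why make_shuffle_index above counts
# int(len(lines)/2) descriptions and why split_data drops every line that starts with
# 'BeginFunc:' before applying the shuffled index.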
130 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 131 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 132 | 133 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 59 | phrases, indices = [], [] 60 | with open(sent_file_path, 'r') as sent_file: 61 | sents = sent_file.readlines() 62 | vocab = json.loads(open(vocab_file_path, "r").readline()) 63 | start_index = 0 64 | for i in range(0, len(sents)): 65 | sent = sents[i].rstrip('\n') 66 | word_list = sent.split() 67 | sent_len = min(len(word_list), maxlen) 68 | indices.append((sent_len, start_index)) 69 | for j in range(0, sent_len): 70 | word = word_list[j] 71 | phrases.append(vocab.get(word, UNK_ID)) 72 | start_index += sent_len 73 | output_file_path = sent_file_path[0:-3] + 'h5' 74 | output_file = h5py.File(output_file_path, 'w') 75 | output_file['phrases'] = phrases 76 | output_file['indices'] = indices 77 | output_file.close() 78 | 79 | def parse_args(): 80 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder") 81 | 82 | parser.add_argument('--data_path', type=str, default='./data/') 83 | 
parser.add_argument('--dataset', type=str, default='github11/') 84 | 85 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 86 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 87 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 88 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 89 | 90 | parser.add_argument('--trainset_num', type=int, default=39152) 91 | parser.add_argument('--testset_num', type=int, default=2000) 92 | parser.add_argument('--token_word_num', type=int, default=20000) 93 | parser.add_argument('--token_maxlen', type=int, default=100) 94 | parser.add_argument('--testset_start_index', type=int, default=39152) 95 | 96 | 97 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 98 | 99 | return parser.parse_args() 100 | 101 | if __name__ == '__main__': 102 | args = parse_args() 103 | ''' 104 | dir_path = args.data_path + args.dataset 105 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file: 106 | lines = in_file.readlines() 107 | for i in range(0, len(lines)): 108 | if lines[i][0:10] != 'BeginFunc:': 109 | out_file.write(lines[i]) 110 | ''' 111 | 112 | split_token_data(args) 113 | create_token_dict_file(args) 114 | 115 | dir_path = args.data_path + args.dataset 116 | # train.token.txt -> train.token.h5(and test...) 117 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 118 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 119 | 120 | 121 | ''' 122 | dir_path = args.data_path + args.dataset 123 | all_token_file_path = dir_path + args.all_token_file 124 | with open(all_token_file_path, 'r') as all_token_file: 125 | lines = all_token_file.readlines() 126 | print(len(lines)) 127 | for i in range(0, len(lines)): 128 | line = lines[i] 129 | if line[0:10] != 'BeginFunc:': 130 | words = line.split() 131 | if len(words) == 0: 132 | print(lines[i-1]) 133 | #print(lines[i]) 134 | ''' 135 | -------------------------------------------------------------------------------- /Baseline methods/MMAN(TDC)/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = 
normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | 64 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 65 | '''indexes: numpy array''' 66 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 67 | indexes=filter(lambda i: i!=ignore_tok, indexes) 68 | toks, length = [], 0 69 | for idx in indexes: 70 | toks.append(ivocab.get(idx, '<unk>')) 71 | length+=1 72 | return ' '.join(toks), length 73 | 74 | ivocab = {v: k for k, v in vocab.items()} 75 | if indexes.ndim==1:# one sentence 76 | return revert_sent(indexes, ivocab, ignore_tok) 77 | else:# dim>1 78 | sentences, lens =[], [] # a batch of sentences 79 | for inds in indexes: 80 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 81 | sentences.append(sentence) 82 | lens.append(length) 83 | return sentences, lens 84 | 85 | ######################################################################## 86 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/util_cfg.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_MultiEmbeder(): 3 | conf = { 4 | # GGNN 5 | 'state_dim': 512, # GGNN hidden state size 6 | 'annotation_dim': 5, 7 | 'n_edge_types': 2, 8 | 'n_node': 200, # could be less than 512, like the maximum nodenum 9 | 'n_steps': 5, # propogation steps number of GGNN 10 | 'output_type': 'no_reduce', 11 | 'batch_size': 32, 12 | 'n_layers': 1, 13 | 'n_hidden': 512, 14 | 'cfg_attn_mode': 'sigmoid_scalar', 15 | 16 | # TreeLSTM 17 | 'treelstm_cell_type': 'nary', # nary or childsum 18 | 'n_ast_words': 50000, 19 | 20 | # Token and Description 21 | 'desc_len': 30, 22 | 'tok_len': 100, 23 | 'n_desc_words': 10000, 24 | 'n_token_words': 25000, 25 | 26 | # data_params 27 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 28 | # training data 29 | 'train_token':'train.token.h5', 30 | 'train_ast':'train.ast.json', 31 | 'train_cfg':'train.cfg.txt', 32 | 'train_desc':'train.desc.h5', 33 | # test data 34 | 'test_token':'test.token.h5', 35 | 'test_ast':'test.ast.json', 36 | 'test_cfg':'test.cfg.txt', 37 | 'test_desc':'test.desc.h5', 38 | # vocabulary info 39 | 'vocab_token':'vocab.token.json', 40 | 'vocab_ast':'vocab.ast.json', 41 | 'vocab_desc':'vocab.desc.json', 42 | 43 | # model_params 44 | 'emb_size': 300, 45 | # recurrent 46 | 'margin': 0.6, 47 | 'sim_measure':'cos', 48 | 'dropout': 0.1, 49 | 50 | 51 | # training_params 52 | 'nb_epoch': 200, 53 | #'optimizer': 'adamW', 54 | 'learning_rate':0.0003, # try 1e-4(paper) 55 | 'adam_epsilon':1e-8, 56 | 'warmup_steps':5000, 57 | 'fp16': False, 58 | 'fp16_opt_level': 'O1' #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
59 | #"See details at https://nvidia.github.io/apex/amp.html" 60 | 61 | 62 | } 63 | return conf 64 | 65 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__init__.py -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_ast.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_ast.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_desc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_desc.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/util_tok.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_tok.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 
| def make_shuffle_index(args): 10 | dir_path = args.data_path + args.dataset 11 | all_desc_file_path = dir_path + args.all_desc_file 12 | with open(all_desc_file_path, 'r') as all_desc_file: 13 | lines = all_desc_file.readlines() 14 | all_num = int(len(lines)/2) 15 | 16 | index = np.arange(all_num) 17 | np.random.seed(16) 18 | np.random.shuffle(index) 19 | #print(index) 20 | np.save(args.shuffle_index_file, index) 21 | 22 | def split_desc_data(args): 23 | index = np.load(args.shuffle_index_file) 24 | 25 | dir_path = args.data_path + args.dataset 26 | all_desc_file_path = dir_path + args.all_desc_file 27 | train_desc_file_path = dir_path + args.train_desc_file 28 | test_desc_file_path = dir_path + args.test_desc_file 29 | 30 | input_desc = [] 31 | with open(all_desc_file_path, 'r') as all_desc_file: 32 | lines = all_desc_file.readlines() 33 | for line in lines: 34 | if (line[0:10] != 'BeginFunc:'): 35 | input_desc.append(line) 36 | print('number of input desc:\n', len(input_desc)) 37 | 38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 39 | for i in range(0, args.trainset_num): 40 | train_desc_file.write(input_desc[index[i]]) 41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 42 | test_desc_file.write(input_desc[index[i]]) 43 | 44 | 45 | def create_desc_dict_file(args): 46 | dir_path = args.data_path + args.dataset 47 | desc_file_path = dir_path + args.train_desc_file 48 | 49 | input_desc = [] 50 | with open(desc_file_path, 'r') as desc_file: 51 | input_desc = desc_file.readlines() 52 | desc_words = [] 53 | for i in range(0, len(input_desc)): 54 | input_desc[i] = input_desc[i].rstrip('\n') 55 | desc_word_list = input_desc[i].split() 56 | for desc_word in desc_word_list: 57 | desc_words.append(desc_word) 58 | vocab_desc_info = Counter(desc_words) 59 | print(len(vocab_desc_info)) 60 | 61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 62 | vocab_desc_index = {'':0, '':1} 63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 64 | 65 | 66 | vocab_desc_file_path = dir_path + args.vocab_desc_file 67 | desc_dic_str = json.dumps(vocab_desc_index) 68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file: 69 | vocab_desc_file.write(desc_dic_str) 70 | 71 | 72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 73 | phrases, indices = [], [] 74 | with open(sent_file_path, 'r') as sent_file: 75 | sents = sent_file.readlines() 76 | vocab = json.loads(open(vocab_file_path, "r").readline()) 77 | start_index = 0 78 | for i in range(0, len(sents)): 79 | sent = sents[i].rstrip('\n') 80 | word_list = sent.split() 81 | sent_len = min(len(word_list), maxlen) 82 | indices.append((sent_len, start_index)) 83 | for j in range(0, sent_len): 84 | word = word_list[j] 85 | phrases.append(vocab.get(word, UNK_ID)) 86 | start_index += sent_len 87 | output_file_path = sent_file_path[0:-3] + 'h5' 88 | output_file = h5py.File(output_file_path, 'w') 89 | output_file['phrases'] = phrases 90 | output_file['indices'] = indices 91 | output_file.close() 92 | 93 | ''' 94 | def parse_args(): 95 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder") 96 | 97 | parser.add_argument('--data_path', type=str, default='./data/') 98 | parser.add_argument('--dataset', type=str, default='example/') 99 | 100 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 101 | parser.add_argument('--train_desc_file', 
type=str, default='train.desc.txt') 102 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 103 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 104 | 105 | parser.add_argument('--trainset_num', type=int, default=12) 106 | parser.add_argument('--testset_num', type=int, default=1000) 107 | parser.add_argument('--desc_word_num', type=int, default=50) 108 | parser.add_argument('--desc_maxlen', type=int, default=50) 109 | parser.add_argument('--testset_start_index', type=int, default=33000) 110 | 111 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 112 | 113 | return parser.parse_args() 114 | 115 | if __name__ == '__main__': 116 | 117 | args = parse_args() 118 | 119 | #make_shuffle_index(args) 120 | #split_data(args) 121 | create_desc_dict_file(args) 122 | 123 | 124 | dir_path = args.data_path + args.dataset 125 | # train.desc.txt -> train.desc.h5(and test...) 126 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 127 | #sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 128 | ''' 129 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/data_prepare/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | 59 | ''' 60 | def 
parse_args(): 61 | parser = argparse.ArgumentParser("Parse tokenription data for CFGEmbedder") 62 | 63 | parser.add_argument('--data_path', type=str, default='./data/') 64 | parser.add_argument('--dataset', type=str, default='example/') 65 | 66 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 67 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 68 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 69 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 70 | 71 | parser.add_argument('--trainset_num', type=int, default=12) 72 | parser.add_argument('--testset_num', type=int, default=1000) 73 | parser.add_argument('--token_word_num', type=int, default=50) 74 | parser.add_argument('--token_maxlen', type=int, default=50) 75 | parser.add_argument('--testset_start_index', type=int, default=33000) 76 | 77 | 78 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 79 | 80 | return parser.parse_args() 81 | 82 | if __name__ == '__main__': 83 | args = parse_args() 84 | 85 | #make_shuffle_index(args) 86 | #split_data(args) 87 | #create_token_dict_file(args) 88 | 89 | 90 | #dir_path = args.data_path + args.dataset 91 | # train.token.txt -> train.token.h5(and test...) 92 | #sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 93 | #sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 94 | ''' 95 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .multiemb import MultiEmbeder -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/multiemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/multiemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/models/__pycache__/tokenemb.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/MMAN/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/output/MultiEmbeder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/output/MultiEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/MMAN/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/MMAN/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | #from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def normalize(data): 23 | """normalize matrix by rows""" 24 | return data/np.linalg.norm(data,axis=1,keepdims=True) 25 | 26 | def test(config, model, device): 27 | logger.info('Test Begin...') 28 | 29 | model.eval() 30 | model.to(device) 31 | 32 | # load data 33 | data_path = args.data_path+args.dataset+'/' 34 | test_set = eval(config['dataset_name'])(config, data_path, 35 | config['test_token'], config['tok_len'], 36 | config['test_ast'], config['vocab_ast'], 37 | config['test_cfg'], config['n_node'], 38 | config['test_desc'], config['desc_len']) 39 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 40 | collate_fn=batcher(device), shuffle=False, drop_last=False, num_workers=0) 41 | # encode tokens and descs 42 | code_reprs, desc_reprs = [], [] 43 | n_processed = 0 44 | for batch in data_loader: 45 | # batch[0:7]: tokens, tok_len, tree, tree_node_num, init_input, adjmat, node_mask 46 | code_batch = [tensor for tensor in batch[:7]] 47 | # batch[7:9]: good_desc, good_desc_len 48 | desc_batch = [tensor for tensor in batch[7:9]] 49 | with torch.no_grad(): 50 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 51 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 52 | # normalize when sim_measure=='cos' 53 | code_repr = normalize(code_repr) 54 | desc_repr = normalize(desc_repr) 55 | code_reprs.append(code_repr) 56 | 
desc_reprs.append(desc_repr) 57 | n_processed += batch[0].size(0) # +batch_size 58 | # code_reprs: [n_processed x n_hidden] 59 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 60 | 61 | # calculate similarity 62 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 63 | test_sim_result, test_rank_result = [], [] 64 | for i in tqdm(range(0, n_processed)): 65 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 66 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 67 | negsims = np.negative(sims) 68 | predict = np.argsort(negsims) 69 | 70 | # SuccessRate@k 71 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 72 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 73 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 74 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 75 | # MRR 76 | predict_list = predict.tolist() 77 | rank = predict_list.index(i) 78 | sum_mrr.append(1/float(rank+1)) 79 | 80 | # results need to be saved 81 | predict_20 = [int(k) for k in predict[0:20]] 82 | sim_20 = [sims[k] for k in predict_20] 83 | test_sim_result.append(zip(predict_20, sim_20)) 84 | test_rank_result.append(rank+1) 85 | 86 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 87 | save_path = args.data_path + 'result/' 88 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 89 | np.save(save_path+sim_result_filename, test_sim_result) 90 | np.save(save_path+rank_result_filename, test_rank_result) 91 | 92 | 93 | def parse_args(): 94 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 95 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 96 | parser.add_argument('--model', type=str, default='MultiEmbeder', help='model name') 97 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 98 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 99 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. 
'\ 100 | 'Note: should be consistent with the same argument in the repr_code.py') 101 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 102 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 103 | return parser.parse_args() 104 | 105 | 106 | if __name__ == '__main__': 107 | args = parse_args() 108 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 109 | config = getattr(configs, 'config_'+args.model)() 110 | 111 | ##### Define model ###### 112 | logger.info('Constructing Model..') 113 | model = getattr(models, args.model)(config) # initialize the model 114 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 115 | model.load_state_dict(torch.load(ckpt, map_location=device)) 116 | 117 | test(config, model, device) 118 | 119 | 120 | -------------------------------------------------------------------------------- /Baseline methods/MMAN/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 3 | import sys 4 | import random 5 | import time 6 | from datetime import datetime 7 | import numpy as np 8 | import math 9 | import argparse 10 | random.seed(42) 11 | from tqdm import tqdm 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | logging.basicConfig(level=logging.INFO, format="%(message)s") 16 | 17 | import torch 18 | 19 | import models, configs 20 | from modules import get_cosine_schedule_with_warmup 21 | from data_loader import * 22 | 23 | 24 | def train(args): 25 | fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt") 26 | # create file handler which logs even debug messages 27 | logger.addHandler(fh)# add the handlers to the logger 28 | timestamp = datetime.now().strftime('%Y%m%d%H%M') 29 | 30 | random.seed(args.seed) 31 | np.random.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | torch.cuda.manual_seed(args.seed) 34 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 35 | 36 | def save_model(model, epoch): 37 | torch.save(model.state_dict(), f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5') 38 | 39 | def load_model(model, epoch, to_device): 40 | assert os.path.exists(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'), f'Weights at epoch {epoch} not found' 41 | model.load_state_dict(torch.load(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5', map_location=to_device)) 42 | 43 | config = getattr(configs, 'config_'+args.model)() 44 | print(config) 45 | 46 | # load data 47 | data_path = args.data_path+args.dataset+'/' 48 | train_set = eval(config['dataset_name'])(config, data_path, 49 | config['train_token'], config['tok_len'], 50 | config['train_ast'], config['vocab_ast'], 51 | config['train_cfg'], config['n_node'], 52 | config['train_desc'], config['desc_len']) 53 | 54 | data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'], 55 | collate_fn=batcher(device), shuffle=True, drop_last=False, num_workers=0) 56 | 57 | # define the models 58 | logger.info('Constructing Model..') 59 | model = getattr(models, args.model)(config) #initialize the model 60 | if args.reload_from>0: 61 | load_model(model, args.reload_from, device) 62 | logger.info('done') 63 | model.to(device) 64 | 65 | no_decay = ['bias', 'LayerNorm.weight'] 66 | optimizer_grouped_parameters = [ 67 | {'params': [p for n, p in 
model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 68 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 69 | ] 70 | optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=config['adam_epsilon']) 71 | scheduler = get_cosine_schedule_with_warmup( 72 | optimizer, num_warmup_steps=config['warmup_steps'], 73 | num_training_steps=len(data_loader)*config['nb_epoch']) # do not forget to modify the number when dataset is changed 74 | 75 | print('---model parameters---') 76 | num_params = 0 77 | for param in model.parameters(): 78 | num_params += param.numel() 79 | print(num_params / 1e6) 80 | 81 | n_iters = len(data_loader) 82 | itr_global = args.reload_from+1 83 | for epoch in range(int(args.reload_from)+1, config['nb_epoch']+1): 84 | itr_start_time = time.time() 85 | losses=[] 86 | for batch in data_loader: 87 | 88 | model.train() 89 | batch_gpu = [tensor for tensor in batch] 90 | loss = model(*batch_gpu) 91 | 92 | loss.backward() 93 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0) 94 | 95 | optimizer.step() 96 | scheduler.step() 97 | model.zero_grad() 98 | 99 | losses.append(loss.item()) 100 | 101 | if itr_global % args.log_every == 0: 102 | elapsed = time.time() - itr_start_time 103 | logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f'% 104 | (epoch, config['nb_epoch'], itr_global%n_iters, n_iters, elapsed, np.mean(losses))) 105 | 106 | losses=[] 107 | itr_start_time = time.time() 108 | itr_global = itr_global + 1 109 | 110 | # save every epoch 111 | if epoch >= 90: 112 | if epoch % 5 == 0: 113 | save_model(model, epoch) 114 | 115 | 116 | def parse_args(): 117 | parser = argparse.ArgumentParser("Train and Validate The Code Search (Embedding) Model") 118 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 119 | parser.add_argument('--model', type=str, default='MultiEmbeder', help='model name') 120 | parser.add_argument('--dataset', type=str, default='github', help='name of dataset.java, python') 121 | parser.add_argument('--reload_from', type=int, default=-1, help='epoch to reload from') 122 | 123 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 124 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 125 | # Training Arguments 126 | parser.add_argument('--log_every', type=int, default=50, help='interval to log autoencoder training results') 127 | parser.add_argument('--seed', type=int, default=1111, help='random seed') 128 | 129 | 130 | return parser.parse_args() 131 | 132 | if __name__ == '__main__': 133 | args = parse_args() 134 | 135 | # make output directory if it doesn't already exist 136 | os.makedirs(f'./output/{args.model}/{args.dataset}/models', exist_ok=True) 137 | os.makedirs(f'./output/{args.model}/{args.dataset}/tmp_results', exist_ok=True) 138 | 139 | torch.backends.cudnn.benchmark = True # speed up training by using cudnn 140 | torch.backends.cudnn.deterministic = True # fix the random seed in cudnn 141 | 142 | train(args) 143 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/.DS_Store 
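Note on the MMAN training objective: `train.py` above only calls `loss = model(*batch_gpu)`; the loss itself is defined inside `models/multiemb.py` (the `MultiEmbeder` model). Given the `config_MultiEmbeder` settings (`'margin': 0.6`, `'sim_measure': 'cos'`) and the (code, good description, random negative description) triples produced by the data loaders, the usual formulation for this kind of joint embedding is a cosine margin ranking loss. The sketch below only illustrates that formulation with dummy tensors; the function name `margin_ranking_loss` and the 512-dimensional embeddings are assumptions, and this is not the repository's implementation.
```
import torch
import torch.nn.functional as F

def margin_ranking_loss(code_repr, good_desc_repr, bad_desc_repr, margin=0.6):
    # Illustrative sketch (not the repository's code): a hinge loss that pushes
    # the paired description closer to the code embedding than a random negative.
    good_sim = F.cosine_similarity(code_repr, good_desc_repr)  # [batch]
    bad_sim = F.cosine_similarity(code_repr, bad_desc_repr)    # [batch]
    return (margin - good_sim + bad_sim).clamp(min=1e-6).mean()

# Toy usage with the configs' hidden size of 512 and batch size 32.
code = torch.randn(32, 512)
good_desc = torch.randn(32, 512)
bad_desc = torch.randn(32, 512)
print(margin_ranking_loss(code, good_desc, bad_desc).item())
```
At test time both encoders' outputs are L2-normalised (`normalize` in `test.py`), so ranking candidates by the plain dot product in `test()` is equivalent to ranking by cosine similarity.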
-------------------------------------------------------------------------------- /Baseline methods/Tok-Att/Tok-Att.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../CFG-Att" 5 | }, 6 | { 7 | "path": "../AST-Att" 8 | }, 9 | { 10 | "path": "." 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/configs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/configs.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_TokenEmbeder(): 3 | conf = { 4 | # added_params 5 | 'gpu': 1, 6 | 'transform_every_modal': 0, # to make modal more complex? 7 | 'save_attn_weight': 0, 8 | 'use_tanh': 1, 9 | 'use_attn': 1, 10 | 11 | # data_params 12 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 13 | #training data 14 | 'train_tokens':'train.token.h5', 15 | 'train_desc':'train.desc.h5', 16 | #valid data 17 | 'valid_tokens':'valid.token.h5', 18 | 'valid_desc':'valid.desc.h5', 19 | # test data 20 | 'test_tokens':'test.token.h5', 21 | 'test_desc':'test.desc.h5', 22 | 23 | #parameters 24 | 'tokens_len':50, 25 | 'desc_len': 30, 26 | 'n_token_words': 20000, # len(vocabulary) + 1 27 | 'n_desc_words': 12000, # wait to decide 28 | #vocabulary info 29 | 'vocab_tokens':'vocab.token.json', 30 | 'vocab_desc':'vocab.desc.json', 31 | 32 | #training_params 33 | 'batch_size': 32, 34 | 'chunk_size': 200000, 35 | 'nb_epoch': 200, 36 | #'optimizer': 'adam', 37 | 'learning_rate':0.0003, # try 1e-4(paper) 38 | 'adam_epsilon':1e-8, 39 | 'warmup_steps':5000, 40 | 'fp16': False, 41 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
42 | #"See details at https://nvidia.github.io/apex/amp.html" 43 | 44 | # model_params 45 | 'emb_size': 300, 46 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 47 | # recurrent 48 | 'margin': 0.6, 49 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 50 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 51 | 'dropout':0.1 52 | } 53 | return conf 54 | 55 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/data/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | from utils import PAD_ID, UNK_ID, indexes2sent 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 14 | logging.basicConfig(level=logging.INFO, format="%(message)s") 15 | 16 | 17 | class CodeSearchDataset(data.Dataset): 18 | """ 19 | Dataset that has only positive samples. 20 | """ 21 | def __init__(self, data_dir, f_tokens, max_tok_len, f_descs=None, max_desc_len=None): 22 | self.max_tok_len = max_tok_len 23 | self.max_desc_len = max_desc_len 24 | # initialize file path or list of file names 25 | """read training data(list of int arrays) from a hdf5 file""" 26 | self.training = False 27 | print("loading data...") 28 | table_tokens = tables.open_file(data_dir+f_tokens) 29 | self.tokens = table_tokens.get_node('/phrases')[:].astype(np.long) 30 | self.idx_tokens = table_tokens.get_node('/indices')[:] 31 | if f_descs is not None: 32 | self.training=True 33 | table_desc = tables.open_file(data_dir+f_descs) 34 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 35 | self.idx_descs = table_desc.get_node('/indices')[:] 36 | 37 | if f_descs is not None: 38 | assert self.idx_tokens.shape[0]==self.idx_descs.shape[0] 39 | self.data_len = self.idx_tokens.shape[0] 40 | print("{} entries".format(self.data_len)) 41 | 42 | def pad_seq(self, seq, maxlen): 43 | if len(seq) < maxlen: 44 | # !!!!! numpy appending is slow. 
Try to optimize the padding 45 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq))) 46 | seq = seq[:maxlen] 47 | return seq 48 | 49 | def __getitem__(self, offset): 50 | len, pos = self.idx_tokens[offset][0], self.idx_tokens[offset][1] 51 | tok_len = min(int(len), self.max_tok_len) 52 | tokens = self.tokens[pos:pos+tok_len] 53 | tokens = self.pad_seq(tokens, self.max_tok_len) 54 | 55 | if self.training: 56 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1] 57 | good_desc_len = min(int(len), self.max_desc_len) 58 | good_desc = self.descs[pos:pos+good_desc_len] 59 | good_desc = self.pad_seq(good_desc, self.max_desc_len) 60 | 61 | rand_offset = random.randint(0, self.data_len-1) 62 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1] 63 | bad_desc_len=min(int(len), self.max_desc_len) 64 | bad_desc = self.descs[pos:pos+bad_desc_len] 65 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len) 66 | 67 | return tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len 68 | return tokens, tok_len, good_desc, good_desc_len 69 | 70 | def __len__(self): 71 | return self.data_len 72 | 73 | 74 | def load_dict(filename): 75 | return json.loads(open(filename, "r").readline()) 76 | #return pickle.load(open(filename, 'rb')) 77 | 78 | def load_vecs(fin): 79 | """read vectors (2D numpy array) from a hdf5 file""" 80 | h5f = tables.open_file(fin) 81 | h5vecs= h5f.root.vecs 82 | 83 | vecs=np.zeros(shape=h5vecs.shape,dtype=h5vecs.dtype) 84 | vecs[:]=h5vecs[:] 85 | h5f.close() 86 | return vecs 87 | 88 | def save_vecs(vecs, fout): 89 | fvec = tables.open_file(fout, 'w') 90 | atom = tables.Atom.from_dtype(vecs.dtype) 91 | filters = tables.Filters(complib='blosc', complevel=5) 92 | ds = fvec.create_carray(fvec.root,'vecs', atom, vecs.shape,filters=filters) 93 | ds[:] = vecs 94 | print('done') 95 | fvec.close() 96 | 97 | if __name__ == '__main__': 98 | input_dir = './data/github/' 99 | train_set = CodeSearchDataset(input_dir, 'train.token.h5', 60, 'train.desc.h5', 30) 100 | train_data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=32, shuffle=False, num_workers=1) 101 | logger.info('hello') 102 | #print(len(train_data_loader)) 103 | ''' 104 | use_set = CodeSearchDataset(input_dir, 'use.tokens.h5', 30) 105 | use_data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1, shuffle=False, num_workers=1) 106 | #print(len(use_data_loader)) 107 | vocab_tokens = load_dict(input_dir+'vocab.tokens.json') 108 | vocab_desc = load_dict(input_dir+'vocab.desc.json') 109 | ''' 110 | print('============ Train Data ================') 111 | k=0 112 | for batch in train_data_loader: 113 | print("batch[0].size(0): ", batch[0].size(0)) 114 | batch = tuple([t.numpy() for t in batch]) 115 | tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len = batch 116 | k+=1 117 | if k>0: break 118 | print('-------------------------------') 119 | #print(indexes2sent(tokens, vocab_tokens)) 120 | #print(indexes2sent(good_desc, vocab_desc)) 121 | 122 | 123 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenemb import TokenEmbeder -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 9 | from torch import optim 10 | import torch.nn.functional as F 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class SeqEncoder(nn.Module): 17 | def __init__(self, vocab_size, emb_size, hidden_size, n_layers=1): 18 | super(SeqEncoder, self).__init__() 19 | self.emb_size = emb_size 20 | self.hidden_size = hidden_size 21 | self.n_layers = n_layers 22 | 23 | self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0) 24 | 25 | self.init_xavier_linear(self.embedding, init_bias=False) 26 | 27 | self.lstm = nn.LSTM(emb_size, hidden_size, dropout=0.1, batch_first=True, bidirectional=False) 28 | 29 | def init_xavier_linear(self, linear, init_bias=True, gain=1, init_normal_std=1e-4): 30 | torch.nn.init.xavier_uniform_(linear.weight, gain) 31 | if init_bias: 32 | if linear.bias is not None: 33 | linear.bias.data.normal_(std=init_normal_std) 34 | 35 | def init_hidden(self, batch_size): 36 | weight = next(self.parameters()).data 37 | return (weight.new(self.n_layers, batch_size, self.hidden_size).zero_().requires_grad_(), # rnn_type == 'LSTM' 38 | weight.new(self.n_layers, batch_size, self.hidden_size).zero_().requires_grad_()) 39 | 40 | 41 | def forward(self, inputs, input_lens=None, hidden=None): 42 | batch_size, seq_len = inputs.size() 43 | inputs = self.embedding(inputs) # input: [batch_sz x seq_len] embedded: [batch_sz x seq_len x emb_sz] 44 | #inputs = F.dropout(inputs, 0.1, self.training) # mark. 45 | 46 | if input_lens is not None:# sort and pack sequence 47 | input_lens_sorted, indices = input_lens.sort(descending=True) 48 | inputs_sorted = inputs.index_select(0, indices) 49 | inputs = pack_padded_sequence(inputs_sorted, input_lens_sorted.data.tolist(), batch_first=True) 50 | 51 | hids, (h_n, c_n) = self.lstm(inputs, hidden) # hids:[b x seq x hid_sz*2](biRNN) 52 | 53 | if input_lens is not None: # reorder and pad 54 | _, inv_indices = indices.sort() 55 | hids, lens = pad_packed_sequence(hids, batch_first=True) 56 | #hids = F.dropout(hids, p=0.1, training=self.training) # mark. 
57 | hids = hids.index_select(0, inv_indices) # [batch_sz x seq_len x hid_sz] 58 | h_n = h_n.index_select(1, inv_indices) 59 | c_n = c_n.index_select(1, inv_indices) 60 | 61 | h_n = h_n[0] # [batch_sz x hid_sz] n_layers==1 and n_dirs==1 62 | c_n = c_n[0] 63 | 64 | return hids, (h_n, c_n) 65 | 66 | 67 | from torch.optim.lr_scheduler import LambdaLR 68 | 69 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): 70 | """ Create a schedule with a learning rate that decreases following the 71 | values of the cosine function between 0 and `pi * cycles` after a warmup 72 | period during which it increases linearly between 0 and 1. 73 | """ 74 | def lr_lambda(current_step): 75 | if current_step < num_warmup_steps: 76 | return float(current_step) / float(max(1, num_warmup_steps)) 77 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 78 | return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress))) 79 | 80 | return LambdaLR(optimizer, lr_lambda, last_epoch) 81 | 82 | 83 | def get_word_weights(vocab_size, padding_idx=0): 84 | '''contruct a word weighting table ''' 85 | def cal_weight(word_idx): 86 | return 1-math.exp(-word_idx) 87 | weight_table = np.array([cal_weight(w) for w in range(vocab_size)]) 88 | if padding_idx is not None: 89 | weight_table[padding_idx] = 0. # zero vector for padding dimension 90 | return torch.FloatTensor(weight_table) 91 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/output/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/output/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/output/TokenEmbeder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/output/TokenEmbeder/.DS_Store -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/shuffle_index.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/shuffle_index.npy -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['KMP_DUPLICATE_LIB_OK']='True' 3 | import sys 4 | import traceback 5 | import numpy as np 6 | import argparse 7 | import threading 8 | import codecs 9 | import logging 10 | from tqdm import tqdm 11 | logger = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.INFO, format="%(message)s") 13 | 14 | import torch 15 | 16 | import models, configs, data_loader 17 | from modules import get_cosine_schedule_with_warmup 18 | from utils import similarity, normalize 19 | from data_loader import * 20 | 21 | 22 | def test(config, model, device): 23 | logger.info('test begin...') 24 | 25 | model.eval() 26 | model.to(device) 27 | 28 | # load data 29 | data_path = args.data_path+args.dataset+'/' 30 | test_set = eval(config['dataset_name'])(data_path, 31 | 
config['test_tokens'], config['tokens_len'], 32 | config['test_desc'], config['desc_len']) 33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 34 | shuffle=False, drop_last=False, num_workers=1) 35 | # encode tokens and descs 36 | code_reprs, desc_reprs = [], [] 37 | n_processed = 0 38 | for batch in data_loader: 39 | code_batch = [tensor.to(device) for tensor in batch[:2]] 40 | desc_batch = [tensor.to(device) for tensor in batch[2:4]] 41 | with torch.no_grad(): 42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 43 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 44 | # normalize when sim_measure=='cos' 45 | code_repr = normalize(code_repr) 46 | desc_repr = normalize(desc_repr) 47 | code_reprs.append(code_repr) 48 | desc_reprs.append(desc_repr) 49 | n_processed += batch[0].size(0) # +batch_size 50 | # code_reprs: [n_processed x n_hidden] 51 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 52 | 53 | # calculate similarity 54 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 55 | test_sim_result, test_rank_result = [], [] 56 | for i in tqdm(range(0, n_processed)): 57 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 58 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 59 | negsims = np.negative(sims) 60 | predict = np.argsort(negsims) 61 | 62 | # SuccessRate@k 63 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 64 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 65 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 66 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 67 | # MRR 68 | predict_list = predict.tolist() 69 | rank = predict_list.index(i) 70 | sum_mrr.append(1/float(rank+1)) 71 | 72 | # results need to be saved 73 | predict_20 = [int(k) for k in predict[0:20]] 74 | sim_20 = [sims[k] for k in predict_20] 75 | test_sim_result.append(zip(predict_20, sim_20)) 76 | test_rank_result.append(rank+1) 77 | 78 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}') 79 | save_path = args.data_path + 'result/' 80 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 81 | np.save(save_path+sim_result_filename, test_sim_result) 82 | np.save(save_path+rank_result_filename, test_rank_result) 83 | 84 | 85 | def parse_args(): 86 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 87 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 88 | parser.add_argument('--model', type=str, default='TokenEmbeder', help='model name') 89 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 90 | parser.add_argument('--reload_from', type=int, default=200, help='epoch to reload from') 91 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. 
'\ 92 | 'Note: should be consistent with the same argument in the repr_code.py') 93 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 94 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 95 | return parser.parse_args() 96 | 97 | 98 | if __name__ == '__main__': 99 | args = parse_args() 100 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 101 | config = getattr(configs, 'config_'+args.model)() 102 | 103 | ##### Define model ###### 104 | logger.info('Constructing Model..') 105 | model = getattr(models, args.model)(config) # initialize the model 106 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 107 | model.load_state_dict(torch.load(ckpt, map_location=device)) 108 | 109 | test(config, model, device) 110 | 111 | 112 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/util_desc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def make_shuffle_index(args): 10 | dir_path = args.data_path + args.dataset 11 | all_desc_file_path = dir_path + args.all_desc_file 12 | with open(all_desc_file_path, 'r') as all_desc_file: 13 | lines = all_desc_file.readlines() 14 | all_num = int(len(lines)/2) 15 | 16 | index = np.arange(all_num) 17 | np.random.seed(16) 18 | np.random.shuffle(index) 19 | print(len(index)) 20 | np.save(args.shuffle_index_file, index) 21 | 22 | def split_desc_data(args): 23 | index = np.load(args.shuffle_index_file) 24 | 25 | dir_path = args.data_path + args.dataset 26 | all_desc_file_path = dir_path + args.all_desc_file 27 | train_desc_file_path = dir_path + args.train_desc_file 28 | test_desc_file_path = dir_path + args.test_desc_file 29 | 30 | input_desc = [] 31 | with open(all_desc_file_path, 'r') as all_desc_file: 32 | lines = all_desc_file.readlines() 33 | for line in lines: 34 | if (line[0:10] != 'BeginFunc:'): 35 | input_desc.append(line) 36 | print('number of input desc:\n', len(input_desc)) 37 | 38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file: 39 | for i in range(0, args.trainset_num): 40 | train_desc_file.write(input_desc[index[i]]) 41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 42 | test_desc_file.write(input_desc[index[i]]) 43 | 44 | 45 | def create_desc_dict_file(args): 46 | dir_path = args.data_path + args.dataset 47 | desc_file_path = dir_path + args.train_desc_file 48 | 49 | input_desc = [] 50 | with open(desc_file_path, 'r') as desc_file: 51 | input_desc = desc_file.readlines() 52 | desc_words = [] 53 | for i in range(0, len(input_desc)): 54 | input_desc[i] = input_desc[i].rstrip('\n') 55 | desc_word_list = input_desc[i].split() 56 | for desc_word in desc_word_list: 57 | desc_words.append(desc_word) 58 | vocab_desc_info = Counter(desc_words) 59 | print(len(vocab_desc_info)) 60 | 61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]] 62 | vocab_desc_index = {'':0, '':1} 63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))])) 64 | 65 | 66 | vocab_desc_file_path = dir_path + args.vocab_desc_file 67 | desc_dic_str = json.dumps(vocab_desc_index) 68 | with open(vocab_desc_file_path, 'w') as 
vocab_desc_file: 69 | vocab_desc_file.write(desc_dic_str) 70 | 71 | 72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 73 | phrases, indices = [], [] 74 | with open(sent_file_path, 'r') as sent_file: 75 | sents = sent_file.readlines() 76 | vocab = json.loads(open(vocab_file_path, "r").readline()) 77 | start_index = 0 78 | for i in range(0, len(sents)): 79 | sent = sents[i].rstrip('\n') 80 | word_list = sent.split() 81 | sent_len = min(len(word_list), maxlen) 82 | indices.append((sent_len, start_index)) 83 | for j in range(0, sent_len): 84 | word = word_list[j] 85 | phrases.append(vocab.get(word, UNK_ID)) 86 | start_index += sent_len 87 | output_file_path = sent_file_path[0:-3] + 'h5' 88 | output_file = h5py.File(output_file_path, 'w') 89 | output_file['phrases'] = phrases 90 | output_file['indices'] = indices 91 | output_file.close() 92 | 93 | 94 | def parse_args(): 95 | parser = argparse.ArgumentParser("Parse Description data for TokenEmbedder") 96 | 97 | parser.add_argument('--data_path', type=str, default='./data/') 98 | parser.add_argument('--dataset', type=str, default='github11/') 99 | 100 | parser.add_argument('--origin_desc_file', type=str, default='origin.desc.txt') 101 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt') 102 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt') 103 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt') 104 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json') 105 | 106 | parser.add_argument('--trainset_num', type=int, default=33845) 107 | parser.add_argument('--testset_num', type=int, default=2000) 108 | parser.add_argument('--desc_word_num', type=int, default=10000) 109 | parser.add_argument('--desc_maxlen', type=int, default=30) 110 | parser.add_argument('--testset_start_index', type=int, default=33845) 111 | 112 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 113 | 114 | return parser.parse_args() 115 | 116 | if __name__ == '__main__': 117 | 118 | args = parse_args() 119 | 120 | #make_shuffle_index(args) 121 | ''' 122 | split_desc_data(args) 123 | create_desc_dict_file(args) 124 | 125 | dir_path = args.data_path + args.dataset 126 | # train.desc.txt -> train.desc.h5(and test...) 
127 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 128 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen) 129 | ''' 130 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/util_tok.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from collections import Counter 4 | import json 5 | import h5py 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def split_token_data(args): 10 | index = np.load(args.shuffle_index_file) 11 | 12 | dir_path = args.data_path + args.dataset 13 | all_token_file_path = dir_path + args.all_token_file 14 | train_token_file_path = dir_path + args.train_token_file 15 | test_token_file_path = dir_path + args.test_token_file 16 | 17 | input_token = [] 18 | with open(all_token_file_path, 'r') as all_token_file: 19 | lines = all_token_file.readlines() 20 | for line in lines: 21 | if (line[0:10] != 'BeginFunc:'): 22 | input_token.append(line) 23 | print('number of input token:\n', len(input_token)) 24 | 25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file: 26 | for i in range(0, args.trainset_num): 27 | train_token_file.write(input_token[index[i]]) 28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num): 29 | test_token_file.write(input_token[index[i]]) 30 | 31 | 32 | def create_token_dict_file(args): 33 | dir_path = args.data_path + args.dataset 34 | token_file_path = dir_path + args.train_token_file 35 | 36 | input_token = [] 37 | with open(token_file_path, 'r') as token_file: 38 | input_token = token_file.readlines() 39 | token_words = [] 40 | for i in range(0, len(input_token)): 41 | input_token[i] = input_token[i].rstrip('\n') 42 | token_word_list = input_token[i].split() 43 | for token_word in token_word_list: 44 | token_words.append(token_word) 45 | vocab_token_info = Counter(token_words) 46 | print(len(vocab_token_info)) 47 | 48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]] 49 | vocab_token_index = {'':0, '':1} 50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))])) 51 | 52 | 53 | vocab_token_file_path = dir_path + args.vocab_token_file 54 | token_dic_str = json.dumps(vocab_token_index) 55 | with open(vocab_token_file_path, 'w') as vocab_token_file: 56 | vocab_token_file.write(token_dic_str) 57 | 58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen): 59 | phrases, indices = [], [] 60 | with open(sent_file_path, 'r') as sent_file: 61 | sents = sent_file.readlines() 62 | vocab = json.loads(open(vocab_file_path, "r").readline()) 63 | start_index = 0 64 | for i in range(0, len(sents)): 65 | sent = sents[i].rstrip('\n') 66 | word_list = sent.split() 67 | sent_len = min(len(word_list), maxlen) 68 | indices.append((sent_len, start_index)) 69 | for j in range(0, sent_len): 70 | word = word_list[j] 71 | phrases.append(vocab.get(word, UNK_ID)) 72 | start_index += sent_len 73 | output_file_path = sent_file_path[0:-3] + 'h5' 74 | output_file = h5py.File(output_file_path, 'w') 75 | output_file['phrases'] = phrases 76 | output_file['indices'] = indices 77 | output_file.close() 78 | 79 | def parse_args(): 80 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder") 81 | 82 | parser.add_argument('--data_path', type=str, default='./data/') 83 | 
parser.add_argument('--dataset', type=str, default='github11/') 84 | 85 | parser.add_argument('--all_token_file', type=str, default='all.token.txt') 86 | parser.add_argument('--train_token_file', type=str, default='train.token.txt') 87 | parser.add_argument('--test_token_file', type=str, default='test.token.txt') 88 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json') 89 | 90 | parser.add_argument('--trainset_num', type=int, default=39152) 91 | parser.add_argument('--testset_num', type=int, default=2000) 92 | parser.add_argument('--token_word_num', type=int, default=25000) 93 | parser.add_argument('--token_maxlen', type=int, default=100) 94 | parser.add_argument('--testset_start_index', type=int, default=39152) 95 | 96 | 97 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy') 98 | 99 | return parser.parse_args() 100 | 101 | if __name__ == '__main__': 102 | args = parse_args() 103 | ''' 104 | dir_path = args.data_path + args.dataset 105 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file: 106 | lines = in_file.readlines() 107 | for i in range(0, len(lines)): 108 | if lines[i][0:10] != 'BeginFunc:': 109 | out_file.write(lines[i]) 110 | ''' 111 | 112 | split_token_data(args) 113 | create_token_dict_file(args) 114 | 115 | dir_path = args.data_path + args.dataset 116 | # train.token.txt -> train.token.h5(and test...) 117 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 118 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen) 119 | 120 | 121 | ''' 122 | dir_path = args.data_path + args.dataset 123 | all_token_file_path = dir_path + args.all_token_file 124 | with open(all_token_file_path, 'r') as all_token_file: 125 | lines = all_token_file.readlines() 126 | print(len(lines)) 127 | for i in range(0, len(lines)): 128 | line = lines[i] 129 | if line[0:10] != 'BeginFunc:': 130 | words = line.split() 131 | if len(words) == 0: 132 | print(lines[i-1]) 133 | #print(lines[i]) 134 | ''' 135 | -------------------------------------------------------------------------------- /Baseline methods/Tok-Att/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = 
normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | ''' 64 | import nltk 65 | try: nltk.word_tokenize("hello world") 66 | except LookupError: nltk.download('punkt') 67 | 68 | def sent2indexes(sentence, vocab, maxlen): 69 | 70 | def convert_sent(sent, vocab, maxlen): 71 | idxes = np.zeros(maxlen, dtype=np.int64) 72 | idxes.fill(PAD_ID) 73 | tokens = nltk.word_tokenize(sent.strip()) 74 | idx_len = min(len(tokens), maxlen) 75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 76 | return idxes, idx_len 77 | if type(sentence) is list: 78 | inds, lens = [], [] 79 | for sent in sentence: 80 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 82 | inds.append(idxes) 83 | lens.append(idx_len) 84 | return np.vstack(inds), np.vstack(lens) 85 | else: 86 | inds, lens = sent2indexes([sentence], vocab, maxlen) 87 | return inds[0], lens[0] 88 | ''' 89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 90 | '''indexes: numpy array''' 91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 92 | indexes=filter(lambda i: i!=ignore_tok, indexes) 93 | toks, length = [], 0 94 | for idx in indexes: 95 | toks.append(ivocab.get(idx, '')) 96 | length+=1 97 | return ' '.join(toks), length 98 | 99 | ivocab = {v: k for k, v in vocab.items()} 100 | if indexes.ndim==1:# one sentence 101 | return revert_sent(indexes, ivocab, ignore_tok) 102 | else:# dim>1 103 | sentences, lens =[], [] # a batch of sentences 104 | for inds in indexes: 105 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 106 | sentences.append(sentence) 107 | lens.append(length) 108 | return sentences, lens 109 | 110 | ######################################################################## 111 | -------------------------------------------------------------------------------- /Baseline methods/readme.md: -------------------------------------------------------------------------------- 1 | # Baseline methods 2 | ### DeepCS 3 | ``` 4 | @inproceedings{gu2018deep, 5 | title={Deep code search}, 6 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun}, 7 | booktitle={2018 IEEE/ACM 40th International Conference on Software Engineering (ICSE)}, 8 | pages={933--944}, 9 | year={2018}, 10 | organization={IEEE} 11 | } 12 | ``` 13 | ### MMAN 14 | ``` 15 | @inproceedings{wan2019multi, 16 | title={Multi-modal attention network learning for semantic source code retrieval}, 17 | author={Wan, Yao and Shu, Jingdong and Sui, Yulei and Xu, Guandong and Zhao, 
Zhou and Wu, Jian and Yu, Philip}, 18 | booktitle={2019 34th IEEE/ACM International Conference on Automated Software Engineering (ASE)}, 19 | pages={13--25}, 20 | year={2019}, 21 | organization={IEEE} 22 | } 23 | ``` 24 | ### MMAN(TDC) 25 | Exploits Token + Variable-based data flow + Variable-based control flow to perform code search tasks 26 | ### AST-Att 27 | Exploits the AST and the attention mechanism to perform code search tasks 28 | ### Tok-Att 29 | Exploits Token and the attention mechanism to perform code search tasks 30 | ### CFG-Att 31 | Exploits the CFG and the attention mechanism to perform code search tasks 32 | -------------------------------------------------------------------------------- /IR2graph/readme.md: -------------------------------------------------------------------------------- 1 | # Generate our variable-based flow graph from the input IR 2 | Given an input ".ll" file generated by LLVM from the original ".c" file, the output is the set of nodes and edges of our constructed variable-based flow graph. 3 | 4 | ## Generate the VFG 5 | ``` 6 | python vfg_construct.py 7 | ``` 8 |
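A minimal helper sketch for producing the ".ll" input before running the script above, assuming clang is installed and on PATH (this helper is an illustrative assumption and is not part of vfg_construct.py):
```
# Hypothetical helper: compile a C file to textual LLVM IR (.ll) so it can be fed
# to vfg_construct.py. Assumes clang is available on PATH.
import subprocess
import sys

def c_to_ll(c_path, ll_path):
    # -S -emit-llvm asks clang to emit human-readable LLVM IR instead of object code
    subprocess.run(["clang", "-S", "-emit-llvm", c_path, "-o", ll_path], check=True)

if __name__ == "__main__":
    # e.g. python c_to_ll.py example.c example.ll
    c_to_ll(sys.argv[1], sys.argv[2])
```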
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeGraphCS 2 | 3 | 4 | # Project Overview 5 | This project provides the datasets and source code used in our DeGraphCS model. The contents of the project are as follows: 6 | 7 | 1. Dataset 8 | 9 | 2. DeGraphCS Source Code 10 | 11 | 3. Variable-based Flow Graph Construction 12 | 13 | 4. Baseline methods 14 | 15 | 5. User Study 16 | 17 | 6. Appendix 18 | 19 | ## Dataset 20 | To help people reproduce our work, we provide raw datasets which consist of **C code snippets**, the corresponding **code comments** and the **generated IR**. 21 | 22 | The raw datasets can be accessed in [Google Drive](https://drive.google.com/file/d/1PZ9TAfsrSlXLDpOCp6-0aZQxrzlP4kBA/view?usp=sharing) 23 | 24 | To feed our model, we first generate the Variable-based Flow Graphs of 41152 methods and extract the corresponding comments. Then we split the dataset into a training set of 39152 methods and a test set of 2000 methods. All of the data is put in the `dataset/` directory. 25 | 26 | ## DeGraphCS Source Code 27 | We provide the DeGraphCS model code in the `src/` directory. 28 | 29 | ## Variable-based Flow Graph Construction 30 | To construct the Variable-based Flow Graph from LLVM IR, we provide graph construction code in the `IR2graph/` directory to help users generate the graphs. 31 | 32 | ## Baseline Methods 33 | We have reproduced other code search approaches, which are put in the `Baseline methods/` directory. 34 | 35 | ## User Study 36 | We conduct a user study to evaluate our model. 37 | 38 | The 50 queries of the user study are listed in `user study/queries.txt`. For each of the four models (UNIF, MMAN, DeepCS and DeGraphCS), we obtain the corresponding search results, which are listed in the `user study/` directory. 39 | 40 | ## Appendix 41 | 42 | # Running Our Model 43 | ## Generate Datasets and Build Dictionary 44 | Run the following command to split the comment dataset into a training set and a test set, and to build the dictionary: 45 | ``` 46 | python src/util_desc.py 47 | ``` 48 | Run the following command to split the Variable-based Flow Graph dataset into a training set and a test set, and to build the dictionary: 49 | ``` 50 | python src/util_ir.py 51 | ``` 52 | ## Train the DeGraphCS Model 53 | ``` 54 | python src/train.py 55 | ``` 56 | ## Test the DeGraphCS Model 57 | ``` 58 | python src/test.py 59 | ``` 60 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | ## Process Raw Dataset 2 | 3 | To obtain a high-quality dataset, we process the raw dataset in `/raw_dataset` as follows: 4 | 5 | 1. After deleting duplicate methods, we obtain 74489 methods out of the original 151414 methods. 6 | 7 | 2. To generate a common dataset for all models (DeGraphCS, DeepCS, MMAN and UNIF), we delete the methods for which no AST or CFG can be generated. This leaves 59725 methods. 8 | 9 | 3. To make sure the dataset is high-quality, we constrain the comments' length and quality, as well as the number of nodes in the AST, CFG and VFG (Variable-based Flow Graph). 10 | After deleting the methods which do not meet these requirements, we obtain 41152 methods in `/preprocessed_dataset`. 11 | 12 | 13 | -------------------------------------------------------------------------------- /dataset/preprocessed_dataset/origin.ir.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f601269dd0365f12862fb1100d7538a957caa999c4767835a26a8e3781bd6d67 3 | size 160986007 4 | -------------------------------------------------------------------------------- /dataset/preprocessed_dataset/readme.md: -------------------------------------------------------------------------------- 1 | ## Directory Introduction 2 | 3 | **origin.desc.txt**: the full dataset of **41152** method descriptions. 4 | 5 | **origin.ir.txt**: the full dataset of **41152** Variable-based Flow Graph representations of methods. 6 | 7 | **train.desc.txt**: the training set of **39152** method descriptions. 8 | 9 | **train.ir.txt**: the training set of **39152** Variable-based Flow Graph representations of methods. 10 | 11 | **test.desc.txt**: the test set of **2000** method descriptions. 12 | 13 | **test.ir.txt**: the test set of **2000** Variable-based Flow Graph representations of methods. 
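A quick way to verify the split sizes listed above; a minimal sketch, assuming one description per line in the train/test `*.desc.txt` files (the format produced by the splitting scripts in this repository) and that any LFS-tracked files have been pulled:
```
# Hypothetical sanity check for the file counts listed above (not part of the pipeline).
from pathlib import Path

data_dir = Path("dataset/preprocessed_dataset")
expected = {"train.desc.txt": 39152, "test.desc.txt": 2000}
for name, n_expected in expected.items():
    with (data_dir / name).open("r", encoding="utf-8") as f:
        n_lines = sum(1 for _ in f)
    print(f"{name}: {n_lines} lines (expected {n_expected})")
```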
14 | -------------------------------------------------------------------------------- /dataset/preprocessed_dataset/train.ir.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6203c767bc9854e6addb905668ad89fd73a88fa542968884b681e14e3404fac1 3 | size 146141125 4 | -------------------------------------------------------------------------------- /dataset/raw_dataset/readme.md: -------------------------------------------------------------------------------- 1 | 2 | The raw datasets can be accessed in [Google Drive](https://drive.google.com/file/d/1PZ9TAfsrSlXLDpOCp6-0aZQxrzlP4kBA/view?usp=sharing) 3 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # Core parts of DeGraphCS 2 | configs.py -- Configure the hyper-parameters of DeGraphCS 3 | 4 | data_loader.py -- Load the data in batches 5 | 6 | util_ir.py -- Preprocess the original IR to generate graphs that can be processed by the graph neural network 7 | 8 | util_desc.py -- Preprocess the comments 9 | 10 | generate_interface.py -- Generate the interfaces for the third-party libraries 11 | 12 | # Generate the interfaces to solve the compilation problem 13 | ## An example to show how the compilation problem of the IR can be solved 14 | ### Initial code snippet crawled from GitHub 15 | ``` 16 | public void range(IHypercube space, IvisitKDNode visitor){ 17 | if(root == null) return; 18 | root.getRange(space, visitor); 19 | } 20 | ``` 21 | The code above cannot be compiled for the following reasons: 22 | 1. The third-party library classes IHypercube and IvisitKDNode are missing. 23 | 2. The object root and its method getRange are missing. 24 | 25 | The missing third-party library problem can be solved by adding some empty interfaces (a Root class with a getRange method, an IHypercube class and an IvisitKDNode class), since the implementation details of the methods are not necessary. 
26 | ### After adding the interface, the example source code can be successfully compiled: 27 | ``` 28 | public class Range{ 29 | private Root root; 30 | public void range(IHypercube space, IvisitKDNode visitor){ 31 | if(root == null) return; 32 | root.getRange(space, visitor); 33 | } 34 | } 35 | class Root{ 36 | public void getRange(IHypercube space, IvisitKDNode visitor){ 37 | return; 38 | } 39 | } 40 | class IHypercube{} 41 | class IvisitKDNode{} 42 | ``` 43 | -------------------------------------------------------------------------------- /src/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_IREmbeder(): 3 | conf = { 4 | # added_params 5 | 'transform_every_modal': 0, 6 | 'use_attn': 0, 7 | 'use_tanh': 1, 8 | 'save_attn_weight': 0, 9 | 10 | # GGNN 11 | 'state_dim': 512, # GGNN hidden state size 12 | 'annotation_dim': 300, 13 | 'n_edge_types': 2, 14 | 'n_node': 160, # maximum nodenum 15 | 'n_steps': 5, # propogation steps number of GGNN 16 | 'output_type': 'no_reduce', 17 | 'batch_size': 32, 18 | 'n_layers': 1, 19 | 'n_hidden': 512, 20 | 'ir_attn_mode': 'sigmoid_scalar', 21 | 'word_split': True, 22 | 'pooling_type': 'max_pooling', # ave_pooling 23 | 'max_word_num': 5, 24 | 25 | # data_params 26 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 27 | # training data 28 | 'train_ir':'train.ir.json', 29 | 'train_desc':'train.desc.h5', 30 | # test data 31 | 'test_ir':'test.ir.json', 32 | 'test_desc':'test.desc.h5', 33 | 34 | # parameters 35 | 'desc_len': 30, 36 | 'n_desc_words': 10000, 37 | 'n_ir_words': 15000, 38 | # vocabulary info 39 | 'vocab_ir':'vocab.ir.json', 40 | 'vocab_desc':'vocab.desc.json', 41 | 42 | #training_params 43 | 'nb_epoch': 100, 44 | #'optimizer': 'adam', 45 | 'learning_rate':0.0003, # try 1e-4(paper) 46 | 'adam_epsilon':1e-8, 47 | 'warmup_steps':5000, 48 | 'fp16': False, 49 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 50 | 51 | # model_params 52 | 'emb_size': 300, 53 | # recurrent 54 | 'margin': 0.6, 55 | 'sim_measure':'cos', 56 | 'dropout': 0 57 | } 58 | return conf 59 | 60 | -------------------------------------------------------------------------------- /src/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | 11 | from utils import PAD_ID, UNK_ID, indexes2sent 12 | import configs 13 | from util_ir import get_one_ir_npy_info 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | logging.basicConfig(level=logging.INFO, format="%(message)s") 18 | 19 | 20 | class CodeSearchDataset(data.Dataset): 21 | """ 22 | Dataset that has only positive samples. 
23 | """ 24 | def __init__(self, config, data_dir, f_irs, max_node_num, f_descs=None, max_desc_len=None): 25 | 26 | self.max_node_num = max_node_num 27 | self.max_desc_len = max_desc_len 28 | 29 | self.n_edge_types = config['n_edge_types'] 30 | self.state_dim = config['state_dim'] 31 | self.max_word_num = config['max_word_num'] 32 | 33 | print("Loading Data...") 34 | 35 | self.graph_dict = json.loads(open(data_dir+f_irs, 'r').readline()) 36 | 37 | table_desc = tables.open_file(data_dir+f_descs) 38 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 39 | self.idx_descs = table_desc.get_node('/indices')[:] 40 | 41 | assert len(self.graph_dict)==self.idx_descs.shape[0] 42 | self.data_len = self.idx_descs.shape[0] 43 | print("{} entries".format(self.data_len)) 44 | 45 | def pad_seq(self, seq, maxlen): 46 | if len(seq) < maxlen: 47 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq))) 48 | seq = seq[:maxlen] 49 | return seq 50 | 51 | def __getitem__(self, offset): 52 | # anno:[n_node], adjmat:[n_node x (n_node*n_edge_types*2)], node_mask:[n_node] 53 | # node_num:[1], word_num: [n_node] 54 | anno, adjmat, node_mask= get_one_ir_npy_info(self.graph_dict[str(offset)], 55 | self.max_node_num, self.n_edge_types, self.max_word_num) 56 | 57 | anno = torch.from_numpy(anno).type(torch.LongTensor) 58 | adjmat = torch.from_numpy(adjmat).type(torch.FloatTensor) 59 | node_mask = torch.Tensor(node_mask) 60 | 61 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1] 62 | good_desc_len = min(int(len), self.max_desc_len) 63 | good_desc = self.descs[pos: pos+good_desc_len] 64 | good_desc = self.pad_seq(good_desc, self.max_desc_len) 65 | 66 | rand_offset = random.randint(0, self.data_len-1) 67 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1] 68 | bad_desc_len = min(int(len), self.max_desc_len) 69 | bad_desc = self.descs[pos: pos+bad_desc_len] 70 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len) 71 | 72 | return anno, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len 73 | 74 | def __len__(self): 75 | return self.data_len 76 | 77 | def load_dict(filename): 78 | return json.loads(open(filename, "r").readline()) 79 | #return pickle.load(open(filename, 'rb')) 80 | 81 | 82 | if __name__ == '__main__': 83 | device = 'cpu' 84 | config = getattr(configs, 'config_IREmbeder')() 85 | input_dir = './data/github1/' 86 | 87 | test_set = CodeSearchDataset(config, input_dir, 'test.ir.json', 160, 'test.desc.h5', 30) 88 | test_data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False, drop_last=False, num_workers=1) 89 | print('number of batch:\n', len(test_data_loader)) 90 | print('============ Train Data ================') 91 | k = 0 92 | 93 | for batch in test_data_loader: 94 | #print(batch) 95 | anno, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len = [tensor.to(device) for tensor in batch] 96 | #print(anno) 97 | print(adjmat) 98 | for i in range(0, 160): 99 | for j in range(0, 320): 100 | if adjmat[0][i][j] == 1: 101 | print(i, j) 102 | #print(node_num) 103 | #print(word_num) 104 | k+=1 105 | if k>0: break 106 | 107 | 108 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .iremb import IREmbeder -------------------------------------------------------------------------------- /src/models/__pycache__/__init__.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/cfgemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/cfgemb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/iremb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/iremb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/jointemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/jointemb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/tokenemb.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/tokenemb.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/iremb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | import torch.nn.functional as F 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | parentPath = os.path.abspath("..") 13 | sys.path.insert(0, parentPath) # add parent folder to path so as to import common modules 14 | from modules import GGNN, SeqEncoder 15 | 16 | class IREmbeder(nn.Module): 17 | def __init__(self, config): 18 | super(IREmbeder, self).__init__() 19 | 20 | self.conf = config 21 | 22 | self.margin = config['margin'] 23 | self.dropout = config['dropout'] 24 | 25 | self.n_desc_words = config['n_desc_words'] 26 | self.emb_size = config['emb_size'] 27 | self.n_hidden = config['n_hidden'] 28 | self.ir_attn_mode = config['ir_attn_mode'] 29 | 30 | self.ir_encoder = GGNN(self.conf) 31 | self.desc_encoder = SeqEncoder(self.n_desc_words, self.emb_size, self.n_hidden) 32 | 33 | self.linear_attn_out = nn.Sequential(nn.Linear(self.n_hidden, self.n_hidden), 34 | nn.Tanh(), 35 | nn.Linear(self.n_hidden, self.n_hidden)) 36 | 37 | if self.conf['transform_every_modal']: 38 | self.linear_single_modal = nn.Sequential(nn.Linear(self.n_hidden, self.n_hidden), 39 | nn.Tanh(), 40 | nn.Linear(self.n_hidden, self.n_hidden)) 41 | 42 | if self.conf['save_attn_weight']: 43 | self.attn_weight_torch = [] 44 | self.node_mask_torch = [] 45 | 46 | self.self_attn = nn.Linear(self.n_hidden, self.n_hidden) 47 | self.self_attn_scalar = nn.Linear(self.n_hidden, 1) 48 | 49 | 50 | def code_encoding(self, ir_init_input_batch, ir_adjmat_batch, ir_node_mask): 51 | batch_size = ir_node_mask.size()[0] 52 | 53 | # code_feat: [batch_size x n_node x state_dim] 54 | code_feat = 
self.ir_encoder(ir_init_input_batch, ir_adjmat_batch) # forward(annotation, A) 55 | 56 | node_num = code_feat.size()[1] # n_node 57 | code_feat = code_feat.reshape(-1, node_num, self.n_hidden) 58 | # mask_1forgt0: [batch_size x n_node] 59 | mask_1forgt0 = ir_node_mask.bool().reshape(-1, node_num) 60 | 61 | if self.conf['transform_every_modal']: 62 | code_feat = torch.tanh( 63 | self.linear_single_modal(F.dropout(code_feat, self.dropout, training=self.training))) 64 | 65 | code_sa_tanh = torch.tanh(self.self_attn(code_feat.reshape(-1, self.n_hidden))) # [(batch_size * n_node) x n_hidden] 66 | code_sa_tanh = F.dropout(code_sa_tanh, self.dropout, training=self.training) 67 | # code_sa_tanh: [batch_size x n_node] 68 | code_sa_tanh = self.self_attn_scalar(code_sa_tanh).reshape(-1, node_num) 69 | 70 | code_feat = code_feat.reshape(-1, node_num, self.n_hidden) 71 | batch_size = code_feat.size()[0] 72 | 73 | self_attn_code_feat = None 74 | for _i in range(batch_size): 75 | # code_sa_tanh_one: [1 x real_node_num] 76 | code_sa_tanh_one = torch.masked_select(code_sa_tanh[_i, :], mask_1forgt0[_i, :]).reshape(1, -1) 77 | 78 | if self.ir_attn_mode == 'sigmoid_scalar': 79 | # attn_w_one: [1 x 1 x real_node_num] 80 | attn_w_one = torch.sigmoid(code_sa_tanh_one).reshape(1, 1, -1) 81 | else: 82 | attn_w_one = F.softmax(code_sa_tanh_one, dim=1).reshape(1, 1, -1) 83 | 84 | if self.conf['save_attn_weight']: 85 | self.attn_weight_torch.append(attn_w_one.detach().reshape(1, -1).cpu()) 86 | self.node_mask_torch.append(mask_1forgt0[_i, :].detach().reshape(1, -1).cpu()) 87 | 88 | # attn_feat_one: [1 x real_node_num x n_hidden] 89 | attn_feat_one = torch.masked_select(code_feat[_i, :, :].reshape(1, node_num, self.n_hidden), 90 | mask_1forgt0[_i, :].reshape(1, node_num, 1)).reshape(1, -1, self.n_hidden) 91 | # out_to_cat: [1 x n_hidden] 92 | out_to_cat = torch.bmm(attn_w_one, attn_feat_one).reshape(1, self.n_hidden) 93 | # self_attn_code_feat: [batch_size x n_hidden] 94 | self_attn_code_feat = out_to_cat if self_attn_code_feat is None else torch.cat( 95 | (self_attn_code_feat, out_to_cat), 0) 96 | 97 | if self.conf['use_attn']: 98 | self_attn_code_feat = torch.tanh( 99 | self.linear_attn_out( 100 | F.dropout(self_attn_code_feat, self.dropout, training=self.training)) 101 | ) 102 | elif self.conf['use_tanh']: 103 | self_attn_code_feat = torch.tanh(self_attn_code_feat) 104 | 105 | # self_attn_code_feat: [batch_size x n_hidden] 106 | return self_attn_code_feat 107 | 108 | def desc_encoding(self, desc, desc_len): 109 | batch_size = desc.size()[0] 110 | desc_enc_hidden = self.desc_encoder.init_hidden(batch_size) 111 | # desc_enc_hidden: [2 x batch_size x n_hidden] 112 | _, desc_enc_hidden = self.desc_encoder(desc, desc_len) 113 | # desc_feat: [batch_size x n_hidden] 114 | desc_feat = desc_enc_hidden[0].reshape(batch_size, self.n_hidden) 115 | 116 | if self.conf['transform_every_modal']: 117 | desc_feat = torch.tanh( 118 | self.linear_single_modal( 119 | F.dropout(desc_feat, self.dropout, training=self.training) 120 | ) 121 | ) 122 | elif self.conf['use_tanh']: 123 | desc_feat = torch.tanh(desc_feat) 124 | 125 | # desc_feat: [batch_size x n_hidden] 126 | return desc_feat 127 | 128 | 129 | def forward(self, ir_anno, ir_adjmat, ir_node_mask, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len): 130 | # code_repr: [batch_size x n_hidden] 131 | ir_repr = self.code_encoding(ir_anno, ir_adjmat, ir_node_mask) 132 | # desc_repr: [batch_size x n_hidden] 133 | desc_anchor_repr = self.desc_encoding(desc_anchor, desc_anchor_len) 134 
| desc_neg_repr = self.desc_encoding(desc_neg, desc_neg_len) 135 | 136 | # sim: [batch_sz] 137 | anchor_sim = F.cosine_similarity(ir_repr, desc_anchor_repr) 138 | neg_sim = F.cosine_similarity(ir_repr, desc_neg_repr) 139 | 140 | loss = (self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean() 141 | 142 | return loss 143 | -------------------------------------------------------------------------------- /src/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | from tqdm import tqdm 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format="%(message)s") 12 | 13 | import torch 14 | 15 | import models, configs, data_loader 16 | from modules import get_cosine_schedule_with_warmup 17 | from utils import similarity, normalize 18 | from data_loader import * 19 | 20 | 21 | def test(config, model, device): 22 | logger.info('Test Begin...') 23 | 24 | model.eval() 25 | model.to(device) 26 | 27 | # load data 28 | data_path = args.data_path+args.dataset+'/' 29 | test_set = eval(config['dataset_name'])(config, data_path, 30 | config['test_ir'], config['n_node'], 31 | config['test_desc'], config['desc_len']) 32 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32, 33 | shuffle=False, drop_last=False, num_workers=1) 34 | # encode tokens and descs 35 | code_reprs, desc_reprs = [], [] 36 | n_processed = 0 37 | for batch in data_loader: 38 | # batch[0:3]: init_input, adjmat, node_mask 39 | code_batch = [tensor.to(device) for tensor in batch[:3]] 40 | # batch[3:5]: good_desc, good_desc_len 41 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 42 | with torch.no_grad(): 43 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 44 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 45 | # normalize when sim_measure=='cos' 46 | code_repr = normalize(code_repr) 47 | desc_repr = normalize(desc_repr) 48 | code_reprs.append(code_repr) 49 | desc_reprs.append(desc_repr) 50 | n_processed += batch[0].size(0) # +batch_size 51 | # code_reprs: [n_processed x n_hidden] 52 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 53 | 54 | # calculate similarity 55 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], [] 56 | test_sim_result, test_rank_result = [], [] 57 | for i in tqdm(range(0, n_processed)): 58 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden] 59 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed] 60 | negsims = np.negative(sims) 61 | predict = np.argsort(negsims) 62 | 63 | # SuccessRate@k 64 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]] 65 | sum_1.append(1.0) if i in predict_1 else sum_1.append(0.0) 66 | sum_5.append(1.0) if i in predict_5 else sum_5.append(0.0) 67 | sum_10.append(1.0) if i in predict_10 else sum_10.append(0.0) 68 | # MRR 69 | predict_list = predict.tolist() 70 | rank = predict_list.index(i) 71 | sum_mrr.append(1/float(rank+1)) 72 | 73 | # results need to be saved 74 | predict_20 = [int(k) for k in predict[0:20]] 75 | sim_20 = [sims[k] for k in predict_20] 76 | test_sim_result.append(zip(predict_20, sim_20)) 77 | test_rank_result.append(rank+1) 78 | 79 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, 
MRR={np.mean(sum_mrr)}') 80 | save_path = args.data_path + 'result/' 81 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy' 82 | np.save(save_path+sim_result_filename, test_sim_result) 83 | np.save(save_path+rank_result_filename, test_rank_result) 84 | 85 | 86 | def parse_args(): 87 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 88 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 89 | parser.add_argument('--model', type=str, default='IREmbeder', help='model name') 90 | parser.add_argument('-d', '--dataset', type=str, default='github1', help='name of dataset.java, python') 91 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from') 92 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 93 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 94 | return parser.parse_args() 95 | 96 | 97 | if __name__ == '__main__': 98 | args = parse_args() 99 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 100 | config = getattr(configs, 'config_'+args.model)() 101 | 102 | ##### Define model ###### 103 | logger.info('Constructing Model..') 104 | model = getattr(models, args.model)(config) # initialize the model 105 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5' 106 | model.load_state_dict(torch.load(ckpt, map_location=device)) 107 | 108 | test(config, model, device) 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, UNK_ID = [0, 1] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T) 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['enc', 'gesd', 'aesd']: 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 
'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | import nltk 64 | try: nltk.word_tokenize("hello world") 65 | except LookupError: nltk.download('punkt') 66 | 67 | def sent2indexes(sentence, vocab, maxlen): 68 | '''sentence: a string or list of string 69 | return: a numpy array of word indices 70 | ''' 71 | def convert_sent(sent, vocab, maxlen): 72 | idxes = np.zeros(maxlen, dtype=np.int64) 73 | idxes.fill(PAD_ID) 74 | tokens = nltk.word_tokenize(sent.strip()) 75 | idx_len = min(len(tokens), maxlen) 76 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 77 | return idxes, idx_len 78 | if type(sentence) is list: 79 | inds, lens = [], [] 80 | for sent in sentence: 81 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 82 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 83 | inds.append(idxes) 84 | lens.append(idx_len) 85 | return np.vstack(inds), np.vstack(lens) 86 | else: 87 | inds, lens = sent2indexes([sentence], vocab, maxlen) 88 | return inds[0], lens[0] 89 | 90 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 91 | '''indexes: numpy array''' 92 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 93 | indexes=filter(lambda i: i!=ignore_tok, indexes) 94 | toks, length = [], 0 95 | for idx in indexes: 96 | toks.append(ivocab.get(idx, '')) 97 | length+=1 98 | return ' '.join(toks), length 99 | 100 | ivocab = {v: k for k, v in vocab.items()} 101 | if indexes.ndim==1:# one sentence 102 | return revert_sent(indexes, ivocab, ignore_tok) 103 | else:# dim>1 104 | sentences, lens =[], [] # a batch of sentences 105 | for inds in indexes: 106 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 107 | sentences.append(sentence) 108 | lens.append(length) 109 | return sentences, lens 110 | 111 | ######################################################################## 112 | -------------------------------------------------------------------------------- /user study/queries.txt: -------------------------------------------------------------------------------- 1 | write a byte to output buffer of device 2 | insert a new value into list 3 | check string is suffix of another string 4 | remove all elements in list 5 | stop logging messages to syslog 6 | render bignum into decimal 7 | look for match name in system dictionary 8 | remove an element from a row vector 9 | load a bignum from int 10 | grab a lock on a mutex 11 | advance a carray cursor to next row of output 12 | split string into args respecting backwhack and quote 13 | retrieve page from the pager cache 14 | round up to nearest integer 15 | create a new thread safe queue of size siz 16 | push integer to end of list 17 | restart timer from current point in time 18 | return file name in path 19 | get name of current executable 20 | read machine uptime 21 | store integer into register 22 | stop stream server 23 | get length of UCS2 string 24 | contrain maximum of a range 25 | pad given buffer with len padding characters 26 | write data in output buffers to client 27 | generate trace call to print 28 | get dimensions of given bmp file 29 | add extension to filename 30 | read 
one word from onboard RAM 31 | encode ucs2 string into utf8 string 32 | check pointer is in the heap 33 | search a file in directory recursively 34 | compress block of raw data 35 | allocate and clean buffer 36 | receive N byte from socket 37 | lookup key in a hash map 38 | fast integral power function 39 | check if directory is empty 40 | create a new tree node 41 | free a dirty page 42 | create message with given type 43 | encrypt byte sequence 44 | parse checksum file 45 | remove trailing blanks, tabs and newlines 46 | search last occurrence of char in string 47 | binary search in sorted array of size 48 | calculate checksum of checkpoint 49 | judge whether two strings are equal 50 | return random integer value between min and max -------------------------------------------------------------------------------- /user study/readme.md: -------------------------------------------------------------------------------- 1 | # The questionnaire used to select the participants 2 | ``` 3 | 1) What grade are you in? 4 | 2) Have you taken C language courses in the past few years? 5 | 3) Do you have C language programming experience? If so, how long is your programming experience? 6 | 4) Which C language projects have you participated in? Please introduce them. 7 | ``` 8 | # The contents of each file 9 | ``` 10 | queries.txt: the 50 queries randomly selected from our test set, with further filtering (e.g., removing clear technical keywords) 11 | code_search_DeGraphCS.txt: the top-10 search results returned by DeGraphCS 12 | code_search_DeepCS.txt: the top-10 search results returned by DeepCS 13 | code_search_MMAN.txt: the top-10 search results returned by MMAN 14 | code_search_UNIF.txt: the top-10 search results returned by UNIF 15 | ``` 16 | --------------------------------------------------------------------------------