├── Appendix
├── README.md
├── baseline_comparison.png
├── propagation model.png
├── subgraph_compare.png
└── vfg_of_loop_recur.png
├── Baseline methods
├── AST-Att
│ ├── .DS_Store
│ ├── __pycache__
│ │ ├── configs.cpython-36.pyc
│ │ ├── data_loader.cpython-36.pyc
│ │ ├── modules.cpython-36.pyc
│ │ ├── util_ast.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── configs.py
│ ├── data
│ │ └── .DS_Store
│ ├── data_loader.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── astemb.cpython-36.pyc
│ │ │ ├── jointemb.cpython-36.pyc
│ │ │ └── tokenemb.cpython-36.pyc
│ │ └── astemb.py
│ ├── modules.py
│ ├── output
│ │ ├── .DS_Store
│ │ └── ASTEmbeder
│ │ │ └── .DS_Store
│ ├── test.py
│ ├── train.py
│ ├── util_ast.py
│ ├── util_desc.py
│ └── utils.py
├── CFG-Att
│ ├── .DS_Store
│ ├── __pycache__
│ │ ├── configs.cpython-36.pyc
│ │ ├── data_loader.cpython-36.pyc
│ │ ├── modules.cpython-36.pyc
│ │ ├── util_cfg.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── configs.py
│ ├── data
│ │ └── .DS_Store
│ ├── data_loader.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── cfgemb.cpython-36.pyc
│ │ │ ├── jointemb.cpython-36.pyc
│ │ │ └── tokenemb.cpython-36.pyc
│ │ └── cfgemb.py
│ ├── modules.py
│ ├── output
│ │ ├── .DS_Store
│ │ └── CFGEmbeder
│ │ │ └── .DS_Store
│ ├── test.py
│ ├── train.py
│ ├── util_cfg.py
│ ├── util_desc.py
│ └── utils.py
├── DeepCS
│ ├── .DS_Store
│ ├── Tok-Att.code-workspace
│ ├── __pycache__
│ │ ├── configs.cpython-36.pyc
│ │ ├── data_loader.cpython-36.pyc
│ │ ├── modules.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── configs.py
│ ├── data
│ │ └── .DS_Store
│ ├── data_loader.py
│ ├── data_prepare.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── jointemb.cpython-36.pyc
│ │ │ └── tokenemb.cpython-36.pyc
│ │ └── jointemb.py
│ ├── modules.py
│ ├── output
│ │ └── .DS_Store
│ ├── shuffle_index.npy
│ ├── test.py
│ ├── train.py
│ ├── user_study.py
│ ├── util_desc.py
│ ├── util_name.py
│ ├── util_tok.py
│ └── utils.py
├── MMAN(TDC)
│ ├── .DS_Store
│ ├── __pycache__
│ │ ├── configs.cpython-36.pyc
│ │ ├── data_loader.cpython-36.pyc
│ │ ├── modules.cpython-36.pyc
│ │ ├── util_cfg.cpython-36.pyc
│ │ ├── util_dfg.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── configs.py
│ ├── data
│ │ └── .DS_Store
│ ├── data_loader.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── cfgemb.cpython-36.pyc
│ │ │ ├── jointemb.cpython-36.pyc
│ │ │ └── tokenemb.cpython-36.pyc
│ │ └── cfgemb.py
│ ├── modules.py
│ ├── output
│ │ └── .DS_Store
│ ├── shuffle_index.npy
│ ├── test.py
│ ├── train.py
│ ├── util_cfg.py
│ ├── util_desc.py
│ ├── util_dfg.py
│ ├── util_tok.py
│ └── utils.py
├── MMAN
│ ├── .DS_Store
│ ├── __pycache__
│ │ ├── configs.cpython-36.pyc
│ │ ├── data_loader.cpython-36.pyc
│ │ ├── modules.cpython-36.pyc
│ │ ├── util_cfg.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── configs.py
│ ├── data
│ │ └── .DS_Store
│ ├── data_loader.py
│ ├── data_prepare
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── util_ast.cpython-36.pyc
│ │ │ ├── util_cfg.cpython-36.pyc
│ │ │ ├── util_desc.cpython-36.pyc
│ │ │ ├── util_tok.cpython-36.pyc
│ │ │ └── utils.cpython-36.pyc
│ │ ├── util_ast.py
│ │ ├── util_cfg.py
│ │ ├── util_desc.py
│ │ ├── util_tok.py
│ │ └── utils.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── cfgemb.cpython-36.pyc
│ │ │ ├── jointemb.cpython-36.pyc
│ │ │ ├── multiemb.cpython-36.pyc
│ │ │ └── tokenemb.cpython-36.pyc
│ │ └── multiemb.py
│ ├── modules.py
│ ├── output
│ │ ├── .DS_Store
│ │ └── MultiEmbeder
│ │ │ └── .DS_Store
│ ├── shuffle_index.npy
│ ├── test.py
│ └── train.py
├── Tok-Att
│ ├── .DS_Store
│ ├── Tok-Att.code-workspace
│ ├── __pycache__
│ │ ├── configs.cpython-36.pyc
│ │ ├── data_loader.cpython-36.pyc
│ │ ├── modules.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── configs.py
│ ├── data
│ │ └── .DS_Store
│ ├── data_loader.py
│ ├── data_prepare.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── jointemb.cpython-36.pyc
│ │ │ └── tokenemb.cpython-36.pyc
│ │ └── tokenemb.py
│ ├── modules.py
│ ├── output
│ │ ├── .DS_Store
│ │ └── TokenEmbeder
│ │ │ └── .DS_Store
│ ├── shuffle_index.npy
│ ├── test.py
│ ├── train.py
│ ├── util_desc.py
│ ├── util_tok.py
│ └── utils.py
└── readme.md
├── IR2graph
├── readme.md
└── vfg_construct.py
├── README.md
├── dataset
├── README.md
├── preprocessed_dataset
│ ├── origin.desc.txt
│ ├── origin.ir.txt
│ ├── readme.md
│ ├── test.desc.txt
│ ├── test.ir.txt
│ ├── train.desc.txt
│ └── train.ir.txt
└── raw_dataset
│ └── readme.md
├── src
├── README.md
├── configs.py
├── data_loader.py
├── generate_interface.py
├── models
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── cfgemb.cpython-36.pyc
│ │ ├── iremb.cpython-36.pyc
│ │ ├── jointemb.cpython-36.pyc
│ │ └── tokenemb.cpython-36.pyc
│ └── iremb.py
├── modules.py
├── test.py
├── train.py
├── util_desc.py
├── util_ir.py
└── utils.py
└── user study
├── code_search_DeGraphCS.txt
├── code_search_DeepCS.txt
├── code_search_MMAN.txt
├── code_search_UNIF.txt
├── queries.txt
└── readme.md
/Appendix/README.md:
--------------------------------------------------------------------------------
1 | # Online-Repo
2 |
3 | ## An example showing that the recursive and loop versions of a function can be represented as similarly as possible by our variable-based flow graph
4 | #### An example "get sum" function implemented with a loop
5 | ```
6 | int get_sum(int N){
7 | int sum = 0;
8 | while(N != 0){
9 | sum += N;
10 | N-=1;
11 | }
12 | return sum;
13 | }
14 | ```
15 | #### An example "get sum" function implemented with a recursive call
16 | ```
17 | int get_sum(int N){
18 | if(N == 0){return N;}
19 | else{
20 | int sum;
21 | sum = N + get_sum(N-1);
22 | return sum;
23 | }
24 | }
25 | ```
26 | #### The corresponding generated variable-based flow graphs are shown below:
27 | Annotation: for the recursive call "get_sum(N-1)" in (b), we link the result of "N-1" to the input parameter "N", and we regard the return value "sum" of get_sum as the result of "get_sum(N-1)", so we link the return value "sum" to the "add" operation.
28 | 
29 |
30 | #### To better illustrate the common characteristics of the variable-based flow graphs constructed by deGraphCS from the two realizations above, we extract the core part of each realization for comparison:
31 | ```
32 | sum += N; N -= 1; // in the loop version
33 | sum = N + get_sum(N-1); // in the recursive version
34 | ```
35 | #### The corresponding sub-graphs of the core part are shown below, from which we can clearly capture the common part:
36 | 
37 |
38 | #### The corresponding ASTs and CFGs generated from the two realizations above; the difference is obvious:
39 | 
40 |
41 | ## The details of the equations and algorithms in deGraphCS
42 | ### The implementation details of the attention mechanism on the whole graph and the comments
43 | ```
44 | self_attn = nn.Linear(self.n_hidden, self.n_hidden)
45 | self_attn_scalar = nn.Linear(self.n_hidden, 1)
46 | ```
47 |
48 | Here, the function f() in Equation (2) and Equation (4) corresponds to the first MLP layer: nn.Linear(self.n_hidden, self.n_hidden).
49 |
50 | u_vfg corresponds to the second MLP layer, nn.Linear(self.n_hidden, 1), which can be seen as a high-level representation of the VFG nodes.
51 |
52 | h_vfg is the final embedding of the whole graph, obtained as the weighted sum of the self_attn_scalar weights and each node's final embedding. The same distinction between u_vfg and h_vfg applies to the corresponding parts of Equation (4) and Equation (5).
53 |
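As a rough, hypothetical sketch (not the exact code of this repository), the snippet below shows one way the two layers above can be combined into the weighted-sum attention described by Equations (2), (4) and (5); the module name `GraphAttnPool`, the tensor shapes and the masking are assumptions made only for illustration.

```
import torch
import torch.nn as nn
import torch.nn.functional as F

class GraphAttnPool(nn.Module):
    """Minimal sketch of graph-level attention pooling (names and shapes assumed)."""
    def __init__(self, n_hidden):
        super().__init__()
        self.self_attn = nn.Linear(n_hidden, n_hidden)   # f() in Eq. (2)/(4)
        self.self_attn_scalar = nn.Linear(n_hidden, 1)   # produces the u_vfg scores

    def forward(self, node_states, node_mask):
        # node_states: [batch, n_node, n_hidden]; node_mask: [batch, n_node] (1 = real node)
        scores = self.self_attn_scalar(torch.tanh(self.self_attn(node_states))).squeeze(-1)
        scores = scores.masked_fill(node_mask == 0, float('-inf'))
        weights = F.softmax(scores, dim=1).unsqueeze(-1)  # attention weight per node
        h_vfg = (weights * node_states).sum(dim=1)        # weighted sum -> whole-graph embedding
        return h_vfg
```

The same pooling pattern can be applied to the comment (description) side by feeding the hidden states of the description encoder instead of the VFG node states.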
54 | ### The aggregation function used in Equation (1)
55 | The aggregation function used in Equation (1) can be illustrated as follows:
56 | 
57 |
58 | In the equations above, Eq. 1 is the initialization step, which copies the node annotations into the first components
59 | of the hidden state and pads the rest with zeros.
60 |
61 | Eq. 2 is the step that passes information between
62 | different nodes of the graph via incoming and outgoing edges with parameters dependent on the edge
63 | type and direction.
64 |
65 | The remaining equations are GRU-like updates that incorporate information from the other nodes and from the previous timestep
66 | to update each node's hidden state.
67 |
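As an illustration of these equations, the following is a minimal, simplified GGNN-style propagation step; it collapses the edge types into one dense adjacency matrix per direction, and all names and shapes are assumptions rather than the implementation used in this repository.

```
import torch
import torch.nn as nn

class GGNNPropagation(nn.Module):
    """Simplified sketch of the propagation model (names and shapes assumed)."""
    def __init__(self, state_dim, annotation_dim, n_steps):
        super().__init__()
        self.state_dim = state_dim
        self.annotation_dim = annotation_dim
        self.n_steps = n_steps
        self.msg_in = nn.Linear(state_dim, state_dim)    # parameters for incoming edges
        self.msg_out = nn.Linear(state_dim, state_dim)   # parameters for outgoing edges
        self.gru = nn.GRUCell(2 * state_dim, state_dim)  # gated update of node states

    def forward(self, annotation, adj_in, adj_out):
        # annotation: [n_node, annotation_dim]; adj_in/adj_out: [n_node, n_node]
        n_node = annotation.size(0)
        pad = torch.zeros(n_node, self.state_dim - self.annotation_dim)
        h = torch.cat([annotation, pad], dim=1)                # Eq. 1: copy annotations, pad with zeros
        for _ in range(self.n_steps):
            a_in = adj_in @ self.msg_in(h)                     # Eq. 2: messages over incoming edges
            a_out = adj_out @ self.msg_out(h)                  #        and over outgoing edges
            h = self.gru(torch.cat([a_in, a_out], dim=1), h)   # GRU-like update from messages + previous state
        return h
```

In the repository the corresponding hyper-parameters appear in the baseline configs (e.g. 'state_dim', 'annotation_dim' and 'n_steps' in config_CFGEmbeder); here they are just constructor arguments.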
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/Appendix/baseline_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/baseline_comparison.png
--------------------------------------------------------------------------------
/Appendix/propagation model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/propagation model.png
--------------------------------------------------------------------------------
/Appendix/subgraph_compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/subgraph_compare.png
--------------------------------------------------------------------------------
/Appendix/vfg_of_loop_recur.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Appendix/vfg_of_loop_recur.png
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/__pycache__/configs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/configs.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/__pycache__/util_ast.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/util_ast.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_ASTEmbeder():
3 | conf = {
4 | # added_params
5 | 'transform_attn_out': 0,
6 | 'transform_every_modal': 0, # to make modal more complex?
7 | 'save_attn_weight': 0,
8 | 'use_tanh': 1,
9 | 'use_attn': 1,
10 | 'use_desc_attn': 1,
11 |
12 | # tree lstm
13 | 'treelstm_cell_type': 'nary', # nary or childsum
14 |
15 | # data_params
16 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
17 | #training data
18 | 'train_ast':'train.ast.json',
19 | 'train_desc':'train.desc.h5',
20 | # test data
21 | 'test_ast':'test.ast.json',
22 | 'test_desc':'test.desc.h5',
23 |
24 | #parameters
25 | 'desc_len': 30,
26 | 'n_ast_words': 16000, # len(vocabulary) + 1
27 | 'n_desc_words': 10000, # wait to decide
28 | #vocabulary info
29 | 'vocab_ast':'vocab.ast.json',
30 | 'vocab_desc':'vocab.desc.json',
31 |
32 | #training_params
33 | 'batch_size': 32,
34 | 'nb_epoch': 200,
35 | #'optimizer': 'adam',
36 | 'learning_rate':0.0003, # try 1e-4(paper)
37 | 'adam_epsilon':1e-8,
38 | 'warmup_steps':5000,
39 | 'fp16': False,
40 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].
41 | #"See details at https://nvidia.github.io/apex/amp.html"
42 |
43 | # model_params
44 | 'emb_size': 300,
45 | 'n_hidden': 512,#number of hidden dimension of code/desc representation
46 | # recurrent
47 | 'margin': 0.6,
48 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf
49 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization.
50 | 'dropout':0.1
51 | }
52 | return conf
53 |
54 |
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/data/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .astemb import ASTEmbeder
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/models/__pycache__/astemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/astemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/output/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/output/ASTEmbeder/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/AST-Att/output/ASTEmbeder/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | from tqdm import tqdm
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format="%(message)s")
12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package
13 |
14 | import torch
15 |
16 | import models, configs, data_loader
17 | from modules import get_cosine_schedule_with_warmup
18 | from utils import similarity, normalize
19 | from data_loader import *
20 |
21 |
22 | def test(config, model, device):
23 | logger.info('test begin...')
24 |
25 | model.eval()
26 | model.to(device)
27 |
28 | # load data
29 | data_path = args.data_path+args.dataset+'/'
30 | test_set = eval(config['dataset_name'])(data_path,
31 | config['test_ast'], config['vocab_ast'],
32 | config['test_desc'], config['desc_len'])
33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32,
34 | collate_fn=batcher(device), shuffle=False, drop_last=False, num_workers=0)
35 | # encode asts and descs
36 | code_reprs, desc_reprs = [], []
37 | n_processed = 0
38 | for batch in data_loader:
39 | code_batch = [tensor for tensor in batch[:2]]
40 | desc_batch = [tensor for tensor in batch[2:4]]
41 | with torch.no_grad():
42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
43 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
44 | # normalize when sim_measure=='cos'
45 | code_repr = normalize(code_repr)
46 | desc_repr = normalize(desc_repr)
47 | code_reprs.append(code_repr)
48 | desc_reprs.append(desc_repr)
49 | n_processed += batch[2].size(0) # +batch_size
50 | # code_reprs: [n_processed x n_hidden]
51 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
52 |
53 | # calculate similarity
54 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
55 | test_sim_result, test_rank_result = [], []
56 | for i in tqdm(range(0, n_processed)):
57 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
58 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
59 | negsims = np.negative(sims)
60 | predict = np.argsort(negsims)
61 |
62 | # SuccessRate@k
63 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
64 | sum_1.append(1.0 if i in predict_1 else 0.0)
65 | sum_5.append(1.0 if i in predict_5 else 0.0)
66 | sum_10.append(1.0 if i in predict_10 else 0.0)
67 | # MRR
68 | predict_list = predict.tolist()
69 | rank = predict_list.index(i)
70 | sum_mrr.append(1/float(rank+1))
71 |
72 | # results need to be saved
73 | predict_20 = [int(k) for k in predict[0:20]]
74 | sim_20 = [sims[k] for k in predict_20]
75 | test_sim_result.append(zip(predict_20, sim_20))
76 | test_rank_result.append(rank+1)
77 |
78 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
79 | save_path = args.data_path + 'result/'
80 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
81 | #np.save(save_path+sim_result_filename, test_sim_result)
82 | #np.save(save_path+rank_result_filename, test_rank_result)
83 |
84 |
85 | def parse_args():
86 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
87 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
88 | parser.add_argument('--model', type=str, default='ASTEmbeder', help='model name')
89 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python')
90 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from')
91 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\
92 | 'Note: should be consistent with the same argument in the repr_code.py')
93 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
94 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
95 | return parser.parse_args()
96 |
97 |
98 | if __name__ == '__main__':
99 | args = parse_args()
100 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
101 | config = getattr(configs, 'config_'+args.model)()
102 |
103 | ##### Define model ######
104 | logger.info('Constructing Model..')
105 | model = getattr(models, args.model)(config) # initialize the model
106 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
107 | model.load_state_dict(torch.load(ckpt, map_location=device))
108 |
109 | test(config, model, device)
110 |
111 |
112 |
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/util_desc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 | from utils import UNK_ID
7 |
8 | def make_shuffle_index(args):
9 | dir_path = args.data_path + args.dataset
10 | all_desc_file_path = dir_path + args.all_desc_file
11 | with open(all_desc_file_path, 'r') as all_desc_file:
12 | lines = all_desc_file.readlines()
13 | all_num = int(len(lines)/2)
14 |
15 | index = np.arange(all_num)
16 | np.random.seed(16)
17 | np.random.shuffle(index)
18 | #print(index)
19 | np.save(args.shuffle_index_file, index)
20 |
21 | def split_data(args):
22 |
23 | dir_path = args.data_path + args.dataset
24 | all_desc_file_path = dir_path + args.all_desc_file
25 | train_desc_file_path = dir_path + args.train_desc_file
26 | test_desc_file_path = dir_path + args.test_desc_file
27 |
28 | input_desc = []
29 | with open(all_desc_file_path, 'r') as all_desc_file:
30 | lines = all_desc_file.readlines()
31 | for line in lines:
32 | if (line[0:10] != 'BeginFunc:'):
33 | input_desc.append(line)
34 | print('number of input desc:\n', len(input_desc))
35 |
36 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file:
37 | for i in range(0, args.trainset_num):
38 | train_desc_file.write(input_desc[i])
39 | for i in range(args.testset_start_ind, args.testset_start_ind+args.testset_num):
40 | test_desc_file.write(input_desc[i])
41 |
42 |
43 | def create_dict_file(args):
44 | dir_path = args.data_path + args.dataset
45 | desc_file_path = dir_path + args.train_desc_file
46 |
47 | input_desc = []
48 | with open(desc_file_path, 'r') as desc_file:
49 | input_desc = desc_file.readlines()
50 | desc_words = []
51 | for i in range(0, len(input_desc)):
52 | input_desc[i] = input_desc[i].rstrip('\n')
53 | desc_word_list = input_desc[i].split()
54 | for desc_word in desc_word_list:
55 | desc_words.append(desc_word)
56 | vocab_desc_info = Counter(desc_words)
57 | print(len(vocab_desc_info))
58 |
59 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]]
60 | vocab_desc_index = {'<pad>':0, '<unk>':1}
61 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))]))
62 |
63 |
64 | vocab_desc_file_path = dir_path + args.vocab_desc_file
65 | desc_dic_str = json.dumps(vocab_desc_index)
66 | with open(vocab_desc_file_path, 'w') as vocab_desc_file:
67 | vocab_desc_file.write(desc_dic_str)
68 |
69 |
70 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
71 | phrases, indices = [], []
72 | with open(sent_file_path, 'r') as sent_file:
73 | sents = sent_file.readlines()
74 | vocab = json.loads(open(vocab_file_path, "r").readline())
75 | start_index = 0
76 | for i in range(0, len(sents)):
77 | sent = sents[i].rstrip('\n')
78 | word_list = sent.split()
79 | sent_len = min(len(word_list), maxlen)
80 | indices.append((sent_len, start_index))
81 | for j in range(0, sent_len):
82 | word = word_list[j]
83 | phrases.append(vocab.get(word, UNK_ID))
84 | start_index += sent_len
85 | output_file_path = sent_file_path[0:-3] + 'h5'
86 | output_file = h5py.File(output_file_path, 'w')
87 | output_file['phrases'] = phrases
88 | output_file['indices'] = indices
89 | output_file.close()
90 |
91 |
92 | def parse_args():
93 | parser = argparse.ArgumentParser("Parse Description data for ASTEmbeder")
94 |
95 | parser.add_argument('--data_path', type=str, default='./data/')
96 | parser.add_argument('--dataset', type=str, default='github/')
97 |
98 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt')
99 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt')
100 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt')
101 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json')
102 |
103 | parser.add_argument('--trainset_num', type=int, default=32000)
104 | parser.add_argument('--testset_num', type=int, default=1000)
105 | parser.add_argument('--testset_start_ind', type=int, default=32000)
106 | parser.add_argument('--desc_word_num', type=int, default=10000)
107 | parser.add_argument('--desc_maxlen', type=int, default=30)
108 |
109 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
110 |
111 | return parser.parse_args()
112 |
113 | if __name__ == '__main__':
114 | args = parse_args()
115 |
116 | split_data(args)
117 | create_dict_file(args)
118 |
119 | dir_path = args.data_path + args.dataset
120 | # train.desc.txt -> train.desc.h5(and test...)
121 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
122 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
123 |
124 |
--------------------------------------------------------------------------------
/Baseline methods/AST-Att/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T)
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 | elif measure in ['euc', 'gesd', 'aesd']:
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 | return '%d:%d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 | '''
64 | import nltk
65 | try: nltk.word_tokenize("hello world")
66 | except LookupError: nltk.download('punkt')
67 |
68 | def sent2indexes(sentence, vocab, maxlen):
69 |
70 | def convert_sent(sent, vocab, maxlen):
71 | idxes = np.zeros(maxlen, dtype=np.int64)
72 | idxes.fill(PAD_ID)
73 | tokens = nltk.word_tokenize(sent.strip())
74 | idx_len = min(len(tokens), maxlen)
75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID)
76 | return idxes, idx_len
77 | if type(sentence) is list:
78 | inds, lens = [], []
79 | for sent in sentence:
80 | idxes, idx_len = convert_sent(sent, vocab, maxlen)
81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len])
82 | inds.append(idxes)
83 | lens.append(idx_len)
84 | return np.vstack(inds), np.vstack(lens)
85 | else:
86 | inds, lens = sent2indexes([sentence], vocab, maxlen)
87 | return inds[0], lens[0]
88 | '''
89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
90 | '''indexes: numpy array'''
91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
92 | indexes=filter(lambda i: i!=ignore_tok, indexes)
93 | toks, length = [], 0
94 | for idx in indexes:
95 | toks.append(ivocab.get(idx, ''))
96 | length+=1
97 | return ' '.join(toks), length
98 |
99 | ivocab = {v: k for k, v in vocab.items()}
100 | if indexes.ndim==1:# one sentence
101 | return revert_sent(indexes, ivocab, ignore_tok)
102 | else:# dim>1
103 | sentences, lens =[], [] # a batch of sentences
104 | for inds in indexes:
105 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
106 | sentences.append(sentence)
107 | lens.append(length)
108 | return sentences, lens
109 |
110 | ########################################################################
111 |
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/__pycache__/configs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/configs.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/__pycache__/util_cfg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/util_cfg.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_CFGEmbeder():
3 | conf = {
4 | # added_params
5 | 'transform_every_modal': 0, # to make modal more complex?
6 | 'save_attn_weight': 0,
7 | 'use_tanh': 1,
8 | 'use_attn': 1,
9 |
10 | # GGNN
11 | 'state_dim': 512, # GGNN hidden state size
12 | 'annotation_dim': 5,
13 | 'n_edge_types': 2,
14 | 'n_node': 200, # could be less than 512, like the maximum nodenum
15 | 'n_steps': 5, # number of propagation steps of the GGNN
16 | 'output_type': 'no_reduce',
17 | 'batch_size': 32,
18 | 'n_layers': 1,
19 | 'n_hidden': 512,
20 | 'cfg_attn_mode': 'sigmoid_scalar',
21 |
22 | # data_params
23 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
24 | #training data
25 | 'train_cfg':'train.cfg.txt',
26 | 'train_desc':'train.desc.h5',
27 | # test data
28 | 'test_cfg':'test.cfg.txt',
29 | 'test_desc':'test.desc.h5',
30 |
31 | #parameters
32 | 'desc_len': 30,
33 | 'n_desc_words': 10000, # wait to decide
34 | #vocabulary info
35 | 'vocab_desc':'vocab.desc.json',
36 |
37 | #training_params
38 | 'chunk_size': 200000,
39 | 'nb_epoch': 200,
40 | #'optimizer': 'adam',
41 | 'learning_rate':0.0003, # try 1e-4(paper)
42 | 'adam_epsilon':1e-8,
43 | 'warmup_steps':5000,
44 | 'fp16': False,
45 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].
46 | #"See details at https://nvidia.github.io/apex/amp.html"
47 |
48 | # model_params
49 | 'emb_size': 300,
50 | # recurrent
51 | 'margin': 0.6,
52 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf
53 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization.
54 | 'dropout': 0.1
55 | }
56 | return conf
57 |
58 |
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/data/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/data_loader.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import torch.utils.data as data
4 | import torch.nn as nn
5 | import tables
6 | import json
7 | import random
8 | import numpy as np
9 | import pickle
10 |
11 | from utils import PAD_ID, UNK_ID, indexes2sent
12 | import configs
13 | from util_cfg import get_cfg_npy_info, get_one_cfg_npy_info
14 |
15 | import logging
16 | logger = logging.getLogger(__name__)
17 | logging.basicConfig(level=logging.INFO, format="%(message)s")
18 |
19 |
20 | class CodeSearchDataset(data.Dataset):
21 | """
22 | Dataset that has only positive samples.
23 | """
24 | def __init__(self, config, data_dir, f_cfgs, max_node_num, f_descs=None, max_desc_len=None):
25 |
26 | self.max_node_num = max_node_num
27 | self.max_desc_len = max_desc_len
28 | self.n_edge_types = config['n_edge_types']
29 | self.state_dim = config['state_dim']
30 | self.annotation_dim = config['annotation_dim']
31 |
32 | # initialize file path or list of file names
33 | self.training = False
34 | print("Loading Data...")
35 |
36 | self.mark_list = []
37 | start_index, end_index = [0, 0]
38 | with open(data_dir+f_cfgs, 'r') as cfg_file:
39 | self.cfg_lines = cfg_file.readlines()
40 | for i in range(0, len(self.cfg_lines)):
41 | self.cfg_lines[i] = self.cfg_lines[i].rstrip('\n')
42 | if self.cfg_lines[i][0:10] == 'BeginFunc:' and i != 0:
43 | end_index = i
44 | self.mark_list.append([start_index, end_index])
45 | start_index = i
46 | self.mark_list.append([start_index, len(self.cfg_lines)])
47 |
48 | '''
49 | # cfg_adjmat: [all_num x n_node x (n_node * n_edge_types * 2)]
50 | # cfg_init_input: [all_num x n_node x state_dim]
51 | # cfg_node_mask: [all_num x n_node]
52 | self.cfg_adjmat, self.cfg_init_input, self.cfg_node_mask = get_cfg_npy_info(self.cfg_lines,
53 | self.max_node_num, self.n_edge_types, self.state_dim, self.annotation_dim)
54 | '''
55 |
56 | if f_descs is not None:
57 | self.training = True
58 | table_desc = tables.open_file(data_dir+f_descs)
59 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long)
60 | self.idx_descs = table_desc.get_node('/indices')[:]
61 | '''
62 | if f_descs is not None:
63 | assert len(self.cfg_adjmat)==self.idx_descs.shape[0]
64 | '''
65 | self.data_len = self.idx_descs.shape[0]
66 | print("{} entries".format(self.data_len))
67 |
68 | def pad_seq(self, seq, maxlen):
69 | if len(seq) < maxlen:
70 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq)))
71 | seq = seq[:maxlen]
72 | return seq
73 |
74 | def __getitem__(self, offset):
75 | #print('offset:\n', offset)
76 | #print('cfg start_index = {}, end_index = {}'.format(self.mark_list[offset][0], self.mark_list[offset][1]))
77 |
78 | input_cfg_lines = self.cfg_lines[self.mark_list[offset][0]: self.mark_list[offset][1]]
79 | adjmat, init_input, node_mask = get_one_cfg_npy_info(input_cfg_lines,
80 | self.max_node_num, self.n_edge_types, self.state_dim, self.annotation_dim)
81 |
82 | if self.training:
83 | len, pos = self.idx_descs[offset][0], self.idx_descs[offset][1]
84 | good_desc_len = min(int(len), self.max_desc_len)
85 | good_desc = self.descs[pos: pos+good_desc_len]
86 | good_desc = self.pad_seq(good_desc, self.max_desc_len)
87 |
88 | rand_offset = random.randint(0, self.data_len-1)
89 | len, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1]
90 | bad_desc_len = min(int(len), self.max_desc_len)
91 | bad_desc = self.descs[pos: pos+bad_desc_len]
92 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len)
93 |
94 | return torch.Tensor(init_input), torch.Tensor(adjmat), torch.Tensor(node_mask), good_desc, good_desc_len, bad_desc, bad_desc_len
95 | return torch.Tensor(init_input), torch.Tensor(adjmat), torch.Tensor(node_mask), good_desc, good_desc_len
96 |
97 | def __len__(self):
98 | return self.data_len
99 |
100 | def load_dict(filename):
101 | return json.loads(open(filename, "r").readline())
102 | #return pickle.load(open(filename, 'rb'))
103 |
104 |
105 | if __name__ == '__main__':
106 | device = 'cpu'
107 | config = getattr(configs, 'config_CFGEmbeder')()
108 | input_dir = './data/github/'
109 |
110 | train_set = CodeSearchDataset(config, input_dir, 'train.cfg.txt', 512, 'train.desc.h5', 30)
111 | train_data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=5, shuffle=False, drop_last=False, num_workers=1)
112 | print('number of batch:\n', len(train_data_loader))
113 | '''
114 | use_set = CodeSearchDataset(input_dir, 'use.tokens.h5', 30)
115 | use_data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1, shuffle=False, num_workers=1)
116 | #print(len(use_data_loader))
117 | vocab_tokens = load_dict(input_dir+'vocab.tokens.json')
118 | vocab_desc = load_dict(input_dir+'vocab.desc.json')
119 | '''
120 | vocab_desc = load_dict(input_dir+'vocab.desc.json')
121 | print('============ Train Data ================')
122 | k = 0
123 | for epo in range(0,3):
124 | for batch in train_data_loader:
125 | print("batch[1].size(): ", batch[1].size())
126 | #batch = tuple([t.numpy() for t in batch])
127 | init_input, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len = [tensor.to(device) for tensor in batch]
128 | print(adjmat.dtype)
129 | #print(batch)
130 | k+=1
131 | #if k>0: break
132 | print('-------------------------------')
133 | print(indexes2sent(good_desc, vocab_desc))
134 | #print(indexes2sent(good_desc, vocab_desc))
135 |
136 |
137 |
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .cfgemb import CFGEmbeder
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/models/__pycache__/cfgemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/cfgemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/output/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/output/CFGEmbeder/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/CFG-Att/output/CFGEmbeder/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | from tqdm import tqdm
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format="%(message)s")
12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package
13 |
14 | import torch
15 |
16 | import models, configs, data_loader
17 | from modules import get_cosine_schedule_with_warmup
18 | from utils import similarity, normalize
19 | from data_loader import *
20 |
21 |
22 | def test(config, model, device):
23 | logger.info('Test Begin...')
24 |
25 | model.eval()
26 | model.to(device)
27 |
28 | # load data
29 | data_path = args.data_path+args.dataset+'/'
30 | test_set = eval(config['dataset_name'])(config, data_path,
31 | config['test_cfg'], config['n_node'],
32 | config['test_desc'], config['desc_len'])
33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32,
34 | shuffle=False, drop_last=False, num_workers=1)
35 | # encode tokens and descs
36 | code_reprs, desc_reprs = [], []
37 | n_processed = 0
38 | for batch in data_loader:
39 | # batch[0:3]: init_input, adjmat, node_mask
40 | code_batch = [tensor.to(device) for tensor in batch[:3]]
41 | # batch[3:5]: good_desc, good_desc_len
42 | desc_batch = [tensor.to(device) for tensor in batch[3:5]]
43 | with torch.no_grad():
44 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
45 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
46 | # normalize when sim_measure=='cos'
47 | code_repr = normalize(code_repr)
48 | desc_repr = normalize(desc_repr)
49 | code_reprs.append(code_repr)
50 | desc_reprs.append(desc_repr)
51 | n_processed += batch[0].size(0) # +batch_size
52 | # code_reprs: [n_processed x n_hidden]
53 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
54 |
55 | # calculate similarity
56 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
57 | test_sim_result, test_rank_result = [], []
58 | for i in tqdm(range(0, n_processed)):
59 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
60 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
61 | negsims = np.negative(sims)
62 | predict = np.argsort(negsims)
63 |
64 | # SuccessRate@k
65 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
66 | sum_1.append(1.0 if i in predict_1 else 0.0)
67 | sum_5.append(1.0 if i in predict_5 else 0.0)
68 | sum_10.append(1.0 if i in predict_10 else 0.0)
69 | # MRR
70 | predict_list = predict.tolist()
71 | rank = predict_list.index(i)
72 | sum_mrr.append(1/float(rank+1))
73 |
74 | # results need to be saved
75 | predict_20 = [int(k) for k in predict[0:20]]
76 | sim_20 = [sims[k] for k in predict_20]
77 | test_sim_result.append(zip(predict_20, sim_20))
78 | test_rank_result.append(rank+1)
79 |
80 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
81 | save_path = args.data_path + 'result/'
82 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
83 | np.save(save_path+sim_result_filename, test_sim_result)
84 | np.save(save_path+rank_result_filename, test_rank_result)
85 |
86 |
87 | def parse_args():
88 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
89 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
90 | parser.add_argument('--model', type=str, default='CFGEmbeder', help='model name')
91 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python')
92 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from')
93 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\
94 | 'Note: should be consistent with the same argument in the repr_code.py')
95 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
96 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
97 | return parser.parse_args()
98 |
99 |
100 | if __name__ == '__main__':
101 | args = parse_args()
102 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
103 | config = getattr(configs, 'config_'+args.model)()
104 |
105 | ##### Define model ######
106 | logger.info('Constructing Model..')
107 | model = getattr(models, args.model)(config) # initialize the model
108 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
109 | model.load_state_dict(torch.load(ckpt, map_location=device))
110 |
111 | test(config, model, device)
112 |
113 |
114 |
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/util_desc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 | from utils import UNK_ID
7 |
8 | def make_shuffle_index_num(args, all_num):
9 | index = np.arange(all_num)
10 | np.random.seed(16)
11 | np.random.shuffle(index)
12 | print('index:\n', index)
13 | np.save(args.shuffle_index_file, index)
14 |
15 | def make_shuffle_index(args):
16 | dir_path = args.data_path + args.dataset
17 | all_desc_file_path = dir_path + args.all_desc_file
18 | with open(all_desc_file_path, 'r') as all_desc_file:
19 | lines = all_desc_file.readlines()
20 | all_num = int(len(lines)/2)
21 | print('all_num of desc:\n', all_num)
22 |
23 | index = np.arange(all_num)
24 | np.random.seed(16)
25 | np.random.shuffle(index)
26 | print('index:\n', index)
27 | np.save(args.shuffle_index_file, index)
28 |
29 | def split_data(args):
30 | index = np.load(args.shuffle_index_file)
31 |
32 | dir_path = args.data_path + args.dataset
33 | all_desc_file_path = dir_path + args.all_desc_file
34 | train_desc_file_path = dir_path + args.train_desc_file
35 | test_desc_file_path = dir_path + args.test_desc_file
36 |
37 | input_desc = []
38 | with open(all_desc_file_path, 'r') as all_desc_file:
39 | lines = all_desc_file.readlines()
40 | for line in lines:
41 | if (line[0:10] != 'BeginFunc:'):
42 | input_desc.append(line)
43 | print('number of input desc:\n', len(input_desc))
44 |
45 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file:
46 | for i in range(0, args.trainset_num):
47 | train_desc_file.write(input_desc[index[i]])
48 | for i in range(32000, 32000+args.testset_num):
49 | test_desc_file.write(input_desc[index[i]])
50 |
51 |
52 | def create_dict_file(args):
53 | dir_path = args.data_path + args.dataset
54 | desc_file_path = dir_path + args.train_desc_file
55 |
56 | input_desc = []
57 | with open(desc_file_path, 'r') as desc_file:
58 | input_desc = desc_file.readlines()
59 | desc_words = []
60 | for i in range(0, len(input_desc)):
61 | input_desc[i] = input_desc[i].rstrip('\n')
62 | desc_word_list = input_desc[i].split()
63 | for desc_word in desc_word_list:
64 | desc_words.append(desc_word)
65 | vocab_desc_info = Counter(desc_words)
66 | print(len(vocab_desc_info))
67 |
68 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]]
69 | vocab_desc_index = {'<pad>':0, '<unk>':1}
70 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))]))
71 |
72 |
73 | vocab_desc_file_path = dir_path + args.vocab_desc_file
74 | desc_dic_str = json.dumps(vocab_desc_index)
75 | with open(vocab_desc_file_path, 'w') as vocab_desc_file:
76 | vocab_desc_file.write(desc_dic_str)
77 |
78 |
79 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
80 | phrases, indices = [], []
81 | with open(sent_file_path, 'r') as sent_file:
82 | sents = sent_file.readlines()
83 | vocab = json.loads(open(vocab_file_path, "r").readline())
84 | start_index = 0
85 | for i in range(0, len(sents)):
86 | sent = sents[i].rstrip('\n')
87 | word_list = sent.split()
88 | sent_len = min(len(word_list), maxlen)
89 | indices.append((sent_len, start_index))
90 | for j in range(0, sent_len):
91 | word = word_list[j]
92 | phrases.append(vocab.get(word, UNK_ID))
93 | start_index += sent_len
94 | output_file_path = sent_file_path[0:-3] + 'h5'
95 | output_file = h5py.File(output_file_path, 'w')
96 | output_file['phrases'] = phrases
97 | output_file['indices'] = indices
98 | output_file.close()
99 |
100 |
101 | def parse_args():
102 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder")
103 |
104 | parser.add_argument('--data_path', type=str, default='./data/')
105 | parser.add_argument('--dataset', type=str, default='github/')
106 |
107 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt')
108 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt')
109 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt')
110 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json')
111 |
112 | parser.add_argument('--trainset_num', type=int, default=32000)
113 | parser.add_argument('--testset_num', type=int, default=1000)
114 | parser.add_argument('--desc_word_num', type=int, default=10000)
115 | parser.add_argument('--desc_maxlen', type=int, default=30)
116 |
117 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
118 |
119 | return parser.parse_args()
120 |
121 | if __name__ == '__main__':
122 | args = parse_args()
123 |
124 | #make_shuffle_index_num(args, 33000)
125 | #split_data(args)
126 | #create_dict_file(args)
127 |
128 | dir_path = args.data_path + args.dataset
129 | # train.desc.txt -> train.desc.h5(and test...)
130 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
131 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
132 |
133 |
--------------------------------------------------------------------------------
/Baseline methods/CFG-Att/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T)
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 | elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 | return '%d:%d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 |
64 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
65 | '''indexes: numpy array'''
66 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
67 | indexes=filter(lambda i: i!=ignore_tok, indexes)
68 | toks, length = [], 0
69 | for idx in indexes:
70 | toks.append(ivocab.get(idx, ''))
71 | length+=1
72 | return ' '.join(toks), length
73 |
74 | ivocab = {v: k for k, v in vocab.items()}
75 | if indexes.ndim==1:# one sentence
76 | return revert_sent(indexes, ivocab, ignore_tok)
77 | else:# dim>1
78 | sentences, lens =[], [] # a batch of sentences
79 | for inds in indexes:
80 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
81 | sentences.append(sentence)
82 | lens.append(length)
83 | return sentences, lens
84 |
85 | ########################################################################
86 |
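
For reference, a minimal usage sketch of the cosine path in similarity() above, assuming this utils.py is importable and using random placeholder vectors rather than real embeddings:

    import numpy as np
    from utils import normalize, similarity

    # toy embeddings: 4 candidate code vectors and 1 query description vector
    code_vecs = np.random.rand(4, 512).astype(np.float32)
    desc_vec = np.random.rand(1, 512).astype(np.float32)

    # 'cos' normalizes both sides row-wise and then takes a plain dot product
    sims = similarity(code_vecs, desc_vec, measure='cos')   # shape: [4]
    ranking = np.argsort(-sims)                             # best match first
    print(ranking, sims[ranking])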
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/Tok-Att.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "../CFG-Att"
5 | },
6 | {
7 | "path": "../AST-Att"
8 | },
9 | {
10 | "path": "."
11 | }
12 | ]
13 | }
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/__pycache__/configs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/configs.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_JointEmbeder():
3 | conf = {
4 | # data_params
5 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
6 | #training data
7 | 'train_name':'train.name.h5',
8 | 'train_tokens':'train.token.h5',
9 | 'train_desc':'train.desc.h5',
10 | # test data
11 | 'test_name':'test.name.h5',
12 | 'test_tokens':'test.token.h5',
13 | 'test_desc':'test.desc.h5',
14 | # user study data
15 | 'all_name': 'all.name.h5',
16 | 'all_tokens': 'all.token.h5',
17 | 'query_desc': 'query.desc.h5',
18 | #parameters
19 | 'name_len': 6,
20 | 'tokens_len': 50,
21 | 'desc_len': 30,
22 | 'n_words': 10000, # len(vocabulary) + 1
23 | #vocabulary info
24 | 'vocab_name':'vocab.name.json',
25 | 'vocab_tokens':'vocab.token.json',
26 | 'vocab_desc':'vocab.desc.json',
27 |
28 | #training_params
29 | 'batch_size': 32,
30 | 'nb_epoch': 200,
31 | #'optimizer': 'adam',
32 | 'learning_rate':0.0003, # try 1e-4(paper)
33 | 'adam_epsilon':1e-8,
34 | 'warmup_steps':5000,
35 | 'fp16': False,
36 |         'fp16_opt_level': 'O1', # For fp16: Apex AMP optimization level, one of ['O0', 'O1', 'O2', 'O3'].
37 |         # See details at https://nvidia.github.io/apex/amp.html
38 |
39 | # model_params
40 | 'use_desc_attn': 1,
41 | 'use_tanh': 1,
42 | 'emb_size': 512,
43 |         'n_hidden': 512, # hidden dimension of the code/desc representations
44 | 'lstm_dims': 256,
45 | # recurrent
46 | 'margin': 0.6,
47 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf
48 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization.
49 | 'dropout':0,
50 | }
51 | return conf
52 |
53 |
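
The dictionary above is resolved by model name at runtime (the same pattern test.py and user_study.py use); a minimal sketch, assuming configs.py is importable:

    import configs

    model_name = 'JointEmbeder'                           # matches models.JointEmbeder
    config = getattr(configs, 'config_' + model_name)()   # -> config_JointEmbeder()
    print(config['emb_size'], config['n_hidden'], config['sim_measure'])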
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/data/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .jointemb import JointEmbeder
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/models/jointemb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import numpy as np
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.init as weight_init
8 | import torch.nn.functional as F
9 |
10 | import logging
11 | logger = logging.getLogger(__name__)
12 | parentPath = os.path.abspath("..")
13 | sys.path.insert(0, parentPath)# add parent folder to path so as to import common modules
14 | from modules import SeqEncoder, BOWEncoder, SeqEncoder2
15 |
16 | class JointEmbeder(nn.Module):
17 | def __init__(self, config):
18 | super(JointEmbeder, self).__init__()
19 | self.conf = config
20 | self.margin = config['margin']
21 | self.dropout = config['dropout']
22 | self.n_hidden = config['n_hidden']
23 |
24 | self.name_encoder = SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims'])
25 | self.tok_encoder = BOWEncoder(config['n_words'],config['emb_size'],config['n_hidden'])
26 | self.desc_encoder = SeqEncoder2(config['n_words'],config['emb_size'],config['n_hidden'])
27 |
28 | self.w_name = nn.Linear(2*config['lstm_dims'], config['n_hidden'])
29 | self.w_tok = nn.Linear(config['emb_size'], config['n_hidden'])
30 | #self.w_desc = nn.Linear(2*config['lstm_dims'], config['n_hidden'])
31 | self.fuse3 = nn.Linear(config['n_hidden'], config['n_hidden'])
32 |
33 | self.self_attn2 = nn.Linear(self.n_hidden, self.n_hidden)
34 | self.self_attn_scalar2 = nn.Linear(self.n_hidden, 1)
35 |
36 | self.init_weights()
37 |
38 | def init_weights(self):# Initialize Linear Weight
39 | for m in [self.w_name, self.w_tok, self.fuse3]:
40 | m.weight.data.uniform_(-0.1, 0.1) #nn.init.xavier_normal_(m.weight)
41 | nn.init.constant_(m.bias, 0.)
42 |
43 | def code_encoding(self, name, name_len, tokens, tok_len):
44 | name_repr = self.name_encoder(name, name_len)
45 | tok_repr = self.tok_encoder(tokens, tok_len)
46 | code_repr = self.fuse3(torch.tanh(self.w_name(name_repr)+self.w_tok(tok_repr)))
47 | return code_repr
48 |
49 |
50 | def desc_encoding(self, desc, desc_len):
51 | batch_size = desc.size()[0]
52 | desc_enc_hidden = self.desc_encoder.init_hidden(batch_size)
53 | # desc_enc_hidden: [2 x batch_size x n_hidden]
54 | desc_feat, desc_enc_hidden = self.desc_encoder(desc, desc_len, desc_enc_hidden)
55 | desc_enc_hidden = desc_enc_hidden[0]
56 |
57 | if self.conf['use_desc_attn']:
58 | seq_len = desc_feat.size()[1]
59 |
60 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
61 | unpack_len_list = desc_len.long().to(device)
62 | range_tensor = torch.arange(seq_len).to(device)
63 | mask_1forgt0 = range_tensor[None, :] < unpack_len_list[:, None]
64 | mask_1forgt0 = mask_1forgt0.reshape(-1, seq_len)
65 |
66 | desc_sa_tanh = torch.tanh(self.self_attn2(desc_feat.reshape(-1, self.n_hidden))) # [(batch_sz * seq_len) x n_hidden]
67 | desc_sa_tanh = F.dropout(desc_sa_tanh, self.dropout, training=self.training)
68 | desc_sa_tanh = self.self_attn_scalar2(desc_sa_tanh).reshape(-1, seq_len) # [batch_sz x seq_len]
69 | desc_feat = desc_feat.reshape(-1, seq_len, self.n_hidden)
70 |
71 | self_attn_desc_feat = None
72 | for _i in range(batch_size):
73 | desc_sa_tanh_one = torch.masked_select(desc_sa_tanh[_i, :], mask_1forgt0[_i, :]).reshape(1, -1)
74 | # attn_w_one: [1 x 1 x seq_len]
75 | attn_w_one = F.softmax(desc_sa_tanh_one, dim=1).reshape(1, 1, -1)
76 |
77 | # attn_feat_one: [1 x seq_len x n_hidden]
78 | attn_feat_one = torch.masked_select(desc_feat[_i, :, :].reshape(1, seq_len, self.n_hidden),
79 | mask_1forgt0[_i, :].reshape(1, seq_len, 1)).reshape(1, -1, self.n_hidden)
80 | # out_to_cat: [1 x n_hidden]
81 | out_to_cat = torch.bmm(attn_w_one, attn_feat_one).reshape(1, self.n_hidden)
82 | # self_attn_code_feat: [batch_sz x n_hidden]
83 | self_attn_desc_feat = out_to_cat if self_attn_desc_feat is None else torch.cat(
84 | (self_attn_desc_feat, out_to_cat), 0)
85 |
86 | else:
87 | self_attn_desc_feat = desc_enc_hidden.reshape(batch_size, self.n_hidden)
88 |
89 | if self.conf['use_tanh']:
90 | self_attn_desc_feat = torch.tanh(self_attn_desc_feat)
91 |
92 | # desc_feat: [batch_size x n_hidden]
93 | return self_attn_desc_feat
94 |
95 |
96 | def similarity(self, code_vec, desc_vec):
97 | assert self.conf['sim_measure'] in ['cos', 'poly', 'euc', 'sigmoid', 'gesd', 'aesd'], "invalid similarity measure"
98 | if self.conf['sim_measure']=='cos':
99 | return F.cosine_similarity(code_vec, desc_vec)
100 | elif self.conf['sim_measure']=='poly':
101 | return (0.5*torch.matmul(code_vec, desc_vec.t()).diag()+1)**2
102 | elif self.conf['sim_measure']=='sigmoid':
103 | return torch.tanh(torch.matmul(code_vec, desc_vec.t()).diag()+1)
104 | elif self.conf['sim_measure'] in ['euc', 'gesd', 'aesd']:
105 | euc_dist = torch.dist(code_vec, desc_vec, 2) # or torch.norm(code_vec-desc_vec,2)
106 | euc_sim = 1 / (1 + euc_dist)
107 | if self.conf['sim_measure']=='euc': return euc_sim
108 | sigmoid_sim = torch.sigmoid(torch.matmul(code_vec, desc_vec.t()).diag()+1)
109 | if self.conf['sim_measure']=='gesd':
110 | return euc_sim * sigmoid_sim
111 | elif self.conf['sim_measure']=='aesd':
112 | return 0.5*(euc_sim+sigmoid_sim)
113 |
114 | def forward(self, name, name_len, tokens, tok_len, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len):
115 | # code_repr: [batch_sz x n_hidden]
116 | code_repr = self.code_encoding(name, name_len, tokens, tok_len)
117 | # desc_repr: [batch_sz x n_hidden]
118 | desc_anchor_repr = self.desc_encoding(desc_anchor, desc_anchor_len)
119 | desc_neg_repr = self.desc_encoding(desc_neg, desc_neg_len)
120 |
121 | # sim: [batch_sz]
122 | anchor_sim = self.similarity(code_repr, desc_anchor_repr)
123 | neg_sim = self.similarity(code_repr, desc_neg_repr)
124 |
125 | loss = (self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean()
126 |
127 | return loss
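
The objective returned by forward() (line 125) is a max-margin ranking loss over (code, matching description, negative description) triples; a self-contained sketch of the same computation on toy similarity scores:

    import torch

    margin = 0.6
    anchor_sim = torch.tensor([0.9, 0.4, 0.7])   # sim(code, matching desc)
    neg_sim    = torch.tensor([0.2, 0.5, 0.1])   # sim(code, negative desc)

    # penalize pairs where the negative is not at least `margin` below the positive
    loss = (margin - anchor_sim + neg_sim).clamp(min=1e-6).mean()
    print(loss)   # tensor(0.2333)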
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/output/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/shuffle_index.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/DeepCS/shuffle_index.npy
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
3 | import sys
4 | import traceback
5 | import numpy as np
6 | import argparse
7 | import threading
8 | import codecs
9 | import logging
10 | from tqdm import tqdm
11 | logger = logging.getLogger(__name__)
12 | logging.basicConfig(level=logging.INFO, format="%(message)s")
13 |
14 | import torch
15 |
16 | import models, configs, data_loader
17 | from modules import get_cosine_schedule_with_warmup
18 | from utils import similarity, normalize
19 | from data_loader import *
20 |
21 |
22 | def test(config, model, device):
23 | logger.info('test begin...')
24 |
25 | model.eval()
26 | model.to(device)
27 |
28 | # load data
29 | data_path = args.data_path+args.dataset+'/'
30 | test_set = eval(config['dataset_name'])(data_path,
31 | config['test_name'], config['name_len'],
32 | config['test_tokens'], config['tokens_len'],
33 | config['test_desc'], config['desc_len'])
34 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=2000,
35 | shuffle=False, drop_last=False, num_workers=1)
36 | # encode tokens and descs
37 | code_reprs, desc_reprs = [], []
38 | n_processed = 0
39 | for batch in data_loader:
40 | code_batch = [tensor.to(device) for tensor in batch[:4]]
41 | desc_batch = [tensor.to(device) for tensor in batch[4:6]]
42 | with torch.no_grad():
43 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
44 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
45 | # normalize when sim_measure=='cos'
46 | code_repr = normalize(code_repr)
47 | desc_repr = normalize(desc_repr)
48 | code_reprs.append(code_repr)
49 | desc_reprs.append(desc_repr)
50 | n_processed += batch[0].size(0) # +batch_size
51 | # code_reprs: [n_processed x n_hidden]
52 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
53 |
54 | # calculate similarity
55 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
56 | test_sim_result, test_rank_result = [], []
57 | for i in tqdm(range(0, n_processed)):
58 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
59 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
60 | negsims = np.negative(sims)
61 | predict = np.argsort(negsims)
62 |
63 | # SuccessRate@k
64 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
65 |         sum_1.append(1.0 if i in predict_1 else 0.0)
66 |         sum_5.append(1.0 if i in predict_5 else 0.0)
67 |         sum_10.append(1.0 if i in predict_10 else 0.0)
68 | # MRR
69 | predict_list = predict.tolist()
70 | rank = predict_list.index(i)
71 | sum_mrr.append(1/float(rank+1))
72 |
73 | # results need to be saved
74 | predict_20 = [int(k) for k in predict[0:20]]
75 | sim_20 = [sims[k] for k in predict_20]
76 |         test_sim_result.append(list(zip(predict_20, sim_20)))  # materialize for np.save
77 | test_rank_result.append(rank+1)
78 |
79 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
80 | save_path = args.data_path + 'result/'
81 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
82 | np.save(save_path+sim_result_filename, test_sim_result)
83 | np.save(save_path+rank_result_filename, test_rank_result)
84 |
85 |
86 | def parse_args():
87 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
88 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
89 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name')
90 | parser.add_argument('-d', '--dataset', type=str, default='github11', help='name of dataset.java, python')
91 | parser.add_argument('--reload_from', type=int, default=200, help='epoch to reload from')
92 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
93 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
94 | return parser.parse_args()
95 |
96 |
97 | if __name__ == '__main__':
98 | args = parse_args()
99 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
100 | config = getattr(configs, 'config_'+args.model)()
101 |
102 | ##### Define model ######
103 | logger.info('Constructing Model..')
104 | model = getattr(models, args.model)(config) # initialize the model
105 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
106 | model.load_state_dict(torch.load(ckpt, map_location=device))
107 |
108 | test(config, model, device)
109 |
110 |
111 |
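
The evaluation loop treats description i as the query and code i as its only relevant result; a compact, self-contained sketch of the same R@1/MRR bookkeeping on a toy 3x3 similarity matrix:

    import numpy as np

    sims = np.array([[0.9, 0.1, 0.3],    # query 0: its code is ranked 1st
                     [0.2, 0.4, 0.8],    # query 1: its code is ranked 2nd
                     [0.5, 0.6, 0.7]])   # query 2: its code is ranked 1st
    r1, mrr = [], []
    for i in range(sims.shape[0]):
        predict = np.argsort(-sims[i])       # candidate indices, best first
        rank = predict.tolist().index(i)     # 0-based rank of the ground truth
        r1.append(1.0 if rank == 0 else 0.0)
        mrr.append(1.0 / (rank + 1))
    print(np.mean(r1), np.mean(mrr))         # 0.666..., 0.833...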
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/user_study.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | from tqdm import tqdm
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format="%(message)s")
12 |
13 | import torch
14 |
15 | import models, configs, data_loader
16 | from modules import get_cosine_schedule_with_warmup
17 | from utils import similarity, normalize
18 | from data_loader import *
19 |
20 |
21 | def test(config, model, device):
22 | logger.info('Test Begin...')
23 |
24 | model.eval()
25 | model.to(device)
26 |
27 |
28 | data_path = args.data_path+args.dataset+'/'
29 |
30 | code_base_set = eval(config['dataset_name'])(data_path,
31 | config['all_name'], config['name_len'],
32 | config['all_tokens'], config['tokens_len'])
33 | code_data_loader = torch.utils.data.DataLoader(dataset=code_base_set, batch_size=32,
34 | shuffle=False, drop_last=False, num_workers=1)
35 |
36 | code_reprs = []
37 | code_processed = 0
38 | for batch in code_data_loader:
39 | # batch[0:4]: name, name_len, token, token_len
40 | code_batch = [tensor.to(device) for tensor in batch[:4]]
41 | with torch.no_grad():
42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
43 | code_repr = normalize(code_repr)
44 | code_reprs.append(code_repr)
45 | code_processed += batch[0].size(0) # +batch_size
46 | # code_reprs: [code_processed x n_hidden]
47 | code_reprs = np.vstack(code_reprs)
48 | print('processed code num: ', code_processed)
49 |
50 |
51 | query_desc_set = eval(config['dataset_name'])(data_path,
52 | f_descs=config['query_desc'], max_desc_len=config['desc_len'])
53 | desc_data_loader = torch.utils.data.DataLoader(dataset=query_desc_set, batch_size=32,
54 | shuffle=False, drop_last=False, num_workers=1)
55 |
56 | desc_reprs = []
57 | desc_processed = 0
58 | for batch in desc_data_loader:
59 | # batch[0:2]: good_desc, good_desc_len
60 | desc_batch = [tensor.to(device) for tensor in batch[0:2]]
61 | with torch.no_grad():
62 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hidden_size]
63 | desc_repr = normalize(desc_repr)
64 | desc_reprs.append(desc_repr)
65 | desc_processed += batch[0].size(0) # +batch_size
66 | # desc_reprs: [desc_processed x n_hidden]
67 | desc_reprs = np.vstack(desc_reprs)
68 | print('processed desc num: ', desc_processed)
69 |
70 |
71 | query_desc_index_file_path = data_path + args.query_desc_index_file
72 | desc_index = []
73 | with open(query_desc_index_file_path, 'r') as query_desc_index_file:
74 | lines = query_desc_index_file.readlines()
75 | for i in range(0, len(lines)):
76 | line = lines[i].strip()
77 | desc_index.append(int(line))
78 | print('desc_index: ', desc_index)
79 |
80 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
81 | test_sim_result, test_rank_result = [], []
82 | for i in tqdm(range(0, desc_processed)):
83 | ind = desc_index[i]
84 |
85 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
86 | sims = np.dot(code_reprs, desc_vec.T)[:, 0] # [code_processed]
87 | negsims = np.negative(sims)
88 | predict = np.argsort(negsims)
89 |
90 | # SuccessRate@k
91 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
92 |         sum_1.append(1.0 if ind in predict_1 else 0.0)
93 |         sum_5.append(1.0 if ind in predict_5 else 0.0)
94 |         sum_10.append(1.0 if ind in predict_10 else 0.0)
95 | # MRR
96 | predict_list = predict.tolist()
97 | rank = predict_list.index(ind)
98 | sum_mrr.append(1/float(rank+1))
99 |
100 | # results need to be saved
101 | predict_20 = [int(k) for k in predict[0:20]]
102 | sim_20 = [sims[k] for k in predict_20]
103 |         test_sim_result.append(list(zip(predict_20, sim_20)))  # materialize for np.save
104 | test_rank_result.append(rank+1)
105 |
106 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
107 | save_path = args.data_path + 'result/'
108 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
109 | np.save(save_path+sim_result_filename, test_sim_result)
110 | np.save(save_path+rank_result_filename, test_rank_result)
111 |
112 |
113 | def parse_args():
114 | parser = argparse.ArgumentParser("Test Code Search(Embedding) Model For User Study")
115 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
116 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name')
117 | parser.add_argument('-d', '--dataset', type=str, default='github_user_3', help='name of dataset.java, python')
118 | parser.add_argument('--query_desc_index_file', type=str, default='query.desc.index.txt')
119 | parser.add_argument('--reload_from', type=int, default=185, help='epoch to reload from')
120 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
121 | return parser.parse_args()
122 |
123 |
124 | if __name__ == '__main__':
125 | args = parse_args()
126 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
127 | config = getattr(configs, 'config_'+args.model)()
128 |
129 | ##### Define model ######
130 | logger.info('Constructing Model..')
131 |
132 | model = getattr(models, args.model)(config) # initialize the model
133 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
134 | model.load_state_dict(torch.load(ckpt, map_location=device))
135 |
136 | test(config, model, device)
137 |
138 |
139 |
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/util_desc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def make_shuffle_index(args):
10 | dir_path = args.data_path + args.dataset
11 | all_desc_file_path = dir_path + args.all_desc_file
12 | with open(all_desc_file_path, 'r') as all_desc_file:
13 | lines = all_desc_file.readlines()
14 | all_num = int(len(lines)/2)
15 |
16 | index = np.arange(all_num)
17 | np.random.seed(16)
18 | np.random.shuffle(index)
19 | print(len(index))
20 | np.save(args.shuffle_index_file, index)
21 |
22 | def split_desc_data(args):
23 | index = np.load(args.shuffle_index_file)
24 |
25 | dir_path = args.data_path + args.dataset
26 | all_desc_file_path = dir_path + args.all_desc_file
27 | train_desc_file_path = dir_path + args.train_desc_file
28 | test_desc_file_path = dir_path + args.test_desc_file
29 |
30 | input_desc = []
31 | with open(all_desc_file_path, 'r') as all_desc_file:
32 | lines = all_desc_file.readlines()
33 | for line in lines:
34 | if (line[0:10] != 'BeginFunc:'):
35 | input_desc.append(line)
36 | print('number of input desc:\n', len(input_desc))
37 |
38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file:
39 | for i in range(0, args.trainset_num):
40 | train_desc_file.write(input_desc[index[i]])
41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
42 | test_desc_file.write(input_desc[index[i]])
43 |
44 |
45 | def create_desc_dict_file(args):
46 | dir_path = args.data_path + args.dataset
47 | desc_file_path = dir_path + args.train_desc_file
48 |
49 | input_desc = []
50 | with open(desc_file_path, 'r') as desc_file:
51 | input_desc = desc_file.readlines()
52 | desc_words = []
53 | for i in range(0, len(input_desc)):
54 | input_desc[i] = input_desc[i].rstrip('\n')
55 | desc_word_list = input_desc[i].split()
56 | for desc_word in desc_word_list:
57 | desc_words.append(desc_word)
58 | vocab_desc_info = Counter(desc_words)
59 | print(len(vocab_desc_info))
60 |
61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]]
62 |     vocab_desc_index = {'<pad>': 0, '<unk>': 1}  # PAD_ID=0, UNK_ID=1
63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))]))
64 |
65 |
66 | vocab_desc_file_path = dir_path + args.vocab_desc_file
67 | desc_dic_str = json.dumps(vocab_desc_index)
68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file:
69 | vocab_desc_file.write(desc_dic_str)
70 |
71 |
72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
73 | phrases, indices = [], []
74 | with open(sent_file_path, 'r') as sent_file:
75 | sents = sent_file.readlines()
76 | vocab = json.loads(open(vocab_file_path, "r").readline())
77 | start_index = 0
78 | for i in range(0, len(sents)):
79 | sent = sents[i].rstrip('\n')
80 | word_list = sent.split()
81 | sent_len = min(len(word_list), maxlen)
82 | indices.append((sent_len, start_index))
83 | for j in range(0, sent_len):
84 | word = word_list[j]
85 | phrases.append(vocab.get(word, UNK_ID))
86 | start_index += sent_len
87 | output_file_path = sent_file_path[0:-3] + 'h5'
88 | output_file = h5py.File(output_file_path, 'w')
89 | output_file['phrases'] = phrases
90 | output_file['indices'] = indices
91 | output_file.close()
92 |
93 |
94 | def parse_args():
95 | parser = argparse.ArgumentParser("Parse Description data for TokenEmbedder")
96 |
97 | parser.add_argument('--data_path', type=str, default='./data/')
98 | parser.add_argument('--dataset', type=str, default='github11/')
99 |
100 | parser.add_argument('--origin_desc_file', type=str, default='origin.desc.txt')
101 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt')
102 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt')
103 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt')
104 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json')
105 |
106 | parser.add_argument('--trainset_num', type=int, default=39152)
107 | parser.add_argument('--testset_num', type=int, default=2000)
108 | parser.add_argument('--desc_word_num', type=int, default=10000)
109 | parser.add_argument('--desc_maxlen', type=int, default=30)
110 | parser.add_argument('--testset_start_index', type=int, default=39152)
111 |
112 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
113 |
114 | return parser.parse_args()
115 |
116 | if __name__ == '__main__':
117 |
118 | args = parse_args()
119 |
120 | #make_shuffle_index(args)
121 |
122 | #split_desc_data(args)
123 |
124 | create_desc_dict_file(args)
125 |
126 | dir_path = args.data_path + args.dataset
127 | # train.desc.txt -> train.desc.h5(and test...)
128 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
129 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
130 |
131 |
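
sents2indexes() stores every sentence in one flat 'phrases' array plus an 'indices' array of (length, start) pairs; a self-contained sketch of that layout with toy word ids (the file name here is only an example):

    import h5py

    phrases = [4, 7, 9, 3, 5]       # word ids of sentence 0 (3 words) + sentence 1 (2 words)
    indices = [(3, 0), (2, 3)]      # (sent_len, start_index) per sentence

    with h5py.File('toy.desc.h5', 'w') as f:
        f['phrases'] = phrases
        f['indices'] = indices

    with h5py.File('toy.desc.h5', 'r') as f:
        length, start = f['indices'][1]
        print(list(f['phrases'][start:start + length]))   # [3, 5]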
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/util_tok.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def split_token_data(args):
10 | index = np.load(args.shuffle_index_file)
11 |
12 | dir_path = args.data_path + args.dataset
13 | all_token_file_path = dir_path + args.all_token_file
14 | train_token_file_path = dir_path + args.train_token_file
15 | test_token_file_path = dir_path + args.test_token_file
16 |
17 | input_token = []
18 | with open(all_token_file_path, 'r') as all_token_file:
19 | lines = all_token_file.readlines()
20 | for line in lines:
21 | if (line[0:10] != 'BeginFunc:'):
22 | input_token.append(line)
23 | print('number of input token:\n', len(input_token))
24 |
25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file:
26 | for i in range(0, args.trainset_num):
27 | train_token_file.write(input_token[index[i]])
28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
29 | test_token_file.write(input_token[index[i]])
30 |
31 |
32 | def create_token_dict_file(args):
33 | dir_path = args.data_path + args.dataset
34 | token_file_path = dir_path + args.train_token_file
35 |
36 | input_token = []
37 | with open(token_file_path, 'r') as token_file:
38 | input_token = token_file.readlines()
39 | token_words = []
40 | for i in range(0, len(input_token)):
41 | input_token[i] = input_token[i].rstrip('\n')
42 | token_word_list = input_token[i].split()
43 | for token_word in token_word_list:
44 | token_words.append(token_word)
45 | vocab_token_info = Counter(token_words)
46 | print(len(vocab_token_info))
47 |
48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]]
49 |     vocab_token_index = {'<pad>': 0, '<unk>': 1}  # PAD_ID=0, UNK_ID=1
50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))]))
51 |
52 |
53 | vocab_token_file_path = dir_path + args.vocab_token_file
54 | token_dic_str = json.dumps(vocab_token_index)
55 | with open(vocab_token_file_path, 'w') as vocab_token_file:
56 | vocab_token_file.write(token_dic_str)
57 |
58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
59 | phrases, indices = [], []
60 | with open(sent_file_path, 'r') as sent_file:
61 | sents = sent_file.readlines()
62 | vocab = json.loads(open(vocab_file_path, "r").readline())
63 | start_index = 0
64 | for i in range(0, len(sents)):
65 | sent = sents[i].rstrip('\n')
66 | word_list = sent.split()
67 | sent_len = min(len(word_list), maxlen)
68 | indices.append((sent_len, start_index))
69 | for j in range(0, sent_len):
70 | word = word_list[j]
71 | phrases.append(vocab.get(word, UNK_ID))
72 | start_index += sent_len
73 | output_file_path = sent_file_path[0:-3] + 'h5'
74 | output_file = h5py.File(output_file_path, 'w')
75 | output_file['phrases'] = phrases
76 | output_file['indices'] = indices
77 | output_file.close()
78 |
79 | def remove_dup_tokens(args):
80 | dir_path = args.data_path + args.dataset
81 | origin_token_file_path = dir_path + args.origin_token_file
82 | all_token_file_path = dir_path + args.all_token_file
83 |
84 | with open(origin_token_file_path, 'r') as origin_token_file, open(all_token_file_path, 'w') as all_token_file:
85 | lines = origin_token_file.readlines()
86 | for i in range(0, len(lines)):
87 | if lines[i][0:10] != 'BeginFunc:':
88 | line = lines[i].strip()
89 | words = line.split()
90 | new_words = list(set(words))
91 | new_line = ' '.join(new_words)
92 | all_token_file.write(new_line + '\n')
93 |
94 |
95 | def parse_args():
96 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder")
97 |
98 | parser.add_argument('--data_path', type=str, default='./data/')
99 | parser.add_argument('--dataset', type=str, default='github_user_3/')
100 |
101 | parser.add_argument('--origin_token_file', type=str, default='origin.token.txt')
102 | parser.add_argument('--all_token_file', type=str, default='all.token.txt')
103 | parser.add_argument('--train_token_file', type=str, default='train.token.txt')
104 | parser.add_argument('--test_token_file', type=str, default='test.token.txt')
105 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json')
106 |
107 | parser.add_argument('--trainset_num', type=int, default=39152)
108 | parser.add_argument('--testset_num', type=int, default=2000)
109 | parser.add_argument('--token_word_num', type=int, default=10000)
110 | parser.add_argument('--token_maxlen', type=int, default=50)
111 | parser.add_argument('--testset_start_index', type=int, default=39152)
112 |
113 |
114 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
115 |
116 | return parser.parse_args()
117 |
118 | if __name__ == '__main__':
119 | args = parse_args()
120 | '''
121 | dir_path = args.data_path + args.dataset
122 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file:
123 | lines = in_file.readlines()
124 | for i in range(0, len(lines)):
125 | if lines[i][0:10] != 'BeginFunc:':
126 | out_file.write(lines[i])
127 | '''
128 | remove_dup_tokens(args)
129 |
130 | #split_token_data(args)
131 | #create_token_dict_file(args)
132 |
133 | dir_path = args.data_path + args.dataset
134 | sents2indexes(dir_path+args.all_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
135 |
136 | '''
137 | dir_path = args.data_path + args.dataset
138 | # train.token.txt -> train.token.h5(and test...)
139 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
140 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
141 | '''
142 |
143 | '''
144 | dir_path = args.data_path + args.dataset
145 | all_token_file_path = dir_path + args.all_token_file
146 | with open(all_token_file_path, 'r') as all_token_file:
147 | lines = all_token_file.readlines()
148 | print(len(lines))
149 | for i in range(0, len(lines)):
150 | line = lines[i]
151 | if line[0:10] != 'BeginFunc:':
152 | words = line.split()
153 | if len(words) == 0:
154 | print(lines[i-1])
155 | #print(lines[i])
156 | '''
157 |
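
remove_dup_tokens() keeps each token of a function only once before the bag-of-words vocabulary is built; a tiny sketch of the per-line transformation (set() does not preserve order, which the BOW encoder does not need):

    line = "open file read file close file"
    new_line = ' '.join(set(line.split()))
    print(sorted(new_line.split()))   # ['close', 'file', 'open', 'read']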
--------------------------------------------------------------------------------
/Baseline methods/DeepCS/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T)
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 |     elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 |     return '%d:%02d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 | '''
64 | import nltk
65 | try: nltk.word_tokenize("hello world")
66 | except LookupError: nltk.download('punkt')
67 |
68 | def sent2indexes(sentence, vocab, maxlen):
69 |
70 | def convert_sent(sent, vocab, maxlen):
71 | idxes = np.zeros(maxlen, dtype=np.int64)
72 | idxes.fill(PAD_ID)
73 | tokens = nltk.word_tokenize(sent.strip())
74 | idx_len = min(len(tokens), maxlen)
75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID)
76 | return idxes, idx_len
77 | if type(sentence) is list:
78 | inds, lens = [], []
79 | for sent in sentence:
80 | idxes, idx_len = convert_sent(sent, vocab, maxlen)
81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len])
82 | inds.append(idxes)
83 | lens.append(idx_len)
84 | return np.vstack(inds), np.vstack(lens)
85 | else:
86 | inds, lens = sent2indexes([sentence], vocab, maxlen)
87 | return inds[0], lens[0]
88 | '''
89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
90 | '''indexes: numpy array'''
91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
92 | indexes=filter(lambda i: i!=ignore_tok, indexes)
93 | toks, length = [], 0
94 | for idx in indexes:
95 |             toks.append(ivocab.get(idx, '<unk>'))
96 | length+=1
97 | return ' '.join(toks), length
98 |
99 | ivocab = {v: k for k, v in vocab.items()}
100 | if indexes.ndim==1:# one sentence
101 | return revert_sent(indexes, ivocab, ignore_tok)
102 | else:# dim>1
103 | sentences, lens =[], [] # a batch of sentences
104 | for inds in indexes:
105 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
106 | sentences.append(sentence)
107 | lens.append(length)
108 | return sentences, lens
109 |
110 | ########################################################################
111 |
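
A minimal sketch of indexes2sent() with a toy vocabulary, assuming this utils.py is importable; it shows how padding ids are dropped when decoding:

    import numpy as np
    from utils import indexes2sent

    vocab = {'<pad>': 0, '<unk>': 1, 'read': 2, 'file': 3}
    indexes = np.array([2, 3, 0, 0])          # "read file" followed by padding
    sentence, length = indexes2sent(indexes, vocab)
    print(sentence, length)                   # read file 2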
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/__pycache__/configs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/configs.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/__pycache__/util_cfg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/util_cfg.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/__pycache__/util_dfg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/util_dfg.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_CFGEmbeder():
3 | conf = {
4 | # added_params
5 |         'transform_every_modal': 0, # optionally apply an extra transformation to each modality's representation
6 | 'save_attn_weight': 0,
7 | 'use_tanh': 1,
8 | 'use_attn': 1,
9 | 'use_desc_attn': 1,
10 |
11 | # GGNN
12 | 'state_dim': 512, # GGNN hidden state size
13 | 'annotation_dim': 5,
14 | 'n_edge_types': 1,
15 |         'n_node': 150, # maximum number of CFG nodes per graph (can be smaller than 512)
16 |         'n_steps': 5, # number of GGNN propagation steps
17 | 'output_type': 'no_reduce',
18 | 'batch_size': 32,
19 | 'n_layers': 1,
20 | 'n_hidden': 512,
21 | 'cfg_attn_mode': 'sigmoid_scalar',
22 |
23 | # data_params
24 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
25 | #training data
26 | 'train_token':'train.token.h5',
27 | 'train_dfg':'train.dfg.txt',
28 | 'train_cfg':'train.cfg.txt',
29 | 'train_desc':'train.desc.h5',
30 | # test data
31 | 'test_token':'test.token.h5',
32 | 'test_dfg':'test.dfg.txt',
33 | 'test_cfg':'test.cfg.txt',
34 | 'test_desc':'test.desc.h5',
35 | #vocabulary info
36 | 'vocab_desc':'vocab.desc.json',
37 | 'vocab_token':'vocab.token.json',
38 |
39 | #parameters
40 | 'desc_len': 30,
41 | 'tok_len': 100,
42 |         'n_desc_words': 10000, # to be decided from the vocabulary size
43 | 'n_token_words': 20000,
44 |
45 | #training_params
46 | 'nb_epoch': 200,
47 | #'optimizer': 'adam',
48 | 'learning_rate':0.0003, # try 1e-4(paper)
49 | 'adam_epsilon':1e-8,
50 | 'warmup_steps':5000,
51 | 'fp16': False,
52 |         'fp16_opt_level': 'O1', # For fp16: Apex AMP optimization level, one of ['O0', 'O1', 'O2', 'O3'].
53 |         # See details at https://nvidia.github.io/apex/amp.html
54 |
55 | # model_params
56 | 'emb_size': 300,
57 | # recurrent
58 | 'margin': 0.6,
59 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf
60 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization.
61 | 'dropout': 0.1
62 | }
63 | return conf
64 |
65 |
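
The CFG branch pads every graph to n_node nodes before it enters the GGNN; the exact tensor layout lives in data_loader.py/modules.py (not shown in this section), so the following is only an illustrative sketch of zero-padding a small adjacency matrix and masking the real nodes:

    import numpy as np

    n_node = 150                         # from the config above
    real_n = 4                           # nodes in one sample CFG
    adj_small = np.eye(real_n, k=1)      # toy chain CFG: 0 -> 1 -> 2 -> 3

    adj = np.zeros((n_node, n_node), dtype=np.float32)
    adj[:real_n, :real_n] = adj_small    # copy the real edges into the padded matrix
    node_mask = np.zeros(n_node, dtype=np.float32)
    node_mask[:real_n] = 1.0             # 1 for real nodes, 0 for padding
    print(int(adj.sum()), int(node_mask.sum()))   # 3 4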
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/data/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .cfgemb import CFGEmbeder
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/models/__pycache__/cfgemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/cfgemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/output/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/shuffle_index.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN(TDC)/shuffle_index.npy
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | from tqdm import tqdm
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format="%(message)s")
12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package
13 |
14 | import torch
15 |
16 | import models, configs, data_loader
17 | from modules import get_cosine_schedule_with_warmup
18 | from utils import similarity, normalize
19 | from data_loader import *
20 |
21 |
22 | def test(config, model, device):
23 | logger.info('Test Begin...')
24 |
25 | model.eval()
26 | model.to(device)
27 |
28 | # load data
29 | data_path = args.data_path+args.dataset+'/'
30 | test_set = eval(config['dataset_name'])(config, data_path,
31 | config['test_cfg'], config['n_node'],
32 | config['test_desc'], config['desc_len'])
33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32,
34 | shuffle=False, drop_last=False, num_workers=1)
35 | # encode tokens and descs
36 | code_reprs, desc_reprs = [], []
37 | n_processed = 0
38 | for batch in data_loader:
39 | # batch[0:3]: init_input, adjmat, node_mask
40 | code_batch = [tensor.to(device) for tensor in batch[:3]]
41 | # batch[3:5]: good_desc, good_desc_len
42 | desc_batch = [tensor.to(device) for tensor in batch[3:5]]
43 | with torch.no_grad():
44 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
45 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
46 | # normalize when sim_measure=='cos'
47 | code_repr = normalize(code_repr)
48 | desc_repr = normalize(desc_repr)
49 | code_reprs.append(code_repr)
50 | desc_reprs.append(desc_repr)
51 | n_processed += batch[0].size(0) # +batch_size
52 | # code_reprs: [n_processed x n_hidden]
53 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
54 |
55 | # calculate similarity
56 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
57 | test_sim_result, test_rank_result = [], []
58 | for i in tqdm(range(0, n_processed)):
59 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
60 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
61 | negsims = np.negative(sims)
62 | predict = np.argsort(negsims)
63 |
64 | # SuccessRate@k
65 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
66 |         sum_1.append(1.0 if i in predict_1 else 0.0)
67 |         sum_5.append(1.0 if i in predict_5 else 0.0)
68 |         sum_10.append(1.0 if i in predict_10 else 0.0)
69 | # MRR
70 | predict_list = predict.tolist()
71 | rank = predict_list.index(i)
72 | sum_mrr.append(1/float(rank+1))
73 |
74 | # results need to be saved
75 | predict_20 = [int(k) for k in predict[0:20]]
76 | sim_20 = [sims[k] for k in predict_20]
77 |         test_sim_result.append(list(zip(predict_20, sim_20)))  # materialize for np.save
78 | test_rank_result.append(rank+1)
79 |
80 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
81 | save_path = args.data_path + 'result/'
82 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
83 | np.save(save_path+sim_result_filename, test_sim_result)
84 | np.save(save_path+rank_result_filename, test_rank_result)
85 |
86 |
87 | def parse_args():
88 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
89 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
90 | parser.add_argument('--model', type=str, default='CFGEmbeder', help='model name')
91 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python')
92 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from')
93 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\
94 | 'Note: should be consistent with the same argument in the repr_code.py')
95 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
96 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
97 | return parser.parse_args()
98 |
99 |
100 | if __name__ == '__main__':
101 | args = parse_args()
102 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
103 | config = getattr(configs, 'config_'+args.model)()
104 |
105 | ##### Define model ######
106 | logger.info('Constructing Model..')
107 | model = getattr(models, args.model)(config) # initialize the model
108 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
109 | model.load_state_dict(torch.load(ckpt, map_location=device))
110 |
111 | test(config, model, device)
112 |
113 |
114 |
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/util_desc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 | from utils import UNK_ID
7 |
8 | def make_shuffle_index_num(args, all_num):
9 | index = np.arange(all_num)
10 | np.random.seed(16)
11 | np.random.shuffle(index)
12 | print('index:\n', index)
13 | np.save(args.shuffle_index_file, index)
14 |
15 | def make_shuffle_index(args):
16 | dir_path = args.data_path + args.dataset
17 | all_desc_file_path = dir_path + args.all_desc_file
18 | with open(all_desc_file_path, 'r') as all_desc_file:
19 | lines = all_desc_file.readlines()
20 | all_num = int(len(lines)/2)
21 | print('all_num of desc:\n', all_num)
22 |
23 | index = np.arange(all_num)
24 | np.random.seed(16)
25 | np.random.shuffle(index)
26 | print('index:\n', index)
27 | np.save(args.shuffle_index_file, index)
28 |
29 | def split_data(args):
30 | index = np.load(args.shuffle_index_file)
31 |
32 | dir_path = args.data_path + args.dataset
33 | all_desc_file_path = dir_path + args.all_desc_file
34 | train_desc_file_path = dir_path + args.train_desc_file
35 | test_desc_file_path = dir_path + args.test_desc_file
36 |
37 | input_desc = []
38 | with open(all_desc_file_path, 'r') as all_desc_file:
39 | lines = all_desc_file.readlines()
40 | for line in lines:
41 | if (line[0:10] != 'BeginFunc:'):
42 | input_desc.append(line)
43 | print('number of input desc:\n', len(input_desc))
44 |
45 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file:
46 | for i in range(0, args.trainset_num):
47 | train_desc_file.write(input_desc[index[i]])
48 | for i in range(32000, 32000+args.testset_num):
49 | test_desc_file.write(input_desc[index[i]])
50 |
51 |
52 | def create_dict_file(args):
53 | dir_path = args.data_path + args.dataset
54 | desc_file_path = dir_path + args.train_desc_file
55 |
56 | input_desc = []
57 | with open(desc_file_path, 'r') as desc_file:
58 | input_desc = desc_file.readlines()
59 | desc_words = []
60 | for i in range(0, len(input_desc)):
61 | input_desc[i] = input_desc[i].rstrip('\n')
62 | desc_word_list = input_desc[i].split()
63 | for desc_word in desc_word_list:
64 | desc_words.append(desc_word)
65 | vocab_desc_info = Counter(desc_words)
66 | print(len(vocab_desc_info))
67 |
68 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]]
69 |     vocab_desc_index = {'<pad>': 0, '<unk>': 1}  # PAD_ID=0, UNK_ID=1
70 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))]))
71 |
72 |
73 | vocab_desc_file_path = dir_path + args.vocab_desc_file
74 | desc_dic_str = json.dumps(vocab_desc_index)
75 | with open(vocab_desc_file_path, 'w') as vocab_desc_file:
76 | vocab_desc_file.write(desc_dic_str)
77 |
78 |
79 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
80 | phrases, indices = [], []
81 | with open(sent_file_path, 'r') as sent_file:
82 | sents = sent_file.readlines()
83 | vocab = json.loads(open(vocab_file_path, "r").readline())
84 | start_index = 0
85 | for i in range(0, len(sents)):
86 | sent = sents[i].rstrip('\n')
87 | word_list = sent.split()
88 | sent_len = min(len(word_list), maxlen)
89 | indices.append((sent_len, start_index))
90 | for j in range(0, sent_len):
91 | word = word_list[j]
92 | phrases.append(vocab.get(word, UNK_ID))
93 | start_index += sent_len
94 | output_file_path = sent_file_path[0:-3] + 'h5'
95 | output_file = h5py.File(output_file_path, 'w')
96 | output_file['phrases'] = phrases
97 | output_file['indices'] = indices
98 | output_file.close()
99 |
100 |
101 | def parse_args():
102 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder")
103 |
104 | parser.add_argument('--data_path', type=str, default='./data/')
105 | parser.add_argument('--dataset', type=str, default='github/')
106 |
107 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt')
108 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt')
109 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt')
110 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json')
111 |
112 | parser.add_argument('--trainset_num', type=int, default=32000)
113 | parser.add_argument('--testset_num', type=int, default=1000)
114 | parser.add_argument('--desc_word_num', type=int, default=10000)
115 | parser.add_argument('--desc_maxlen', type=int, default=30)
116 |
117 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
118 |
119 | return parser.parse_args()
120 |
121 | if __name__ == '__main__':
122 | args = parse_args()
123 |
124 | #make_shuffle_index_num(args, 33000)
125 | #split_data(args)
126 | #create_dict_file(args)
127 |
128 | dir_path = args.data_path + args.dataset
129 | # train.desc.txt -> train.desc.h5(and test...)
130 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
131 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
132 |
133 |
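The .h5 file written by sents2indexes stores a flat 'phrases' array of word ids plus an 'indices' table of (sent_len, start_index) pairs. A minimal read-back sketch (the path below is just the default train file and is only an example):

import h5py

with h5py.File('./data/github/train.desc.h5', 'r') as f:
    phrases = f['phrases'][:]          # all word ids, concatenated
    indices = f['indices'][:]          # one (sent_len, start_index) row per description
sent_len, start = indices[0]
first_desc_ids = phrases[start:start + sent_len]   # ids of the first description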
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/util_tok.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def split_token_data(args):
10 | index = np.load(args.shuffle_index_file)
11 |
12 | dir_path = args.data_path + args.dataset
13 | all_token_file_path = dir_path + args.all_token_file
14 | train_token_file_path = dir_path + args.train_token_file
15 | test_token_file_path = dir_path + args.test_token_file
16 |
17 | input_token = []
18 | with open(all_token_file_path, 'r') as all_token_file:
19 | lines = all_token_file.readlines()
20 | for line in lines:
21 | if (line[0:10] != 'BeginFunc:'):
22 | input_token.append(line)
23 | print('number of input token:\n', len(input_token))
24 |
25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file:
26 | for i in range(0, args.trainset_num):
27 | train_token_file.write(input_token[index[i]])
28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
29 | test_token_file.write(input_token[index[i]])
30 |
31 |
32 | def create_token_dict_file(args):
33 | dir_path = args.data_path + args.dataset
34 | token_file_path = dir_path + args.train_token_file
35 |
36 | input_token = []
37 | with open(token_file_path, 'r') as token_file:
38 | input_token = token_file.readlines()
39 | token_words = []
40 | for i in range(0, len(input_token)):
41 | input_token[i] = input_token[i].rstrip('\n')
42 | token_word_list = input_token[i].split()
43 | for token_word in token_word_list:
44 | token_words.append(token_word)
45 | vocab_token_info = Counter(token_words)
46 | print(len(vocab_token_info))
47 |
48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]]
49 | vocab_token_index = {'<pad>': 0, '<unk>': 1}  # ids 0 and 1 are reserved (PAD_ID, UNK_ID)
50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))]))
51 |
52 |
53 | vocab_token_file_path = dir_path + args.vocab_token_file
54 | token_dic_str = json.dumps(vocab_token_index)
55 | with open(vocab_token_file_path, 'w') as vocab_token_file:
56 | vocab_token_file.write(token_dic_str)
57 |
58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
59 | phrases, indices = [], []
60 | with open(sent_file_path, 'r') as sent_file:
61 | sents = sent_file.readlines()
62 | vocab = json.loads(open(vocab_file_path, "r").readline())
63 | start_index = 0
64 | for i in range(0, len(sents)):
65 | sent = sents[i].rstrip('\n')
66 | word_list = sent.split()
67 | sent_len = min(len(word_list), maxlen)
68 | indices.append((sent_len, start_index))
69 | for j in range(0, sent_len):
70 | word = word_list[j]
71 | phrases.append(vocab.get(word, UNK_ID))
72 | start_index += sent_len
73 | output_file_path = sent_file_path[0:-3] + 'h5'
74 | output_file = h5py.File(output_file_path, 'w')
75 | output_file['phrases'] = phrases
76 | output_file['indices'] = indices
77 | output_file.close()
78 |
79 | def parse_args():
80 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder")
81 |
82 | parser.add_argument('--data_path', type=str, default='./data/')
83 | parser.add_argument('--dataset', type=str, default='github11/')
84 |
85 | parser.add_argument('--all_token_file', type=str, default='all.token.txt')
86 | parser.add_argument('--train_token_file', type=str, default='train.token.txt')
87 | parser.add_argument('--test_token_file', type=str, default='test.token.txt')
88 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json')
89 |
90 | parser.add_argument('--trainset_num', type=int, default=39152)
91 | parser.add_argument('--testset_num', type=int, default=2000)
92 | parser.add_argument('--token_word_num', type=int, default=20000)
93 | parser.add_argument('--token_maxlen', type=int, default=100)
94 | parser.add_argument('--testset_start_index', type=int, default=39152)
95 |
96 |
97 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
98 |
99 | return parser.parse_args()
100 |
101 | if __name__ == '__main__':
102 | args = parse_args()
103 | '''
104 | dir_path = args.data_path + args.dataset
105 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file:
106 | lines = in_file.readlines()
107 | for i in range(0, len(lines)):
108 | if lines[i][0:10] != 'BeginFunc:':
109 | out_file.write(lines[i])
110 | '''
111 |
112 | split_token_data(args)
113 | create_token_dict_file(args)
114 |
115 | dir_path = args.data_path + args.dataset
116 | # train.token.txt -> train.token.h5(and test...)
117 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
118 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
119 |
120 |
121 | '''
122 | dir_path = args.data_path + args.dataset
123 | all_token_file_path = dir_path + args.all_token_file
124 | with open(all_token_file_path, 'r') as all_token_file:
125 | lines = all_token_file.readlines()
126 | print(len(lines))
127 | for i in range(0, len(lines)):
128 | line = lines[i]
129 | if line[0:10] != 'BeginFunc:':
130 | words = line.split()
131 | if len(words) == 0:
132 | print(lines[i-1])
133 | #print(lines[i])
134 | '''
135 |
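For reference, a minimal sketch of how a single token line is mapped to ids with the saved vocabulary, mirroring the lookup inside sents2indexes (the path uses the defaults above and the input line is made up):

import json

UNK_ID = 1
vocab = json.load(open('./data/github11/vocab.token.json'))
line = 'buffer alloc copy length'                          # made-up token line
ids = [vocab.get(w, UNK_ID) for w in line.split()[:100]]   # truncated to token_maxlen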
--------------------------------------------------------------------------------
/Baseline methods/MMAN(TDC)/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T)
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 | elif measure in ['euc', 'gesd', 'aesd']: # https://arxiv.org/pdf/1508.01585.pdf
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 | return '%d:%d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 |
64 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
65 | '''indexes: numpy array'''
66 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
67 | indexes=filter(lambda i: i!=ignore_tok, indexes)
68 | toks, length = [], 0
69 | for idx in indexes:
70 | toks.append(ivocab.get(idx, '<unk>'))
71 | length+=1
72 | return ' '.join(toks), length
73 |
74 | ivocab = {v: k for k, v in vocab.items()}
75 | if indexes.ndim==1:# one sentence
76 | return revert_sent(indexes, ivocab, ignore_tok)
77 | else:# dim>1
78 | sentences, lens =[], [] # a batch of sentences
79 | for inds in indexes:
80 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
81 | sentences.append(sentence)
82 | lens.append(length)
83 | return sentences, lens
84 |
85 | ########################################################################
86 |
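A toy check of the 'cos' branch of similarity() with made-up 2-D vectors (assumes this module is importable as utils); as in test.py, the second argument is a single query description of shape [1 x d], and one score per code vector comes back:

import numpy as np
from utils import similarity

code_vecs = np.array([[1.0, 0.0], [1.0, 1.0]])
desc_vec = np.array([[1.0, 0.0]])                      # one query description
print(similarity(code_vecs, desc_vec, measure='cos'))  # approx [1.0, 0.7071]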
--------------------------------------------------------------------------------
/Baseline methods/MMAN/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN/__pycache__/configs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/configs.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/__pycache__/util_cfg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/util_cfg.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_MultiEmbeder():
3 | conf = {
4 | # GGNN
5 | 'state_dim': 512, # GGNN hidden state size
6 | 'annotation_dim': 5,
7 | 'n_edge_types': 2,
8 | 'n_node': 200, # maximum number of nodes kept per graph (may be less than 512)
9 | 'n_steps': 5, # number of GGNN propagation steps
10 | 'output_type': 'no_reduce',
11 | 'batch_size': 32,
12 | 'n_layers': 1,
13 | 'n_hidden': 512,
14 | 'cfg_attn_mode': 'sigmoid_scalar',
15 |
16 | # TreeLSTM
17 | 'treelstm_cell_type': 'nary', # nary or childsum
18 | 'n_ast_words': 50000,
19 |
20 | # Token and Description
21 | 'desc_len': 30,
22 | 'tok_len': 100,
23 | 'n_desc_words': 10000,
24 | 'n_token_words': 25000,
25 |
26 | # data_params
27 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
28 | # training data
29 | 'train_token':'train.token.h5',
30 | 'train_ast':'train.ast.json',
31 | 'train_cfg':'train.cfg.txt',
32 | 'train_desc':'train.desc.h5',
33 | # test data
34 | 'test_token':'test.token.h5',
35 | 'test_ast':'test.ast.json',
36 | 'test_cfg':'test.cfg.txt',
37 | 'test_desc':'test.desc.h5',
38 | # vocabulary info
39 | 'vocab_token':'vocab.token.json',
40 | 'vocab_ast':'vocab.ast.json',
41 | 'vocab_desc':'vocab.desc.json',
42 |
43 | # model_params
44 | 'emb_size': 300,
45 | # recurrent
46 | 'margin': 0.6,
47 | 'sim_measure':'cos',
48 | 'dropout': 0.1,
49 |
50 |
51 | # training_params
52 | 'nb_epoch': 200,
53 | #'optimizer': 'adamW',
54 | 'learning_rate':0.0003, # try 1e-4(paper)
55 | 'adam_epsilon':1e-8,
56 | 'warmup_steps':5000,
57 | 'fp16': False,
58 | 'fp16_opt_level': 'O1' #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].
59 | #"See details at https://nvidia.github.io/apex/amp.html"
60 |
61 |
62 | }
63 | return conf
64 |
65 |
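train.py and test.py fetch this dict by name via getattr(configs, 'config_' + args.model)(), so inspecting or overriding a field before building the model can look like the sketch below (the override is only an example):

import configs

conf = getattr(configs, 'config_MultiEmbeder')()
conf['batch_size'] = 64        # example override
print(conf['n_hidden'], conf['sim_measure'])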
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__init__.py
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__pycache__/util_ast.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_ast.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__pycache__/util_cfg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_cfg.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__pycache__/util_desc.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_desc.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__pycache__/util_tok.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/util_tok.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/data_prepare/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/util_desc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def make_shuffle_index(args):
10 | dir_path = args.data_path + args.dataset
11 | all_desc_file_path = dir_path + args.all_desc_file
12 | with open(all_desc_file_path, 'r') as all_desc_file:
13 | lines = all_desc_file.readlines()
14 | all_num = int(len(lines)/2)
15 |
16 | index = np.arange(all_num)
17 | np.random.seed(16)
18 | np.random.shuffle(index)
19 | #print(index)
20 | np.save(args.shuffle_index_file, index)
21 |
22 | def split_desc_data(args):
23 | index = np.load(args.shuffle_index_file)
24 |
25 | dir_path = args.data_path + args.dataset
26 | all_desc_file_path = dir_path + args.all_desc_file
27 | train_desc_file_path = dir_path + args.train_desc_file
28 | test_desc_file_path = dir_path + args.test_desc_file
29 |
30 | input_desc = []
31 | with open(all_desc_file_path, 'r') as all_desc_file:
32 | lines = all_desc_file.readlines()
33 | for line in lines:
34 | if (line[0:10] != 'BeginFunc:'):
35 | input_desc.append(line)
36 | print('number of input desc:\n', len(input_desc))
37 |
38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file:
39 | for i in range(0, args.trainset_num):
40 | train_desc_file.write(input_desc[index[i]])
41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
42 | test_desc_file.write(input_desc[index[i]])
43 |
44 |
45 | def create_desc_dict_file(args):
46 | dir_path = args.data_path + args.dataset
47 | desc_file_path = dir_path + args.train_desc_file
48 |
49 | input_desc = []
50 | with open(desc_file_path, 'r') as desc_file:
51 | input_desc = desc_file.readlines()
52 | desc_words = []
53 | for i in range(0, len(input_desc)):
54 | input_desc[i] = input_desc[i].rstrip('\n')
55 | desc_word_list = input_desc[i].split()
56 | for desc_word in desc_word_list:
57 | desc_words.append(desc_word)
58 | vocab_desc_info = Counter(desc_words)
59 | print(len(vocab_desc_info))
60 |
61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]]
62 | vocab_desc_index = {'<pad>': 0, '<unk>': 1}  # ids 0 and 1 are reserved (PAD_ID, UNK_ID)
63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))]))
64 |
65 |
66 | vocab_desc_file_path = dir_path + args.vocab_desc_file
67 | desc_dic_str = json.dumps(vocab_desc_index)
68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file:
69 | vocab_desc_file.write(desc_dic_str)
70 |
71 |
72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
73 | phrases, indices = [], []
74 | with open(sent_file_path, 'r') as sent_file:
75 | sents = sent_file.readlines()
76 | vocab = json.loads(open(vocab_file_path, "r").readline())
77 | start_index = 0
78 | for i in range(0, len(sents)):
79 | sent = sents[i].rstrip('\n')
80 | word_list = sent.split()
81 | sent_len = min(len(word_list), maxlen)
82 | indices.append((sent_len, start_index))
83 | for j in range(0, sent_len):
84 | word = word_list[j]
85 | phrases.append(vocab.get(word, UNK_ID))
86 | start_index += sent_len
87 | output_file_path = sent_file_path[0:-3] + 'h5'
88 | output_file = h5py.File(output_file_path, 'w')
89 | output_file['phrases'] = phrases
90 | output_file['indices'] = indices
91 | output_file.close()
92 |
93 | '''
94 | def parse_args():
95 | parser = argparse.ArgumentParser("Parse Description data for CFGEmbedder")
96 |
97 | parser.add_argument('--data_path', type=str, default='./data/')
98 | parser.add_argument('--dataset', type=str, default='example/')
99 |
100 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt')
101 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt')
102 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt')
103 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json')
104 |
105 | parser.add_argument('--trainset_num', type=int, default=12)
106 | parser.add_argument('--testset_num', type=int, default=1000)
107 | parser.add_argument('--desc_word_num', type=int, default=50)
108 | parser.add_argument('--desc_maxlen', type=int, default=50)
109 | parser.add_argument('--testset_start_index', type=int, default=33000)
110 |
111 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
112 |
113 | return parser.parse_args()
114 |
115 | if __name__ == '__main__':
116 |
117 | args = parse_args()
118 |
119 | #make_shuffle_index(args)
120 | #split_data(args)
121 | create_desc_dict_file(args)
122 |
123 |
124 | dir_path = args.data_path + args.dataset
125 | # train.desc.txt -> train.desc.h5(and test...)
126 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
127 | #sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
128 | '''
129 |
--------------------------------------------------------------------------------
/Baseline methods/MMAN/data_prepare/util_tok.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def split_token_data(args):
10 | index = np.load(args.shuffle_index_file)
11 |
12 | dir_path = args.data_path + args.dataset
13 | all_token_file_path = dir_path + args.all_token_file
14 | train_token_file_path = dir_path + args.train_token_file
15 | test_token_file_path = dir_path + args.test_token_file
16 |
17 | input_token = []
18 | with open(all_token_file_path, 'r') as all_token_file:
19 | lines = all_token_file.readlines()
20 | for line in lines:
21 | if (line[0:10] != 'BeginFunc:'):
22 | input_token.append(line)
23 | print('number of input token:\n', len(input_token))
24 |
25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file:
26 | for i in range(0, args.trainset_num):
27 | train_token_file.write(input_token[index[i]])
28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
29 | test_token_file.write(input_token[index[i]])
30 |
31 |
32 | def create_token_dict_file(args):
33 | dir_path = args.data_path + args.dataset
34 | token_file_path = dir_path + args.train_token_file
35 |
36 | input_token = []
37 | with open(token_file_path, 'r') as token_file:
38 | input_token = token_file.readlines()
39 | token_words = []
40 | for i in range(0, len(input_token)):
41 | input_token[i] = input_token[i].rstrip('\n')
42 | token_word_list = input_token[i].split()
43 | for token_word in token_word_list:
44 | token_words.append(token_word)
45 | vocab_token_info = Counter(token_words)
46 | print(len(vocab_token_info))
47 |
48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]]
49 | vocab_token_index = {'':0, '':1}
50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))]))
51 |
52 |
53 | vocab_token_file_path = dir_path + args.vocab_token_file
54 | token_dic_str = json.dumps(vocab_token_index)
55 | with open(vocab_token_file_path, 'w') as vocab_token_file:
56 | vocab_token_file.write(token_dic_str)
57 |
58 |
59 | '''
60 | def parse_args():
61 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder")
62 |
63 | parser.add_argument('--data_path', type=str, default='./data/')
64 | parser.add_argument('--dataset', type=str, default='example/')
65 |
66 | parser.add_argument('--all_token_file', type=str, default='all.token.txt')
67 | parser.add_argument('--train_token_file', type=str, default='train.token.txt')
68 | parser.add_argument('--test_token_file', type=str, default='test.token.txt')
69 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json')
70 |
71 | parser.add_argument('--trainset_num', type=int, default=12)
72 | parser.add_argument('--testset_num', type=int, default=1000)
73 | parser.add_argument('--token_word_num', type=int, default=50)
74 | parser.add_argument('--token_maxlen', type=int, default=50)
75 | parser.add_argument('--testset_start_index', type=int, default=33000)
76 |
77 |
78 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
79 |
80 | return parser.parse_args()
81 |
82 | if __name__ == '__main__':
83 | args = parse_args()
84 |
85 | #make_shuffle_index(args)
86 | #split_data(args)
87 | #create_token_dict_file(args)
88 |
89 |
90 | #dir_path = args.data_path + args.dataset
91 | # train.token.txt -> train.token.h5(and test...)
92 | #sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
93 | #sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
94 | '''
95 |
--------------------------------------------------------------------------------
/Baseline methods/MMAN/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .multiemb import MultiEmbeder
--------------------------------------------------------------------------------
/Baseline methods/MMAN/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/models/__pycache__/cfgemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/cfgemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/models/__pycache__/multiemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/multiemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/MMAN/output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/output/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN/output/MultiEmbeder/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/output/MultiEmbeder/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/MMAN/shuffle_index.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/MMAN/shuffle_index.npy
--------------------------------------------------------------------------------
/Baseline methods/MMAN/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | from tqdm import tqdm
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format="%(message)s")
12 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package
13 |
14 | import torch
15 |
16 | import models, configs, data_loader
17 | from modules import get_cosine_schedule_with_warmup
18 | #from utils import similarity, normalize
19 | from data_loader import *
20 |
21 |
22 | def normalize(data):
23 | """normalize matrix by rows"""
24 | return data/np.linalg.norm(data,axis=1,keepdims=True)
25 |
26 | def test(config, model, device):
27 | logger.info('Test Begin...')
28 |
29 | model.eval()
30 | model.to(device)
31 |
32 | # load data
33 | data_path = args.data_path+args.dataset+'/'
34 | test_set = eval(config['dataset_name'])(config, data_path,
35 | config['test_token'], config['tok_len'],
36 | config['test_ast'], config['vocab_ast'],
37 | config['test_cfg'], config['n_node'],
38 | config['test_desc'], config['desc_len'])
39 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32,
40 | collate_fn=batcher(device), shuffle=False, drop_last=False, num_workers=0)
41 | # encode tokens and descs
42 | code_reprs, desc_reprs = [], []
43 | n_processed = 0
44 | for batch in data_loader:
45 | # batch[0:7]: tokens, tok_len, tree, tree_node_num, init_input, adjmat, node_mask
46 | code_batch = [tensor for tensor in batch[:7]]
47 | # batch[7:9]: good_desc, good_desc_len
48 | desc_batch = [tensor for tensor in batch[7:9]]
49 | with torch.no_grad():
50 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
51 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
52 | # normalize when sim_measure=='cos'
53 | code_repr = normalize(code_repr)
54 | desc_repr = normalize(desc_repr)
55 | code_reprs.append(code_repr)
56 | desc_reprs.append(desc_repr)
57 | n_processed += batch[0].size(0) # +batch_size
58 | # code_reprs: [n_processed x n_hidden]
59 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
60 |
61 | # calculate similarity
62 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
63 | test_sim_result, test_rank_result = [], []
64 | for i in tqdm(range(0, n_processed)):
65 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
66 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
67 | negsims = np.negative(sims)
68 | predict = np.argsort(negsims)
69 |
70 | # SuccessRate@k
71 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
72 | sum_1.append(1.0 if i in predict_1 else 0.0)
73 | sum_5.append(1.0 if i in predict_5 else 0.0)
74 | sum_10.append(1.0 if i in predict_10 else 0.0)
75 | # MRR
76 | predict_list = predict.tolist()
77 | rank = predict_list.index(i)
78 | sum_mrr.append(1/float(rank+1))
79 |
80 | # results need to be saved
81 | predict_20 = [int(k) for k in predict[0:20]]
82 | sim_20 = [sims[k] for k in predict_20]
83 | test_sim_result.append(list(zip(predict_20, sim_20)))  # materialize the pairs so np.save stores real values
84 | test_rank_result.append(rank+1)
85 |
86 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
87 | save_path = args.data_path + 'result/'
88 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
89 | np.save(save_path+sim_result_filename, test_sim_result)
90 | np.save(save_path+rank_result_filename, test_rank_result)
91 |
92 |
93 | def parse_args():
94 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
95 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
96 | parser.add_argument('--model', type=str, default='MultiEmbeder', help='model name')
97 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset, e.g. java or python')
98 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from')
99 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\
100 | 'Note: should be consistent with the same argument in the repr_code.py')
101 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
102 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
103 | return parser.parse_args()
104 |
105 |
106 | if __name__ == '__main__':
107 | args = parse_args()
108 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
109 | config = getattr(configs, 'config_'+args.model)()
110 |
111 | ##### Define model ######
112 | logger.info('Constructing Model..')
113 | model = getattr(models, args.model)(config) # initialize the model
114 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
115 | model.load_state_dict(torch.load(ckpt, map_location=device))
116 |
117 | test(config, model, device)
118 |
119 |
120 |
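The metric bookkeeping above boils down to: if the true snippet for query i lands at rank r in the sorted candidate list, it adds 1/r to MRR and counts as a hit for R@k whenever r <= k. A tiny sketch with made-up ranks:

import numpy as np

ranks = [1, 3, 12]                                            # hypothetical ranks of the true snippets
r_at_10 = np.mean([1.0 if r <= 10 else 0.0 for r in ranks])   # 2/3
mrr = np.mean([1.0 / r for r in ranks])                       # (1 + 1/3 + 1/12) / 3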
--------------------------------------------------------------------------------
/Baseline methods/MMAN/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
3 | import sys
4 | import random
5 | import time
6 | from datetime import datetime
7 | import numpy as np
8 | import math
9 | import argparse
10 | random.seed(42)
11 | from tqdm import tqdm
12 |
13 | import logging
14 | logger = logging.getLogger(__name__)
15 | logging.basicConfig(level=logging.INFO, format="%(message)s")
16 |
17 | import torch
18 |
19 | import models, configs
20 | from modules import get_cosine_schedule_with_warmup
21 | from data_loader import *
22 |
23 |
24 | def train(args):
25 | fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
26 | # create file handler which logs even debug messages
27 | logger.addHandler(fh)# add the handlers to the logger
28 | timestamp = datetime.now().strftime('%Y%m%d%H%M')
29 |
30 | random.seed(args.seed)
31 | np.random.seed(args.seed)
32 | torch.manual_seed(args.seed)
33 | torch.cuda.manual_seed(args.seed)
34 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
35 |
36 | def save_model(model, epoch):
37 | torch.save(model.state_dict(), f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5')
38 |
39 | def load_model(model, epoch, to_device):
40 | assert os.path.exists(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'), f'Weights at epoch {epoch} not found'
41 | model.load_state_dict(torch.load(f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5', map_location=to_device))
42 |
43 | config = getattr(configs, 'config_'+args.model)()
44 | print(config)
45 |
46 | # load data
47 | data_path = args.data_path+args.dataset+'/'
48 | train_set = eval(config['dataset_name'])(config, data_path,
49 | config['train_token'], config['tok_len'],
50 | config['train_ast'], config['vocab_ast'],
51 | config['train_cfg'], config['n_node'],
52 | config['train_desc'], config['desc_len'])
53 |
54 | data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
55 | collate_fn=batcher(device), shuffle=True, drop_last=False, num_workers=0)
56 |
57 | # define the models
58 | logger.info('Constructing Model..')
59 | model = getattr(models, args.model)(config) #initialize the model
60 | if args.reload_from>0:
61 | load_model(model, args.reload_from, device)
62 | logger.info('done')
63 | model.to(device)
64 |
65 | no_decay = ['bias', 'LayerNorm.weight']
66 | optimizer_grouped_parameters = [
67 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
68 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
69 | ]
70 | optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=config['adam_epsilon'])
71 | scheduler = get_cosine_schedule_with_warmup(
72 | optimizer, num_warmup_steps=config['warmup_steps'],
73 | num_training_steps=len(data_loader)*config['nb_epoch']) # total steps = batches per epoch * nb_epoch; recompute if the dataset changes
74 |
75 | print('---model parameters---')
76 | num_params = 0
77 | for param in model.parameters():
78 | num_params += param.numel()
79 | print(num_params / 1e6)
80 |
81 | n_iters = len(data_loader)
82 | itr_global = args.reload_from+1
83 | for epoch in range(int(args.reload_from)+1, config['nb_epoch']+1):
84 | itr_start_time = time.time()
85 | losses=[]
86 | for batch in data_loader:
87 |
88 | model.train()
89 | batch_gpu = [tensor for tensor in batch]
90 | loss = model(*batch_gpu)
91 |
92 | loss.backward()
93 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
94 |
95 | optimizer.step()
96 | scheduler.step()
97 | model.zero_grad()
98 |
99 | losses.append(loss.item())
100 |
101 | if itr_global % args.log_every == 0:
102 | elapsed = time.time() - itr_start_time
103 | logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f'%
104 | (epoch, config['nb_epoch'], itr_global%n_iters, n_iters, elapsed, np.mean(losses)))
105 |
106 | losses=[]
107 | itr_start_time = time.time()
108 | itr_global = itr_global + 1
109 |
110 | # save a checkpoint every 5 epochs once training reaches epoch 90
111 | if epoch >= 90:
112 | if epoch % 5 == 0:
113 | save_model(model, epoch)
114 |
115 |
116 | def parse_args():
117 | parser = argparse.ArgumentParser("Train and Validate The Code Search (Embedding) Model")
118 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
119 | parser.add_argument('--model', type=str, default='MultiEmbeder', help='model name')
120 | parser.add_argument('--dataset', type=str, default='github', help='name of dataset, e.g. java or python')
121 | parser.add_argument('--reload_from', type=int, default=-1, help='epoch to reload from')
122 |
123 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
124 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
125 | # Training Arguments
126 | parser.add_argument('--log_every', type=int, default=50, help='interval to log autoencoder training results')
127 | parser.add_argument('--seed', type=int, default=1111, help='random seed')
128 |
129 |
130 | return parser.parse_args()
131 |
132 | if __name__ == '__main__':
133 | args = parse_args()
134 |
135 | # make output directory if it doesn't already exist
136 | os.makedirs(f'./output/{args.model}/{args.dataset}/models', exist_ok=True)
137 | os.makedirs(f'./output/{args.model}/{args.dataset}/tmp_results', exist_ok=True)
138 |
139 | torch.backends.cudnn.benchmark = True # speed up training by using cudnn
140 | torch.backends.cudnn.deterministic = True # fix the random seed in cudnn
141 |
142 | train(args)
143 |
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/Tok-Att.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "../CFG-Att"
5 | },
6 | {
7 | "path": "../AST-Att"
8 | },
9 | {
10 | "path": "."
11 | }
12 | ]
13 | }
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/__pycache__/configs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/configs.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_TokenEmbeder():
3 | conf = {
4 | # added_params
5 | 'gpu': 1,
6 | 'transform_every_modal': 0, # whether to apply an extra transformation to each modality (0 = disabled)
7 | 'save_attn_weight': 0,
8 | 'use_tanh': 1,
9 | 'use_attn': 1,
10 |
11 | # data_params
12 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
13 | #training data
14 | 'train_tokens':'train.token.h5',
15 | 'train_desc':'train.desc.h5',
16 | #valid data
17 | 'valid_tokens':'valid.token.h5',
18 | 'valid_desc':'valid.desc.h5',
19 | # test data
20 | 'test_tokens':'test.token.h5',
21 | 'test_desc':'test.desc.h5',
22 |
23 | #parameters
24 | 'tokens_len':50,
25 | 'desc_len': 30,
26 | 'n_token_words': 20000, # len(vocabulary) + 1
27 | 'n_desc_words': 12000, # to be tuned for the dataset
28 | #vocabulary info
29 | 'vocab_tokens':'vocab.token.json',
30 | 'vocab_desc':'vocab.desc.json',
31 |
32 | #training_params
33 | 'batch_size': 32,
34 | 'chunk_size': 200000,
35 | 'nb_epoch': 200,
36 | #'optimizer': 'adam',
37 | 'learning_rate':0.0003, # try 1e-4(paper)
38 | 'adam_epsilon':1e-8,
39 | 'warmup_steps':5000,
40 | 'fp16': False,
41 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].
42 | #"See details at https://nvidia.github.io/apex/amp.html"
43 |
44 | # model_params
45 | 'emb_size': 300,
46 | 'n_hidden': 512, # dimensionality of the hidden code/desc representations
47 | # recurrent
48 | 'margin': 0.6,
49 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf
50 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization.
51 | 'dropout':0.1
52 | }
53 | return conf
54 |
55 |
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/data/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/data_loader.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import torch.utils.data as data
4 | import torch.nn as nn
5 | import tables
6 | import json
7 | import random
8 | import numpy as np
9 | import pickle
10 | from utils import PAD_ID, UNK_ID, indexes2sent
11 |
12 | import logging
13 | logger = logging.getLogger(__name__)
14 | logging.basicConfig(level=logging.INFO, format="%(message)s")
15 |
16 |
17 | class CodeSearchDataset(data.Dataset):
18 | """
19 | Dataset that has only positive samples.
20 | """
21 | def __init__(self, data_dir, f_tokens, max_tok_len, f_descs=None, max_desc_len=None):
22 | self.max_tok_len = max_tok_len
23 | self.max_desc_len = max_desc_len
24 | # initialize file path or list of file names
25 | """read training data(list of int arrays) from a hdf5 file"""
26 | self.training = False
27 | print("loading data...")
28 | table_tokens = tables.open_file(data_dir+f_tokens)
29 | self.tokens = table_tokens.get_node('/phrases')[:].astype(np.long)
30 | self.idx_tokens = table_tokens.get_node('/indices')[:]
31 | if f_descs is not None:
32 | self.training=True
33 | table_desc = tables.open_file(data_dir+f_descs)
34 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long)
35 | self.idx_descs = table_desc.get_node('/indices')[:]
36 |
37 | if f_descs is not None:
38 | assert self.idx_tokens.shape[0]==self.idx_descs.shape[0]
39 | self.data_len = self.idx_tokens.shape[0]
40 | print("{} entries".format(self.data_len))
41 |
42 | def pad_seq(self, seq, maxlen):
43 | if len(seq) < maxlen:
44 | # !!!!! numpy appending is slow. Try to optimize the padding
45 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq)))
46 | seq = seq[:maxlen]
47 | return seq
48 |
49 | def __getitem__(self, offset):
50 | length, pos = self.idx_tokens[offset][0], self.idx_tokens[offset][1]
51 | tok_len = min(int(length), self.max_tok_len)
52 | tokens = self.tokens[pos:pos+tok_len]
53 | tokens = self.pad_seq(tokens, self.max_tok_len)
54 |
55 | if self.training:
56 | length, pos = self.idx_descs[offset][0], self.idx_descs[offset][1]
57 | good_desc_len = min(int(length), self.max_desc_len)
58 | good_desc = self.descs[pos:pos+good_desc_len]
59 | good_desc = self.pad_seq(good_desc, self.max_desc_len)
60 |
61 | rand_offset = random.randint(0, self.data_len-1)
62 | length, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1]
63 | bad_desc_len = min(int(length), self.max_desc_len)
64 | bad_desc = self.descs[pos:pos+bad_desc_len]
65 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len)
66 |
67 | return tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len
68 | return tokens, tok_len, good_desc, good_desc_len
69 |
70 | def __len__(self):
71 | return self.data_len
72 |
73 |
74 | def load_dict(filename):
75 | return json.loads(open(filename, "r").readline())
76 | #return pickle.load(open(filename, 'rb'))
77 |
78 | def load_vecs(fin):
79 | """read vectors (2D numpy array) from a hdf5 file"""
80 | h5f = tables.open_file(fin)
81 | h5vecs= h5f.root.vecs
82 |
83 | vecs=np.zeros(shape=h5vecs.shape,dtype=h5vecs.dtype)
84 | vecs[:]=h5vecs[:]
85 | h5f.close()
86 | return vecs
87 |
88 | def save_vecs(vecs, fout):
89 | fvec = tables.open_file(fout, 'w')
90 | atom = tables.Atom.from_dtype(vecs.dtype)
91 | filters = tables.Filters(complib='blosc', complevel=5)
92 | ds = fvec.create_carray(fvec.root,'vecs', atom, vecs.shape,filters=filters)
93 | ds[:] = vecs
94 | print('done')
95 | fvec.close()
96 |
97 | if __name__ == '__main__':
98 | input_dir = './data/github/'
99 | train_set = CodeSearchDataset(input_dir, 'train.token.h5', 60, 'train.desc.h5', 30)
100 | train_data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=32, shuffle=False, num_workers=1)
101 | logger.info('hello')
102 | #print(len(train_data_loader))
103 | '''
104 | use_set = CodeSearchDataset(input_dir, 'use.tokens.h5', 30)
105 | use_data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1, shuffle=False, num_workers=1)
106 | #print(len(use_data_loader))
107 | vocab_tokens = load_dict(input_dir+'vocab.tokens.json')
108 | vocab_desc = load_dict(input_dir+'vocab.desc.json')
109 | '''
110 | print('============ Train Data ================')
111 | k=0
112 | for batch in train_data_loader:
113 | print("batch[0].size(0): ", batch[0].size(0))
114 | batch = tuple([t.numpy() for t in batch])
115 | tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len = batch
116 | k+=1
117 | if k>0: break
118 | print('-------------------------------')
119 | #print(indexes2sent(tokens, vocab_tokens))
120 | #print(indexes2sent(good_desc, vocab_desc))
121 |
122 |
123 |
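pad_seq above either right-pads a sequence with PAD_ID up to maxlen or truncates it; a stand-alone sketch of the same logic (PAD_ID = 0 as in utils):

import numpy as np

PAD_ID = 0

def pad_seq(seq, maxlen):
    if len(seq) < maxlen:
        seq = np.append(seq, [PAD_ID] * (maxlen - len(seq)))
    return seq[:maxlen]

print(pad_seq(np.array([5, 8, 2]), 6))              # [5 8 2 0 0 0]
print(pad_seq(np.array([5, 8, 2, 9, 4, 1, 7]), 6))  # [5 8 2 9 4 1]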
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .tokenemb import TokenEmbeder
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/modules.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import math
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.init as weight_init
8 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
9 | from torch import optim
10 | import torch.nn.functional as F
11 |
12 | import logging
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | class SeqEncoder(nn.Module):
17 | def __init__(self, vocab_size, emb_size, hidden_size, n_layers=1):
18 | super(SeqEncoder, self).__init__()
19 | self.emb_size = emb_size
20 | self.hidden_size = hidden_size
21 | self.n_layers = n_layers
22 |
23 | self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
24 |
25 | self.init_xavier_linear(self.embedding, init_bias=False)
26 |
27 | self.lstm = nn.LSTM(emb_size, hidden_size, dropout=0.1, batch_first=True, bidirectional=False)
28 |
29 | def init_xavier_linear(self, linear, init_bias=True, gain=1, init_normal_std=1e-4):
30 | torch.nn.init.xavier_uniform_(linear.weight, gain)
31 | if init_bias:
32 | if linear.bias is not None:
33 | linear.bias.data.normal_(std=init_normal_std)
34 |
35 | def init_hidden(self, batch_size):
36 | weight = next(self.parameters()).data
37 | return (weight.new(self.n_layers, batch_size, self.hidden_size).zero_().requires_grad_(), # rnn_type == 'LSTM'
38 | weight.new(self.n_layers, batch_size, self.hidden_size).zero_().requires_grad_())
39 |
40 |
41 | def forward(self, inputs, input_lens=None, hidden=None):
42 | batch_size, seq_len = inputs.size()
43 | inputs = self.embedding(inputs) # input: [batch_sz x seq_len] embedded: [batch_sz x seq_len x emb_sz]
44 | #inputs = F.dropout(inputs, 0.1, self.training) # mark.
45 |
46 | if input_lens is not None:# sort and pack sequence
47 | input_lens_sorted, indices = input_lens.sort(descending=True)
48 | inputs_sorted = inputs.index_select(0, indices)
49 | inputs = pack_padded_sequence(inputs_sorted, input_lens_sorted.data.tolist(), batch_first=True)
50 |
51 | hids, (h_n, c_n) = self.lstm(inputs, hidden) # hids: [b x seq x hid_sz] (unidirectional LSTM here)
52 |
53 | if input_lens is not None: # reorder and pad
54 | _, inv_indices = indices.sort()
55 | hids, lens = pad_packed_sequence(hids, batch_first=True)
56 | #hids = F.dropout(hids, p=0.1, training=self.training) # mark.
57 | hids = hids.index_select(0, inv_indices) # [batch_sz x seq_len x hid_sz]
58 | h_n = h_n.index_select(1, inv_indices)
59 | c_n = c_n.index_select(1, inv_indices)
60 |
61 | h_n = h_n[0] # [batch_sz x hid_sz] n_layers==1 and n_dirs==1
62 | c_n = c_n[0]
63 |
64 | return hids, (h_n, c_n)
65 |
66 |
67 | from torch.optim.lr_scheduler import LambdaLR
68 |
69 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
70 | """ Create a schedule with a learning rate that decreases following the
71 | values of the cosine function between 0 and `pi * cycles` after a warmup
72 | period during which it increases linearly between 0 and 1.
73 | """
74 | def lr_lambda(current_step):
75 | if current_step < num_warmup_steps:
76 | return float(current_step) / float(max(1, num_warmup_steps))
77 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
78 | return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
79 |
80 | return LambdaLR(optimizer, lr_lambda, last_epoch)
81 |
82 |
83 | def get_word_weights(vocab_size, padding_idx=0):
84 | '''construct a word weighting table'''
85 | def cal_weight(word_idx):
86 | return 1-math.exp(-word_idx)
87 | weight_table = np.array([cal_weight(w) for w in range(vocab_size)])
88 | if padding_idx is not None:
89 | weight_table[padding_idx] = 0. # zero vector for padding dimension
90 | return torch.FloatTensor(weight_table)
91 |
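The warmup/cosine schedule raises the learning-rate multiplier linearly to 1.0 over the warmup steps and then decays it along a cosine curve to 0; a quick sketch evaluating the same lambda at a few steps (the warmup/total values are arbitrary):

import math

def lr_lambda(step, warmup=100, total=1000, cycles=0.5):
    if step < warmup:
        return step / max(1, warmup)
    progress = (step - warmup) / max(1, total - warmup)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * cycles * 2.0 * progress)))

for s in (0, 50, 100, 550, 1000):
    print(s, round(lr_lambda(s), 3))   # 0.0, 0.5, 1.0, 0.5, 0.0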
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/output/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/output/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/output/TokenEmbeder/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/output/TokenEmbeder/.DS_Store
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/shuffle_index.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/Baseline methods/Tok-Att/shuffle_index.npy
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
3 | import sys
4 | import traceback
5 | import numpy as np
6 | import argparse
7 | import threading
8 | import codecs
9 | import logging
10 | from tqdm import tqdm
11 | logger = logging.getLogger(__name__)
12 | logging.basicConfig(level=logging.INFO, format="%(message)s")
13 |
14 | import torch
15 |
16 | import models, configs, data_loader
17 | from modules import get_cosine_schedule_with_warmup
18 | from utils import similarity, normalize
19 | from data_loader import *
20 |
21 |
22 | def test(config, model, device):
23 | logger.info('test begin...')
24 |
25 | model.eval()
26 | model.to(device)
27 |
28 | # load data
29 | data_path = args.data_path+args.dataset+'/'
30 | test_set = eval(config['dataset_name'])(data_path,
31 | config['test_tokens'], config['tokens_len'],
32 | config['test_desc'], config['desc_len'])
33 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32,
34 | shuffle=False, drop_last=False, num_workers=1)
35 | # encode tokens and descs
36 | code_reprs, desc_reprs = [], []
37 | n_processed = 0
38 | for batch in data_loader:
39 | code_batch = [tensor.to(device) for tensor in batch[:2]]
40 | desc_batch = [tensor.to(device) for tensor in batch[2:4]]
41 | with torch.no_grad():
42 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
43 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
44 | # normalize when sim_measure=='cos'
45 | code_repr = normalize(code_repr)
46 | desc_repr = normalize(desc_repr)
47 | code_reprs.append(code_repr)
48 | desc_reprs.append(desc_repr)
49 | n_processed += batch[0].size(0) # +batch_size
50 | # code_reprs: [n_processed x n_hidden]
51 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
52 |
53 | # calculate similarity
54 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
55 | test_sim_result, test_rank_result = [], []
56 | for i in tqdm(range(0, n_processed)):
57 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
58 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
59 | negsims = np.negative(sims)
60 | predict = np.argsort(negsims)
61 |
62 | # SuccessRate@k
63 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
64 | sum_1.append(1.0 if i in predict_1 else 0.0)
65 | sum_5.append(1.0 if i in predict_5 else 0.0)
66 | sum_10.append(1.0 if i in predict_10 else 0.0)
67 | # MRR
68 | predict_list = predict.tolist()
69 | rank = predict_list.index(i)
70 | sum_mrr.append(1/float(rank+1))
71 |
72 | # results need to be saved
73 | predict_20 = [int(k) for k in predict[0:20]]
74 | sim_20 = [sims[k] for k in predict_20]
75 | test_sim_result.append(list(zip(predict_20, sim_20)))  # materialize the pairs so they can be saved with np.save
76 | test_rank_result.append(rank+1)
77 |
78 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
79 | save_path = args.data_path + 'result/'
80 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
81 | np.save(save_path+sim_result_filename, test_sim_result)
82 | np.save(save_path+rank_result_filename, test_rank_result)
83 |
84 |
85 | def parse_args():
86 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
87 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
88 | parser.add_argument('--model', type=str, default='TokenEmbeder', help='model name')
89 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python')
90 | parser.add_argument('--reload_from', type=int, default=200, help='epoch to reload from')
91 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\
92 | 'Note: should be consistent with the same argument in the repr_code.py')
93 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
94 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
95 | return parser.parse_args()
96 |
97 |
98 | if __name__ == '__main__':
99 | args = parse_args()
100 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
101 | config = getattr(configs, 'config_'+args.model)()
102 |
103 | ##### Define model ######
104 | logger.info('Constructing Model..')
105 | model = getattr(models, args.model)(config) # initialize the model
106 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
107 | model.load_state_dict(torch.load(ckpt, map_location=device))
108 |
109 | test(config, model, device)
110 |
111 |
112 |
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/util_desc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def make_shuffle_index(args):
10 | dir_path = args.data_path + args.dataset
11 | all_desc_file_path = dir_path + args.all_desc_file
12 | with open(all_desc_file_path, 'r') as all_desc_file:
13 | lines = all_desc_file.readlines()
14 | all_num = int(len(lines)/2)
15 |
16 | index = np.arange(all_num)
17 | np.random.seed(16)
18 | np.random.shuffle(index)
19 | print(len(index))
20 | np.save(args.shuffle_index_file, index)
21 |
22 | def split_desc_data(args):
23 | index = np.load(args.shuffle_index_file)
24 |
25 | dir_path = args.data_path + args.dataset
26 | all_desc_file_path = dir_path + args.all_desc_file
27 | train_desc_file_path = dir_path + args.train_desc_file
28 | test_desc_file_path = dir_path + args.test_desc_file
29 |
30 | input_desc = []
31 | with open(all_desc_file_path, 'r') as all_desc_file:
32 | lines = all_desc_file.readlines()
33 | for line in lines:
34 | if (line[0:10] != 'BeginFunc:'):
35 | input_desc.append(line)
36 | print('number of input desc:\n', len(input_desc))
37 |
38 | with open(train_desc_file_path, 'w') as train_desc_file, open(test_desc_file_path, 'w') as test_desc_file:
39 | for i in range(0, args.trainset_num):
40 | train_desc_file.write(input_desc[index[i]])
41 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
42 | test_desc_file.write(input_desc[index[i]])
43 |
44 |
45 | def create_desc_dict_file(args):
46 | dir_path = args.data_path + args.dataset
47 | desc_file_path = dir_path + args.train_desc_file
48 |
49 | input_desc = []
50 | with open(desc_file_path, 'r') as desc_file:
51 | input_desc = desc_file.readlines()
52 | desc_words = []
53 | for i in range(0, len(input_desc)):
54 | input_desc[i] = input_desc[i].rstrip('\n')
55 | desc_word_list = input_desc[i].split()
56 | for desc_word in desc_word_list:
57 | desc_words.append(desc_word)
58 | vocab_desc_info = Counter(desc_words)
59 | print(len(vocab_desc_info))
60 |
61 | vocab_desc = [item[0] for item in vocab_desc_info.most_common()[:args.desc_word_num-2]]
62 | vocab_desc_index = {'<pad>': 0, '<unk>': 1}
63 | vocab_desc_index.update(zip(vocab_desc, [item+2 for item in range(len(vocab_desc))]))
64 |
65 |
66 | vocab_desc_file_path = dir_path + args.vocab_desc_file
67 | desc_dic_str = json.dumps(vocab_desc_index)
68 | with open(vocab_desc_file_path, 'w') as vocab_desc_file:
69 | vocab_desc_file.write(desc_dic_str)
70 |
71 |
72 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
73 | phrases, indices = [], []
74 | with open(sent_file_path, 'r') as sent_file:
75 | sents = sent_file.readlines()
76 | vocab = json.loads(open(vocab_file_path, "r").readline())
77 | start_index = 0
78 | for i in range(0, len(sents)):
79 | sent = sents[i].rstrip('\n')
80 | word_list = sent.split()
81 | sent_len = min(len(word_list), maxlen)
82 | indices.append((sent_len, start_index))
83 | for j in range(0, sent_len):
84 | word = word_list[j]
85 | phrases.append(vocab.get(word, UNK_ID))
86 | start_index += sent_len
87 | output_file_path = sent_file_path[0:-3] + 'h5'
88 | output_file = h5py.File(output_file_path, 'w')
89 | output_file['phrases'] = phrases
90 | output_file['indices'] = indices
91 | output_file.close()
92 |
93 |
94 | def parse_args():
95 | parser = argparse.ArgumentParser("Parse Description data for TokenEmbedder")
96 |
97 | parser.add_argument('--data_path', type=str, default='./data/')
98 | parser.add_argument('--dataset', type=str, default='github11/')
99 |
100 | parser.add_argument('--origin_desc_file', type=str, default='origin.desc.txt')
101 | parser.add_argument('--all_desc_file', type=str, default='all.desc.txt')
102 | parser.add_argument('--train_desc_file', type=str, default='train.desc.txt')
103 | parser.add_argument('--test_desc_file', type=str, default='test.desc.txt')
104 | parser.add_argument('--vocab_desc_file', type=str, default='vocab.desc.json')
105 |
106 | parser.add_argument('--trainset_num', type=int, default=33845)
107 | parser.add_argument('--testset_num', type=int, default=2000)
108 | parser.add_argument('--desc_word_num', type=int, default=10000)
109 | parser.add_argument('--desc_maxlen', type=int, default=30)
110 | parser.add_argument('--testset_start_index', type=int, default=33845)
111 |
112 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
113 |
114 | return parser.parse_args()
115 |
116 | if __name__ == '__main__':
117 |
118 | args = parse_args()
119 |
120 | #make_shuffle_index(args)
121 | '''
122 | split_desc_data(args)
123 | create_desc_dict_file(args)
124 |
125 | dir_path = args.data_path + args.dataset
126 | # train.desc.txt -> train.desc.h5(and test...)
127 | sents2indexes(dir_path+args.train_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
128 | sents2indexes(dir_path+args.test_desc_file, dir_path+args.vocab_desc_file, args.desc_maxlen)
129 | '''
130 |
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/util_tok.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from collections import Counter
4 | import json
5 | import h5py
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def split_token_data(args):
10 | index = np.load(args.shuffle_index_file)
11 |
12 | dir_path = args.data_path + args.dataset
13 | all_token_file_path = dir_path + args.all_token_file
14 | train_token_file_path = dir_path + args.train_token_file
15 | test_token_file_path = dir_path + args.test_token_file
16 |
17 | input_token = []
18 | with open(all_token_file_path, 'r') as all_token_file:
19 | lines = all_token_file.readlines()
20 | for line in lines:
21 | if (line[0:10] != 'BeginFunc:'):
22 | input_token.append(line)
23 | print('number of input token:\n', len(input_token))
24 |
25 | with open(train_token_file_path, 'w') as train_token_file, open(test_token_file_path, 'w') as test_token_file:
26 | for i in range(0, args.trainset_num):
27 | train_token_file.write(input_token[index[i]])
28 | for i in range(args.testset_start_index, args.testset_start_index+args.testset_num):
29 | test_token_file.write(input_token[index[i]])
30 |
31 |
32 | def create_token_dict_file(args):
33 | dir_path = args.data_path + args.dataset
34 | token_file_path = dir_path + args.train_token_file
35 |
36 | input_token = []
37 | with open(token_file_path, 'r') as token_file:
38 | input_token = token_file.readlines()
39 | token_words = []
40 | for i in range(0, len(input_token)):
41 | input_token[i] = input_token[i].rstrip('\n')
42 | token_word_list = input_token[i].split()
43 | for token_word in token_word_list:
44 | token_words.append(token_word)
45 | vocab_token_info = Counter(token_words)
46 | print(len(vocab_token_info))
47 |
48 | vocab_token = [item[0] for item in vocab_token_info.most_common()[:args.token_word_num-2]]
49 | vocab_token_index = {'<pad>': 0, '<unk>': 1}
50 | vocab_token_index.update(zip(vocab_token, [item+2 for item in range(len(vocab_token))]))
51 |
52 |
53 | vocab_token_file_path = dir_path + args.vocab_token_file
54 | token_dic_str = json.dumps(vocab_token_index)
55 | with open(vocab_token_file_path, 'w') as vocab_token_file:
56 | vocab_token_file.write(token_dic_str)
57 |
58 | def sents2indexes(sent_file_path, vocab_file_path, maxlen):
59 | phrases, indices = [], []
60 | with open(sent_file_path, 'r') as sent_file:
61 | sents = sent_file.readlines()
62 | vocab = json.loads(open(vocab_file_path, "r").readline())
63 | start_index = 0
64 | for i in range(0, len(sents)):
65 | sent = sents[i].rstrip('\n')
66 | word_list = sent.split()
67 | sent_len = min(len(word_list), maxlen)
68 | indices.append((sent_len, start_index))
69 | for j in range(0, sent_len):
70 | word = word_list[j]
71 | phrases.append(vocab.get(word, UNK_ID))
72 | start_index += sent_len
73 | output_file_path = sent_file_path[0:-3] + 'h5'
74 | output_file = h5py.File(output_file_path, 'w')
75 | output_file['phrases'] = phrases
76 | output_file['indices'] = indices
77 | output_file.close()
78 |
79 | def parse_args():
80 | parser = argparse.ArgumentParser("Parse token data for TokenEmbedder")
81 |
82 | parser.add_argument('--data_path', type=str, default='./data/')
83 | parser.add_argument('--dataset', type=str, default='github11/')
84 |
85 | parser.add_argument('--all_token_file', type=str, default='all.token.txt')
86 | parser.add_argument('--train_token_file', type=str, default='train.token.txt')
87 | parser.add_argument('--test_token_file', type=str, default='test.token.txt')
88 | parser.add_argument('--vocab_token_file', type=str, default='vocab.token.json')
89 |
90 | parser.add_argument('--trainset_num', type=int, default=39152)
91 | parser.add_argument('--testset_num', type=int, default=2000)
92 | parser.add_argument('--token_word_num', type=int, default=25000)
93 | parser.add_argument('--token_maxlen', type=int, default=100)
94 | parser.add_argument('--testset_start_index', type=int, default=39152)
95 |
96 |
97 | parser.add_argument('--shuffle_index_file', type=str, default='shuffle_index.npy')
98 |
99 | return parser.parse_args()
100 |
101 | if __name__ == '__main__':
102 | args = parse_args()
103 | '''
104 | dir_path = args.data_path + args.dataset
105 | with open(dir_path+'origin.test.token.txt', 'r') as in_file, open(dir_path+'test.token.txt', 'w') as out_file:
106 | lines = in_file.readlines()
107 | for i in range(0, len(lines)):
108 | if lines[i][0:10] != 'BeginFunc:':
109 | out_file.write(lines[i])
110 | '''
111 |
112 | split_token_data(args)
113 | create_token_dict_file(args)
114 |
115 | dir_path = args.data_path + args.dataset
116 | # train.token.txt -> train.token.h5(and test...)
117 | sents2indexes(dir_path+args.train_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
118 | sents2indexes(dir_path+args.test_token_file, dir_path+args.vocab_token_file, args.token_maxlen)
119 |
120 |
121 | '''
122 | dir_path = args.data_path + args.dataset
123 | all_token_file_path = dir_path + args.all_token_file
124 | with open(all_token_file_path, 'r') as all_token_file:
125 | lines = all_token_file.readlines()
126 | print(len(lines))
127 | for i in range(0, len(lines)):
128 | line = lines[i]
129 | if line[0:10] != 'BeginFunc:':
130 | words = line.split()
131 | if len(words) == 0:
132 | print(lines[i-1])
133 | #print(lines[i])
134 | '''
135 |
--------------------------------------------------------------------------------
/Baseline methods/Tok-Att/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T)
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 | elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 | return '%d:%d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 | '''
64 | import nltk
65 | try: nltk.word_tokenize("hello world")
66 | except LookupError: nltk.download('punkt')
67 |
68 | def sent2indexes(sentence, vocab, maxlen):
69 |
70 | def convert_sent(sent, vocab, maxlen):
71 | idxes = np.zeros(maxlen, dtype=np.int64)
72 | idxes.fill(PAD_ID)
73 | tokens = nltk.word_tokenize(sent.strip())
74 | idx_len = min(len(tokens), maxlen)
75 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID)
76 | return idxes, idx_len
77 | if type(sentence) is list:
78 | inds, lens = [], []
79 | for sent in sentence:
80 | idxes, idx_len = convert_sent(sent, vocab, maxlen)
81 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len])
82 | inds.append(idxes)
83 | lens.append(idx_len)
84 | return np.vstack(inds), np.vstack(lens)
85 | else:
86 | inds, lens = sent2indexes([sentence], vocab, maxlen)
87 | return inds[0], lens[0]
88 | '''
89 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
90 | '''indexes: numpy array'''
91 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
92 | indexes=filter(lambda i: i!=ignore_tok, indexes)
93 | toks, length = [], 0
94 | for idx in indexes:
95 | toks.append(ivocab.get(idx, '<unk>'))
96 | length+=1
97 | return ' '.join(toks), length
98 |
99 | ivocab = {v: k for k, v in vocab.items()}
100 | if indexes.ndim==1:# one sentence
101 | return revert_sent(indexes, ivocab, ignore_tok)
102 | else:# dim>1
103 | sentences, lens =[], [] # a batch of sentences
104 | for inds in indexes:
105 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
106 | sentences.append(sentence)
107 | lens.append(length)
108 | return sentences, lens
109 |
110 | ########################################################################
111 |
--------------------------------------------------------------------------------
/Baseline methods/readme.md:
--------------------------------------------------------------------------------
1 | # Baseline methods
2 | ### DeepCS
3 | ```
4 | @inproceedings{gu2018deep,
5 | title={Deep code search},
6 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun},
7 | booktitle={2018 IEEE/ACM 40th International Conference on Software Engineering (ICSE)},
8 | pages={933--944},
9 | year={2018},
10 | organization={IEEE}
11 | }
12 | ```
13 | ### MMAN
14 | ```
15 | @inproceedings{wan2019multi,
16 | title={Multi-modal attention network learning for semantic source code retrieval},
17 | author={Wan, Yao and Shu, Jingdong and Sui, Yulei and Xu, Guandong and Zhao, Zhou and Wu, Jian and Yu, Philip},
18 | booktitle={2019 34th IEEE/ACM International Conference on Automated Software Engineering (ASE)},
19 | pages={13--25},
20 | year={2019},
21 | organization={IEEE}
22 | }
23 | ```
24 | ### MMAN(TDC)
25 | Exploit Token + Variable-based data flow + Variable-based control flow to perform code search tasks
26 | ### AST-Att
27 | Exploit AST and the attention mechanism to perform code search tasks
28 | ### Tok-Att
29 | Exploit Token and the attention mechanism to perform code search tasks
30 | ### CFG-Att
31 | Exploit CFG and the attention mechanism to perform code search tasks
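
The three *-Att baselines share the same reduction step: a set of token/AST-node/CFG-node embeddings is collapsed into a single code vector by attention pooling and then matched against the query embedding. The snippet below is only an illustrative sketch of that pooling step; the module and parameter names (`AttentionPooling`, `hidden_size`) are ours and are not taken from the baseline code.
```
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionPooling(nn.Module):
    """Collapse [batch x seq_len x hidden] features into [batch x hidden] code vectors."""
    def __init__(self, hidden_size):
        super().__init__()
        self.scorer = nn.Linear(hidden_size, 1)  # one scalar attention score per position

    def forward(self, features, mask):
        # features: [batch x seq_len x hidden]; mask: [batch x seq_len], 1 = real token, 0 = padding
        scores = self.scorer(torch.tanh(features)).squeeze(-1)       # [batch x seq_len]
        scores = scores.masked_fill(mask == 0, float('-inf'))        # ignore padded positions
        weights = F.softmax(scores, dim=-1)                          # attention weights
        return torch.bmm(weights.unsqueeze(1), features).squeeze(1)  # weighted sum over positions

# toy usage: two token sequences of length 5, embedded into 512-d vectors
pool = AttentionPooling(hidden_size=512)
feats = torch.randn(2, 5, 512)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
code_vec = pool(feats, mask)  # [2 x 512], later compared to the query vector with cosine similarity
```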
32 |
--------------------------------------------------------------------------------
/IR2graph/readme.md:
--------------------------------------------------------------------------------
1 | # Generate our variable-based flow graph from the input IR
2 | Given the input ".ll" file generated by LLVM from the original ".c" file, the output is the nodes and edges of our constructed variable-based flow graph.
3 |
4 | ## Generate the VFG
5 | ```
6 | python vfg_construct.py
7 | ```
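
The ".ll" input is plain LLVM IR; for a single C file it can typically be produced with clang, for example:
```
clang -S -emit-llvm example.c -o example.ll
```
(`example.c` and `example.ll` are placeholder names used here for illustration.)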
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeGraphCS
2 |
3 |
4 | # Project Overview
5 | This project provides the datasets and source code used in our DeGraphCS model. The content of the project is as follows:
6 |
7 | 1. Dataset
8 |
9 | 2. DeGraphCS Source Code
10 |
11 | 3. Variable-based Flow Graph Construction
12 |
13 | 4. Baseline methods
14 |
15 | 5. User Study
16 |
17 | 6. Appendix
18 |
19 | ## Dataset
20 | To help people reproduce our work, we provide the raw datasets, which consist of **C code snippets**, the corresponding **code comments** and the **generated IR**.
21 |
22 | The raw datasets can be accessed in [Google Drive](https://drive.google.com/file/d/1PZ9TAfsrSlXLDpOCp6-0aZQxrzlP4kBA/view?usp=sharing)
23 |
24 | To feed our model, we first generate the Variable-based Flow Graphs of 41152 methods and extract the corresponding comments. Then we split the dataset into a training set of 39152 methods and a test set of 2000 methods. All of the data are put in the `dataset/` directory.
25 |
26 | ## DeGraphCS Source Code
27 | We provide the DeGraphCS model code, which is located in the `src/` directory.
28 |
29 | ## Variable-based Flow Graph Construction
30 | To construct the Variable-based Flow Graph from the LLVM IR, we provide graph construction code in the `IR2graph/` directory to help users generate the graph.
31 |
32 | ## Baseline Methods
33 | We have reproduced other code search works, which are put in the `Baseline methods/` directory.
34 |
35 | ## User Study
36 | We conduct a user study to evaluate our model.
37 |
38 | The 50 queries of the user study are listed in `user study/queries.txt`. For the four models (UNIF, MMAN, DeepCS and DeGraphCS), the corresponding search results are listed in the `user study/` directory.
39 |
40 | ## Appendix
41 |
42 | # Running Our Model
43 | ## Generate Datasets and Build Dictionary
44 | Run the following command to split the comment dataset into a training set and a test set, and to build the dictionary:
45 | ```
46 | python src/util_desc.py
47 | ```
48 | Run the following command to split the Variable-based Flow Graph dataset into a training set and a test set, and to build the dictionary:
49 | ```
50 | python src/util_ir.py
51 | ```
52 | ## Train the DeGraphCS Model
53 | ```
54 | python src/train.py
55 | ```
56 | ## Test the DeGraphCS Model
57 | ```
58 | python src/test.py
59 | ```
60 |
--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | ## Process Raw Dataset
2 |
3 | To obtain a high-quality dataset, we process the raw dataset in `/raw_dataset` as follows:
4 |
5 | 1. After deleting duplicate methods, we obtain 74489 methods out of the original 151414 methods (a minimal deduplication sketch is given after this list).
6 |
7 | 2. To generate a common dataset for all models (DeGraphCS, DeepCS, MMAN and UNIF), we delete the methods for which an AST or CFG cannot be generated. This leaves 59725 methods.
8 |
9 | 3. To make sure our dataset is of high quality, we constrain the comments' length and quality, and the number of nodes in the AST, CFG and VFG (Variable-based Flow Graph).
10 | After deleting the methods that do not meet these requirements, we obtain 41152 methods in `/preprocessed_dataset`.
11 |
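As a minimal illustration of step 1, exact duplicates can be dropped by hashing a whitespace-normalized form of each method body. This is only a sketch; it is not necessarily the exact procedure used to build the released dataset.
```
import hashlib

def dedup_methods(methods):
    """Keep the first occurrence of each method, comparing whitespace-normalized bodies."""
    seen, unique = set(), []
    for code in methods:
        key = hashlib.sha256(' '.join(code.split()).encode('utf-8')).hexdigest()
        if key not in seen:
            seen.add(key)
            unique.append(code)
    return unique

# dedup_methods(["int f() { return 1; }", "int  f() { return 1; }"]) keeps only the first entry
```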
12 |
13 |
--------------------------------------------------------------------------------
/dataset/preprocessed_dataset/origin.ir.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f601269dd0365f12862fb1100d7538a957caa999c4767835a26a8e3781bd6d67
3 | size 160986007
4 |
--------------------------------------------------------------------------------
/dataset/preprocessed_dataset/readme.md:
--------------------------------------------------------------------------------
1 | ## Directory Introduction
2 |
3 | **origin.desc.txt**: the full dataset, containing the descriptions of all **41152** methods.
4 |
5 | **origin.ir.txt**: the full dataset, containing the Variable-based Flow Graph representations of all **41152** methods.
6 |
7 | **train.desc.txt**: the training set, containing the descriptions of **39152** methods.
8 |
9 | **train.ir.txt**: the training set, containing the Variable-based Flow Graph representations of **39152** methods.
10 |
11 | **test.desc.txt**: the test set, containing the descriptions of **2000** methods.
12 |
13 | **test.ir.txt**: the test set, containing the Variable-based Flow Graph representations of **2000** methods.
14 |
--------------------------------------------------------------------------------
/dataset/preprocessed_dataset/train.ir.txt:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6203c767bc9854e6addb905668ad89fd73a88fa542968884b681e14e3404fac1
3 | size 146141125
4 |
--------------------------------------------------------------------------------
/dataset/raw_dataset/readme.md:
--------------------------------------------------------------------------------
1 |
2 | The raw datasets can be accessed in [Google Drive](https://drive.google.com/file/d/1PZ9TAfsrSlXLDpOCp6-0aZQxrzlP4kBA/view?usp=sharing)
3 |
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
1 | # Core parts of DeGraphCS
2 | configs.py --Configure the hyper-parameters of DeGraphCS
3 |
4 | data_loader.py --Load the data in batches
5 |
6 | util_ir.py --Preprocess the original IR to generate graphs, which can be processed by graph neural networks
7 |
8 | util_desc.py --Preprocess the comments
9 |
10 | generate_interface.py --Generate the interfaces of the third-party libraries
11 |
12 | # Generate the interfaces to solve the compilation problem
13 | ## An example showing how the compilation problem can be solved
14 | ### Initial code snippet crawled from GitHub
15 | ```
16 | public void range(IHypercube space, IvisitKDNode visitor){
17 | if(root == null) return;
18 | root.getRange(space, visitor);
19 | }
20 | ```
21 | The code above cannot be compiled for the following reasons:
22 | 1. The third-party libraries IHypercube and IvisitKDNode are missing.
23 | 2. The object root and its method getRange are missing.
24 |
25 | The missing third-party library problem can be solved by adding some empty interfaces (a Root class with a getRange method, an IHypercube class and an IvisitKDNode class), since the implementation details of the methods are not necessary.
26 | ### After adding the interfaces, the example source code can be successfully compiled:
27 | ```
28 | public class Range{
29 | private Root root;
30 | public void range(IHypercube space, IvisitKDNode visitor){
31 | if(root == null) return;
32 | root.getRange(space, visitor);
33 | }
34 | }
35 | class Root{
36 | public void getRange(IHypercube space, IvisitKDNode visitor){
37 | return;
38 | }
39 | }
40 | class IHypercube{}
41 | class IvisitKDNode{}
42 | ```
43 |
--------------------------------------------------------------------------------
/src/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_IREmbeder():
3 | conf = {
4 | # added_params
5 | 'transform_every_modal': 0,
6 | 'use_attn': 0,
7 | 'use_tanh': 1,
8 | 'save_attn_weight': 0,
9 |
10 | # GGNN
11 | 'state_dim': 512, # GGNN hidden state size
12 | 'annotation_dim': 300,
13 | 'n_edge_types': 2,
14 | 'n_node': 160, # maximum nodenum
15 | 'n_steps': 5, # number of GGNN propagation steps
16 | 'output_type': 'no_reduce',
17 | 'batch_size': 32,
18 | 'n_layers': 1,
19 | 'n_hidden': 512,
20 | 'ir_attn_mode': 'sigmoid_scalar',
21 | 'word_split': True,
22 | 'pooling_type': 'max_pooling', # ave_pooling
23 | 'max_word_num': 5,
24 |
25 | # data_params
26 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
27 | # training data
28 | 'train_ir':'train.ir.json',
29 | 'train_desc':'train.desc.h5',
30 | # test data
31 | 'test_ir':'test.ir.json',
32 | 'test_desc':'test.desc.h5',
33 |
34 | # parameters
35 | 'desc_len': 30,
36 | 'n_desc_words': 10000,
37 | 'n_ir_words': 15000,
38 | # vocabulary info
39 | 'vocab_ir':'vocab.ir.json',
40 | 'vocab_desc':'vocab.desc.json',
41 |
42 | #training_params
43 | 'nb_epoch': 100,
44 | #'optimizer': 'adam',
45 | 'learning_rate':0.0003, # try 1e-4(paper)
46 | 'adam_epsilon':1e-8,
47 | 'warmup_steps':5000,
48 | 'fp16': False,
49 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].
50 |
51 | # model_params
52 | 'emb_size': 300,
53 | # recurrent
54 | 'margin': 0.6,
55 | 'sim_measure':'cos',
56 | 'dropout': 0
57 | }
58 | return conf
59 |
60 |
--------------------------------------------------------------------------------
/src/data_loader.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import torch.utils.data as data
4 | import torch.nn as nn
5 | import tables
6 | import json
7 | import random
8 | import numpy as np
9 | import pickle
10 |
11 | from utils import PAD_ID, UNK_ID, indexes2sent
12 | import configs
13 | from util_ir import get_one_ir_npy_info
14 |
15 | import logging
16 | logger = logging.getLogger(__name__)
17 | logging.basicConfig(level=logging.INFO, format="%(message)s")
18 |
19 |
20 | class CodeSearchDataset(data.Dataset):
21 | """
22 | Dataset that has only positive samples.
23 | """
24 | def __init__(self, config, data_dir, f_irs, max_node_num, f_descs=None, max_desc_len=None):
25 |
26 | self.max_node_num = max_node_num
27 | self.max_desc_len = max_desc_len
28 |
29 | self.n_edge_types = config['n_edge_types']
30 | self.state_dim = config['state_dim']
31 | self.max_word_num = config['max_word_num']
32 |
33 | print("Loading Data...")
34 |
35 | self.graph_dict = json.loads(open(data_dir+f_irs, 'r').readline())
36 |
37 | table_desc = tables.open_file(data_dir+f_descs)
38 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long)
39 | self.idx_descs = table_desc.get_node('/indices')[:]
40 |
41 | assert len(self.graph_dict)==self.idx_descs.shape[0]
42 | self.data_len = self.idx_descs.shape[0]
43 | print("{} entries".format(self.data_len))
44 |
45 | def pad_seq(self, seq, maxlen):
46 | if len(seq) < maxlen:
47 | seq = np.append(seq, [PAD_ID]*(maxlen-len(seq)))
48 | seq = seq[:maxlen]
49 | return seq
50 |
51 | def __getitem__(self, offset):
52 | # anno:[n_node], adjmat:[n_node x (n_node*n_edge_types*2)], node_mask:[n_node]
53 | # node_num:[1], word_num: [n_node]
54 | anno, adjmat, node_mask= get_one_ir_npy_info(self.graph_dict[str(offset)],
55 | self.max_node_num, self.n_edge_types, self.max_word_num)
56 |
57 | anno = torch.from_numpy(anno).type(torch.LongTensor)
58 | adjmat = torch.from_numpy(adjmat).type(torch.FloatTensor)
59 | node_mask = torch.Tensor(node_mask)
60 |
61 | length, pos = self.idx_descs[offset][0], self.idx_descs[offset][1]
62 | good_desc_len = min(int(length), self.max_desc_len)
63 | good_desc = self.descs[pos: pos+good_desc_len]
64 | good_desc = self.pad_seq(good_desc, self.max_desc_len)
65 |
66 | rand_offset = random.randint(0, self.data_len-1)
67 | length, pos = self.idx_descs[rand_offset][0], self.idx_descs[rand_offset][1]
68 | bad_desc_len = min(int(length), self.max_desc_len)
69 | bad_desc = self.descs[pos: pos+bad_desc_len]
70 | bad_desc = self.pad_seq(bad_desc, self.max_desc_len)
71 |
72 | return anno, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len
73 |
74 | def __len__(self):
75 | return self.data_len
76 |
77 | def load_dict(filename):
78 | return json.loads(open(filename, "r").readline())
79 | #return pickle.load(open(filename, 'rb'))
80 |
81 |
82 | if __name__ == '__main__':
83 | device = 'cpu'
84 | config = getattr(configs, 'config_IREmbeder')()
85 | input_dir = './data/github1/'
86 |
87 | test_set = CodeSearchDataset(config, input_dir, 'test.ir.json', 160, 'test.desc.h5', 30)
88 | test_data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False, drop_last=False, num_workers=1)
89 | print('number of batch:\n', len(test_data_loader))
90 | print('============ Train Data ================')
91 | k = 0
92 |
93 | for batch in test_data_loader:
94 | #print(batch)
95 | anno, adjmat, node_mask, good_desc, good_desc_len, bad_desc, bad_desc_len = [tensor.to(device) for tensor in batch]
96 | #print(anno)
97 | print(adjmat)
98 | for i in range(0, 160):
99 | for j in range(0, 320):
100 | if adjmat[0][i][j] == 1:
101 | print(i, j)
102 | #print(node_num)
103 | #print(word_num)
104 | k+=1
105 | if k>0: break
106 |
107 |
108 |
--------------------------------------------------------------------------------
/src/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .iremb import IREmbeder
--------------------------------------------------------------------------------
/src/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/src/models/__pycache__/cfgemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/cfgemb.cpython-36.pyc
--------------------------------------------------------------------------------
/src/models/__pycache__/iremb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/iremb.cpython-36.pyc
--------------------------------------------------------------------------------
/src/models/__pycache__/jointemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/jointemb.cpython-36.pyc
--------------------------------------------------------------------------------
/src/models/__pycache__/tokenemb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/degraphcs/DeGraphCS/ef4d33fdcb8029b52c0f858bdcfe6b079dbead30/src/models/__pycache__/tokenemb.cpython-36.pyc
--------------------------------------------------------------------------------
/src/models/iremb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import numpy as np
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.init as weight_init
8 | import torch.nn.functional as F
9 |
10 | import logging
11 | logger = logging.getLogger(__name__)
12 | parentPath = os.path.abspath("..")
13 | sys.path.insert(0, parentPath) # add parent folder to path so as to import common modules
14 | from modules import GGNN, SeqEncoder
15 |
16 | class IREmbeder(nn.Module):
17 | def __init__(self, config):
18 | super(IREmbeder, self).__init__()
19 |
20 | self.conf = config
21 |
22 | self.margin = config['margin']
23 | self.dropout = config['dropout']
24 |
25 | self.n_desc_words = config['n_desc_words']
26 | self.emb_size = config['emb_size']
27 | self.n_hidden = config['n_hidden']
28 | self.ir_attn_mode = config['ir_attn_mode']
29 |
30 | self.ir_encoder = GGNN(self.conf)
31 | self.desc_encoder = SeqEncoder(self.n_desc_words, self.emb_size, self.n_hidden)
32 |
33 | self.linear_attn_out = nn.Sequential(nn.Linear(self.n_hidden, self.n_hidden),
34 | nn.Tanh(),
35 | nn.Linear(self.n_hidden, self.n_hidden))
36 |
37 | if self.conf['transform_every_modal']:
38 | self.linear_single_modal = nn.Sequential(nn.Linear(self.n_hidden, self.n_hidden),
39 | nn.Tanh(),
40 | nn.Linear(self.n_hidden, self.n_hidden))
41 |
42 | if self.conf['save_attn_weight']:
43 | self.attn_weight_torch = []
44 | self.node_mask_torch = []
45 |
46 | self.self_attn = nn.Linear(self.n_hidden, self.n_hidden)
47 | self.self_attn_scalar = nn.Linear(self.n_hidden, 1)
48 |
49 |
50 | def code_encoding(self, ir_init_input_batch, ir_adjmat_batch, ir_node_mask):
51 | batch_size = ir_node_mask.size()[0]
52 |
53 | # code_feat: [batch_size x n_node x state_dim]
54 | code_feat = self.ir_encoder(ir_init_input_batch, ir_adjmat_batch) # forward(annotation, A)
55 |
56 | node_num = code_feat.size()[1] # n_node
57 | code_feat = code_feat.reshape(-1, node_num, self.n_hidden)
58 | # mask_1forgt0: [batch_size x n_node]
59 | mask_1forgt0 = ir_node_mask.bool().reshape(-1, node_num)
60 |
61 | if self.conf['transform_every_modal']:
62 | code_feat = torch.tanh(
63 | self.linear_single_modal(F.dropout(code_feat, self.dropout, training=self.training)))
64 |
65 | code_sa_tanh = torch.tanh(self.self_attn(code_feat.reshape(-1, self.n_hidden))) # [(batch_size * n_node) x n_hidden]
66 | code_sa_tanh = F.dropout(code_sa_tanh, self.dropout, training=self.training)
67 | # code_sa_tanh: [batch_size x n_node]
68 | code_sa_tanh = self.self_attn_scalar(code_sa_tanh).reshape(-1, node_num)
69 |
70 | code_feat = code_feat.reshape(-1, node_num, self.n_hidden)
71 | batch_size = code_feat.size()[0]
72 |
73 | self_attn_code_feat = None
74 | for _i in range(batch_size):
75 | # code_sa_tanh_one: [1 x real_node_num]
76 | code_sa_tanh_one = torch.masked_select(code_sa_tanh[_i, :], mask_1forgt0[_i, :]).reshape(1, -1)
77 |
78 | if self.ir_attn_mode == 'sigmoid_scalar':
79 | # attn_w_one: [1 x 1 x real_node_num]
80 | attn_w_one = torch.sigmoid(code_sa_tanh_one).reshape(1, 1, -1)
81 | else:
82 | attn_w_one = F.softmax(code_sa_tanh_one, dim=1).reshape(1, 1, -1)
83 |
84 | if self.conf['save_attn_weight']:
85 | self.attn_weight_torch.append(attn_w_one.detach().reshape(1, -1).cpu())
86 | self.node_mask_torch.append(mask_1forgt0[_i, :].detach().reshape(1, -1).cpu())
87 |
88 | # attn_feat_one: [1 x real_node_num x n_hidden]
89 | attn_feat_one = torch.masked_select(code_feat[_i, :, :].reshape(1, node_num, self.n_hidden),
90 | mask_1forgt0[_i, :].reshape(1, node_num, 1)).reshape(1, -1, self.n_hidden)
91 | # out_to_cat: [1 x n_hidden]
92 | out_to_cat = torch.bmm(attn_w_one, attn_feat_one).reshape(1, self.n_hidden)
93 | # self_attn_code_feat: [batch_size x n_hidden]
94 | self_attn_code_feat = out_to_cat if self_attn_code_feat is None else torch.cat(
95 | (self_attn_code_feat, out_to_cat), 0)
96 |
97 | if self.conf['use_attn']:
98 | self_attn_code_feat = torch.tanh(
99 | self.linear_attn_out(
100 | F.dropout(self_attn_code_feat, self.dropout, training=self.training))
101 | )
102 | elif self.conf['use_tanh']:
103 | self_attn_code_feat = torch.tanh(self_attn_code_feat)
104 |
105 | # self_attn_code_feat: [batch_size x n_hidden]
106 | return self_attn_code_feat
107 |
108 | def desc_encoding(self, desc, desc_len):
109 | batch_size = desc.size()[0]
110 | desc_enc_hidden = self.desc_encoder.init_hidden(batch_size)
111 | # desc_enc_hidden: [2 x batch_size x n_hidden]
112 | _, desc_enc_hidden = self.desc_encoder(desc, desc_len)
113 | # desc_feat: [batch_size x n_hidden]
114 | desc_feat = desc_enc_hidden[0].reshape(batch_size, self.n_hidden)
115 |
116 | if self.conf['transform_every_modal']:
117 | desc_feat = torch.tanh(
118 | self.linear_single_modal(
119 | F.dropout(desc_feat, self.dropout, training=self.training)
120 | )
121 | )
122 | elif self.conf['use_tanh']:
123 | desc_feat = torch.tanh(desc_feat)
124 |
125 | # desc_feat: [batch_size x n_hidden]
126 | return desc_feat
127 |
128 |
129 | def forward(self, ir_anno, ir_adjmat, ir_node_mask, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len):
130 | # code_repr: [batch_size x n_hidden]
131 | ir_repr = self.code_encoding(ir_anno, ir_adjmat, ir_node_mask)
132 | # desc_repr: [batch_size x n_hidden]
133 | desc_anchor_repr = self.desc_encoding(desc_anchor, desc_anchor_len)
134 | desc_neg_repr = self.desc_encoding(desc_neg, desc_neg_len)
135 |
136 | # sim: [batch_sz]
137 | anchor_sim = F.cosine_similarity(ir_repr, desc_anchor_repr)
138 | neg_sim = F.cosine_similarity(ir_repr, desc_neg_repr)
139 |
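# hinge-style ranking loss: encourage sim(ir, matching desc) to exceed sim(ir, random negative desc) by at least `margin`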
140 | loss = (self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean()
141 |
142 | return loss
143 |
--------------------------------------------------------------------------------
/src/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | from tqdm import tqdm
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format="%(message)s")
12 |
13 | import torch
14 |
15 | import models, configs, data_loader
16 | from modules import get_cosine_schedule_with_warmup
17 | from utils import similarity, normalize
18 | from data_loader import *
19 |
20 |
21 | def test(config, model, device):
22 | logger.info('Test Begin...')
23 |
24 | model.eval()
25 | model.to(device)
26 |
27 | # load data
28 | data_path = args.data_path+args.dataset+'/'
29 | test_set = eval(config['dataset_name'])(config, data_path,
30 | config['test_ir'], config['n_node'],
31 | config['test_desc'], config['desc_len'])
32 | data_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=32,
33 | shuffle=False, drop_last=False, num_workers=1)
34 | # encode tokens and descs
35 | code_reprs, desc_reprs = [], []
36 | n_processed = 0
37 | for batch in data_loader:
38 | # batch[0:3]: init_input, adjmat, node_mask
39 | code_batch = [tensor.to(device) for tensor in batch[:3]]
40 | # batch[3:5]: good_desc, good_desc_len
41 | desc_batch = [tensor.to(device) for tensor in batch[3:5]]
42 | with torch.no_grad():
43 | code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
44 | desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
45 | # normalize when sim_measure=='cos'
46 | code_repr = normalize(code_repr)
47 | desc_repr = normalize(desc_repr)
48 | code_reprs.append(code_repr)
49 | desc_reprs.append(desc_repr)
50 | n_processed += batch[0].size(0) # +batch_size
51 | # code_reprs: [n_processed x n_hidden]
52 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
53 |
54 | # calculate similarity
55 | sum_1, sum_5, sum_10, sum_mrr = [], [], [], []
56 | test_sim_result, test_rank_result = [], []
57 | for i in tqdm(range(0, n_processed)):
58 | desc_vec = np.expand_dims(desc_reprs[i], axis=0) # [1 x n_hidden]
59 | sims = np.dot(code_reprs, desc_vec.T)[:,0] # [n_processed]
60 | negsims = np.negative(sims)
61 | predict = np.argsort(negsims)
62 |
63 | # SuccessRate@k
64 | predict_1, predict_5, predict_10 = [int(predict[0])], [int(k) for k in predict[0:5]], [int(k) for k in predict[0:10]]
65 | sum_1.append(1.0 if i in predict_1 else 0.0)
66 | sum_5.append(1.0 if i in predict_5 else 0.0)
67 | sum_10.append(1.0 if i in predict_10 else 0.0)
68 | # MRR
69 | predict_list = predict.tolist()
70 | rank = predict_list.index(i)
71 | sum_mrr.append(1/float(rank+1))
72 |
73 | # results need to be saved
74 | predict_20 = [int(k) for k in predict[0:20]]
75 | sim_20 = [sims[k] for k in predict_20]
76 | test_sim_result.append(list(zip(predict_20, sim_20)))  # materialize the pairs so they can be saved with np.save
77 | test_rank_result.append(rank+1)
78 |
79 | logger.info(f'R@1={np.mean(sum_1)}, R@5={np.mean(sum_5)}, R@10={np.mean(sum_10)}, MRR={np.mean(sum_mrr)}')
80 | save_path = args.data_path + 'result/'
81 | sim_result_filename, rank_result_filename = 'sim.npy', 'rank.npy'
82 | np.save(save_path+sim_result_filename, test_sim_result)
83 | np.save(save_path+rank_result_filename, test_rank_result)
84 |
85 |
86 | def parse_args():
87 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
88 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
89 | parser.add_argument('--model', type=str, default='IREmbeder', help='model name')
90 | parser.add_argument('-d', '--dataset', type=str, default='github1', help='name of dataset.java, python')
91 | parser.add_argument('--reload_from', type=int, default=100, help='epoch to reload from')
92 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
93 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
94 | return parser.parse_args()
95 |
96 |
97 | if __name__ == '__main__':
98 | args = parse_args()
99 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
100 | config = getattr(configs, 'config_'+args.model)()
101 |
102 | ##### Define model ######
103 | logger.info('Constructing Model..')
104 | model = getattr(models, args.model)(config) # initialize the model
105 | ckpt=f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
106 | model.load_state_dict(torch.load(ckpt, map_location=device))
107 |
108 | test(config, model, device)
109 |
110 |
111 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, UNK_ID = [0, 1]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 #(code_vec, desc_vec.T)
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 | elif measure in ['euc', 'gesd', 'aesd']:
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 | return '%d:%d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 | import nltk
64 | try: nltk.word_tokenize("hello world")
65 | except LookupError: nltk.download('punkt')
66 |
67 | def sent2indexes(sentence, vocab, maxlen):
68 | '''sentence: a string or list of string
69 | return: a numpy array of word indices
70 | '''
71 | def convert_sent(sent, vocab, maxlen):
72 | idxes = np.zeros(maxlen, dtype=np.int64)
73 | idxes.fill(PAD_ID)
74 | tokens = nltk.word_tokenize(sent.strip())
75 | idx_len = min(len(tokens), maxlen)
76 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID)
77 | return idxes, idx_len
78 | if type(sentence) is list:
79 | inds, lens = [], []
80 | for sent in sentence:
81 | idxes, idx_len = convert_sent(sent, vocab, maxlen)
82 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len])
83 | inds.append(idxes)
84 | lens.append(idx_len)
85 | return np.vstack(inds), np.vstack(lens)
86 | else:
87 | inds, lens = sent2indexes([sentence], vocab, maxlen)
88 | return inds[0], lens[0]
89 |
90 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
91 | '''indexes: numpy array'''
92 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
93 | indexes=filter(lambda i: i!=ignore_tok, indexes)
94 | toks, length = [], 0
95 | for idx in indexes:
96 | toks.append(ivocab.get(idx, '<unk>'))
97 | length+=1
98 | return ' '.join(toks), length
99 |
100 | ivocab = {v: k for k, v in vocab.items()}
101 | if indexes.ndim==1:# one sentence
102 | return revert_sent(indexes, ivocab, ignore_tok)
103 | else:# dim>1
104 | sentences, lens =[], [] # a batch of sentences
105 | for inds in indexes:
106 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
107 | sentences.append(sentence)
108 | lens.append(length)
109 | return sentences, lens
110 |
111 | ########################################################################
112 |
--------------------------------------------------------------------------------
/user study/queries.txt:
--------------------------------------------------------------------------------
1 | write a byte to output buffer of device
2 | insert a new value into list
3 | check string is suffix of another string
4 | remove all elements in list
5 | stop logging messages to syslog
6 | render bignum into decimal
7 | look for match name in system dictionary
8 | remove an element from a row vector
9 | load a bignum from int
10 | grab a lock on a mutex
11 | advance a carray cursor to next row of output
12 | split string into args respecting backwhack and quote
13 | retrieve page from the pager cache
14 | round up to nearest integer
15 | create a new thread safe queue of size siz
16 | push integer to end of list
17 | restart timer from current point in time
18 | return file name in path
19 | get name of current executable
20 | read machine uptime
21 | store integer into register
22 | stop stream server
23 | get length of UCS2 string
24 | contrain maximum of a range
25 | pad given buffer with len padding characters
26 | write data in output buffers to client
27 | generate trace call to print
28 | get dimensions of given bmp file
29 | add extension to filename
30 | read one word from onboard RAM
31 | encode ucs2 string into utf8 string
32 | check pointer is in the heap
33 | search a file in directory recursively
34 | compress block of raw data
35 | allocate and clean buffer
36 | receive N byte from socket
37 | lookup key in a hash map
38 | fast integral power function
39 | check if directory is empty
40 | create a new tree node
41 | free a dirty page
42 | create message with given type
43 | encrypt byte sequence
44 | parse checksum file
45 | remove trailing blanks, tabs and newlines
46 | search last occurrence of char in string
47 | binary search in sorted array of size
48 | calculate checksum of checkpoint
49 | judge whether two strings are equal
50 | return random integer value between min and max
--------------------------------------------------------------------------------
/user study/readme.md:
--------------------------------------------------------------------------------
1 | # The questionnaire to select the participants
2 | ```
3 | 1) What grade are you in?
4 | 2) Have you taken C language courses in the past few years?
5 | 3) Do you have C language programming experience, if so, how long is the programming experience?
6 | 4) Which C language projects have you participated in? Please introduce them.
7 | ```
8 | # The contents of each file
9 | ```
10 | queries.txt: the 50 queries randomly selected from our test set with further filtering (e.g., removing clear technical keywords)
11 | code_search_DeGraphCS.txt: the top-10 search results returned by DeGraphCS
12 | code_search_DeepCS.txt: the top-10 search results returned by DeepCS
13 | code_search_MMAN.txt: the top-10 search results returned by MMAN
14 | code_search_UNIF.txt: the top-10 search results returned by UNIF
15 | ```
16 |
--------------------------------------------------------------------------------