├── LICENSE.md ├── README.md ├── keras ├── LICENSE.md ├── README.md ├── configs.py ├── data │ ├── example │ │ ├── test.apiseq.h5 │ │ ├── test.apiseq.txt │ │ ├── test.desc.h5 │ │ ├── test.desc.txt │ │ ├── test.meta.txt │ │ ├── test.methname.h5 │ │ ├── test.methname.txt │ │ ├── test.tokens.h5 │ │ ├── test.tokens.txt │ │ ├── train.apiseq.h5 │ │ ├── train.apiseq.txt │ │ ├── train.desc.h5 │ │ ├── train.desc.txt │ │ ├── train.methname.h5 │ │ ├── train.methname.txt │ │ ├── train.tokens.h5 │ │ ├── train.tokens.txt │ │ ├── use.apiseq.h5 │ │ ├── use.apiseq.txt │ │ ├── use.desc.h5 │ │ ├── use.desc.txt │ │ ├── use.methname.h5 │ │ ├── use.methname.txt │ │ ├── use.tokens.h5 │ │ ├── use.tokens.txt │ │ ├── vocab.apiseq.pkl │ │ ├── vocab.desc.pkl │ │ ├── vocab.methname.pkl │ │ └── vocab.tokens.pkl │ └── github │ │ ├── test.apiseq.h5 │ │ ├── test.desc.h5 │ │ ├── test.methname.h5 │ │ ├── test.rawcode.txt │ │ ├── test.tokens.h5 │ │ ├── train.apiseq.h5 │ │ ├── train.desc.h5 │ │ ├── train.methname.h5 │ │ ├── train.tokens.h5 │ │ ├── use.apiseq.h5 │ │ ├── use.codevecs.normalized.h5 │ │ ├── use.methname.h5 │ │ ├── use.rawcode.txt │ │ ├── use.tokens.h5 │ │ ├── vocab.apiseq.pkl │ │ ├── vocab.desc.pkl │ │ ├── vocab.methname.pkl │ │ └── vocab.tokens.pkl ├── data_loader.py ├── main.py ├── models.py ├── requirements.txt ├── results │ └── results.xlsx └── utils.py └── pytorch ├── LICENSE.md ├── README.md ├── automl_config.yaml ├── configs.py ├── data ├── example │ ├── test.apiseq.h5 │ ├── test.apiseq.txt │ ├── test.desc.h5 │ ├── test.desc.txt │ ├── test.meta.txt │ ├── test.methname.h5 │ ├── test.methname.txt │ ├── test.tokens.h5 │ ├── test.tokens.txt │ ├── train.apiseq.h5 │ ├── train.apiseq.txt │ ├── train.desc.h5 │ ├── train.desc.txt │ ├── train.methname.h5 │ ├── train.methname.txt │ ├── train.tokens.h5 │ ├── train.tokens.txt │ ├── use.apiseq.h5 │ ├── use.apiseq.txt │ ├── use.desc.h5 │ ├── use.desc.txt │ ├── use.methname.h5 │ ├── use.methname.txt │ ├── use.tokens.h5 │ ├── use.tokens.txt │ ├── vocab.apiseq.pkl │ ├── vocab.desc.pkl │ ├── vocab.methname.pkl │ └── vocab.tokens.pkl └── github │ ├── train.apiseq.h5 │ ├── train.desc.h5 │ ├── train.methname.h5 │ ├── train.tokens.h5 │ ├── use.apiseq.h5 │ ├── use.methname.h5 │ ├── use.rawcode.txt │ ├── use.tokens.h5 │ ├── vocab.apiseq.json │ ├── vocab.desc.json │ ├── vocab.name.json │ └── vocab.tokens.json ├── data_loader.py ├── models ├── __init__.py └── jointemb.py ├── modules.py ├── repr_code.py ├── requirements.txt ├── search.py ├── setup.py ├── train.py └── utils.py /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Xiaodong Gu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Code Search 2 | 3 | Code for the ICSE 2018 paper [Deep Code Search](https://guxd.github.io/papers/deepcs.pdf). 4 | 5 | ## Two Versions 6 | We release both ```Keras``` and ```PyTorch``` implementations of our approach, in the ```keras``` and ```pytorch``` folders, respectively. 7 | 8 | - The ```Keras``` folder contains the code used to run the experiments presented in the paper. The code is frozen to what it was when we originally wrote the paper. (NOTE: we modified some deprecated API invocations to fit the latest Keras and Theano). 9 | 10 | - The ```PyTorch``` folder is the bleeding-edge repository where we packaged the approach up, improved the code quality, and added some features. 11 | 12 | ⚠️ **Note that the PyTorch version is problematic at present. If you want to replicate DeepCS as a baseline model, we highly recommend checking out the Keras version; this can save you a great deal of time and effort**. 13 | 14 | 🤗 Nevertheless, if you are interested in using and improving DeepCS, check out the PyTorch version and feel free to contribute. 15 | 16 | For more information, please refer to the README file in each folder. 17 | 18 | 19 | 20 | ## Tool Demo 21 | 22 | An online tool demo used to be available at http://211.249.63.55:81/ (currently unavailable due to budget constraints). 23 | 24 | ## Citation 25 | If you find this work useful and would like to cite it, the following would be appropriate: 26 | ```bibtex 27 | @inproceedings{gu2018deepcs, 28 | title={Deep Code Search}, 29 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun}, 30 | booktitle={Proceedings of the 2018 40th International Conference on Software Engineering (ICSE 2018)}, 31 | year={2018}, 32 | organization={ACM} 33 | } 34 | ``` 35 | -------------------------------------------------------------------------------- /keras/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Xiaodong Gu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /keras/README.md: -------------------------------------------------------------------------------- 1 | # Deep Code Search 2 | A Keras implementation of the paper [Deep Code Search](https://guxd.github.io/papers/deepcs.pdf). 3 | 4 | ## Dependency 5 | > Tested in Ubuntu 16.04 6 | * Python 3.6 7 | * Keras 2.3.1 or newer 8 | * TensorFlow 2.0.0 or Theano 0.8.0~0.9.1 9 | 10 | ## Code Structures 11 | 12 | - `models.py`: Neural network models for code/desc representation and similarity measure. 13 | 14 | - `main.py`: The main entry for code search, including four sub-tasks: 15 | 1) Train: train the code/desc representation models; 16 | 2) Eval: evaluate the learnt code/desc representation models; 17 | 3) Code Embedding: encode code into vectors and store them to a file; 18 | 4) Search: search relevant code for a given query. 19 | 20 | - `configs.py`: Configurations for the models defined in `models.py`. 21 | Each function defines the hyperparameters for the corresponding model. 22 | 23 | 24 | ## Usage 25 | 26 | ### Data Preparation 27 | The `/data` folder provides a small dummy dataset for quick deployment. 28 | To train and test our model: 29 | 30 | 1) Download and unzip the real dataset from [Google Drive](https://drive.google.com/drive/folders/1GZYLT_lzhlVczXjD6dgwVUvDDPHMB6L7?usp=sharing) or [Baidu Pan](https://pan.baidu.com/s/1U_MtFXqq0C-Qh8WUFAWGvg) for Chinese users. 31 | 32 | 2) Replace each file in the `/data` folder with the corresponding real file. 33 | 34 | ### Configuration 35 | 36 | Edit hyper-parameters and settings in `configs.py` 37 | 38 | ### Train 39 | 40 | ```bash 41 | python main.py --mode train 42 | ``` 43 | 44 | ### Code Embedding 45 | 46 | First, set `reload` in `configs.py` to the epoch number of the optimal checkpoint, e.g., 500 47 | 48 | Then, run 49 | ```bash 50 | python main.py --mode repr_code 51 | ``` 52 | 53 | ### Search 54 | 55 | First, set `reload` in `configs.py` to the epoch number of the optimal checkpoint, e.g., 500 56 | 57 | Then, run 58 | ```bash 59 | python main.py --mode search 60 | ``` 61 | 62 | ## Tool Demo 63 | 64 | An online tool demo can be found at http://211.249.63.55:81/ (Unavailable Now) 65 | 66 | ## Citation 67 | If you find it useful and would like to cite it, the following would be appropriate: 68 | ``` 69 | @inproceedings{gu2018deepcs, 70 | title={Deep Code Search}, 71 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun}, 72 | booktitle={Proceedings of the 2018 40th International Conference on Software Engineering (ICSE 2018)}, 73 | year={2018}, 74 | organization={ACM} 75 | } 76 | ``` 77 | -------------------------------------------------------------------------------- /keras/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_JointEmbeddingModel(): 3 | config = { 4 | 'data_params':{ 5 | #training data 6 | 'train_methname':'train.methname.h5', 7 | 'train_apiseq':'train.apiseq.h5', 8 | 'train_tokens':'train.tokens.h5', 9 | 'train_desc':'train.desc.h5', 10 | #valid data 11 | 'valid_methname':'test.methname.h5', 12 | 'valid_apiseq':'test.apiseq.h5', 13 | 'valid_tokens':'test.tokens.h5', 14 | 'valid_desc':'test.desc.h5', 15 | #use data (computing code vectors) 16 | 'use_codebase':'use.rawcode.txt',#'use.rawcode.h5' 17 | 'use_methname':'use.methname.h5', 18 | 'use_apiseq':'use.apiseq.h5', 19 | 'use_tokens':'use.tokens.h5', 20 | #results data(code vectors) 21 | 'use_codevecs':'use.codevecs.normalized.h5',#'use.codevecs.h5', 22
| 23 | #parameters 24 | 'methname_len': 6, 25 | 'apiseq_len':30, 26 | 'tokens_len':50, 27 | 'desc_len': 30, 28 | 'n_words': 10000, # len(vocabulary) + 1 29 | #vocabulary info 30 | 'vocab_methname':'vocab.methname.pkl', 31 | 'vocab_apiseq':'vocab.apiseq.pkl', 32 | 'vocab_tokens':'vocab.tokens.pkl', 33 | 'vocab_desc':'vocab.desc.pkl', 34 | }, 35 | 'training_params': { 36 | 'batch_size': 128, 37 | 'chunk_size':100000, 38 | 'nb_epoch': 2000, 39 | 'validation_split': 0.2, 40 | 'optimizer': 'adam', 41 | # 'optimizer': Adam(clip_norm=0.1), 42 | 'valid_every': 5, 43 | 'n_eval': 100, 44 | 'evaluate_all_threshold': { 45 | 'mode': 'all', 46 | 'top1': 0.4, 47 | }, 48 | 'save_every': 10, 49 | 'reload':-1, #epoch that the model is reloaded from . If reload=0, then train from scratch 50 | }, 51 | 52 | 'model_params': { 53 | 'n_embed_dims': 100, 54 | 'n_hidden': 400,#number of hidden dimension of code/desc representation 55 | # recurrent 56 | 'n_lstm_dims': 200, # * 2 57 | 'init_embed_weights_methname': None,#'word2vec_100_methname.h5', 58 | 'init_embed_weights_tokens': None,#'word2vec_100_tokens.h5', 59 | 'init_embed_weights_desc': None,#'word2vec_100_desc.h5', 60 | 'margin': 0.05, 61 | 'sim_measure':'cos',#similarity measure: gesd, cos, aesd 62 | } 63 | } 64 | return config 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /keras/data/example/test.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.apiseq.h5 -------------------------------------------------------------------------------- /keras/data/example/test.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.desc.h5 -------------------------------------------------------------------------------- /keras/data/example/test.meta.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.meta.txt -------------------------------------------------------------------------------- /keras/data/example/test.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.methname.h5 -------------------------------------------------------------------------------- /keras/data/example/test.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.tokens.h5 -------------------------------------------------------------------------------- /keras/data/example/train.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.apiseq.h5 -------------------------------------------------------------------------------- /keras/data/example/train.desc.h5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.desc.h5 -------------------------------------------------------------------------------- /keras/data/example/train.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.methname.h5 -------------------------------------------------------------------------------- /keras/data/example/train.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.tokens.h5 -------------------------------------------------------------------------------- /keras/data/example/use.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.apiseq.h5 -------------------------------------------------------------------------------- /keras/data/example/use.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.desc.h5 -------------------------------------------------------------------------------- /keras/data/example/use.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.methname.h5 -------------------------------------------------------------------------------- /keras/data/example/use.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.tokens.h5 -------------------------------------------------------------------------------- /keras/data/example/vocab.apiseq.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.apiseq.pkl -------------------------------------------------------------------------------- /keras/data/example/vocab.desc.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.desc.pkl -------------------------------------------------------------------------------- /keras/data/example/vocab.methname.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.methname.pkl -------------------------------------------------------------------------------- /keras/data/example/vocab.tokens.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.tokens.pkl -------------------------------------------------------------------------------- /keras/data/github/test.apiseq.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.apiseq.h5 -------------------------------------------------------------------------------- /keras/data/github/test.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.desc.h5 -------------------------------------------------------------------------------- /keras/data/github/test.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.methname.h5 -------------------------------------------------------------------------------- /keras/data/github/test.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.tokens.h5 -------------------------------------------------------------------------------- /keras/data/github/train.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.apiseq.h5 -------------------------------------------------------------------------------- /keras/data/github/train.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.desc.h5 -------------------------------------------------------------------------------- /keras/data/github/train.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.methname.h5 -------------------------------------------------------------------------------- /keras/data/github/train.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.tokens.h5 -------------------------------------------------------------------------------- /keras/data/github/use.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.apiseq.h5 -------------------------------------------------------------------------------- /keras/data/github/use.codevecs.normalized.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.codevecs.normalized.h5 -------------------------------------------------------------------------------- /keras/data/github/use.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.methname.h5 
-------------------------------------------------------------------------------- /keras/data/github/use.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.tokens.h5 -------------------------------------------------------------------------------- /keras/data/github/vocab.apiseq.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.apiseq.pkl -------------------------------------------------------------------------------- /keras/data/github/vocab.desc.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.desc.pkl -------------------------------------------------------------------------------- /keras/data/github/vocab.methname.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.methname.pkl -------------------------------------------------------------------------------- /keras/data/github/vocab.tokens.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.tokens.pkl -------------------------------------------------------------------------------- /keras/data_loader.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import codecs 3 | import tables 4 | import numpy as np 5 | from tqdm import tqdm 6 | import logging 7 | logger = logging.getLogger(__name__) 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") 9 | 10 | 11 | def load_pickle(filename): 12 | return pickle.load(open(filename, 'rb')) 13 | 14 | ##### Data Set ##### 15 | def load_codebase(path, chunk_size): 16 | """load codebase 17 | codefile: h5 file that stores raw code 18 | """ 19 | logger.info('Loading codebase (chunk size={})..'.format(chunk_size)) 20 | codebase=[] 21 | #codes=codecs.open(self.path+self.data_params['use_codebase']).readlines() 22 | codes=codecs.open(path, encoding='utf8',errors='replace').readlines() 23 | #use codecs to read in case of encoding problem 24 | for i in tqdm(range(0,len(codes), chunk_size)): 25 | codebase.append(codes[i:i+chunk_size]) 26 | return codebase 27 | 28 | ### Results Data ### 29 | def load_code_reprs(path, chunk_size): 30 | logger.debug(f'Loading code vectors (chunk size={chunk_size})..') 31 | """reads vectors (2D numpy array) from a hdf5 file""" 32 | codereprs=[] 33 | h5f = tables.open_file(path) 34 | vecs = h5f.root.vecs 35 | for i in range(0, len(vecs), chunk_size): 36 | codereprs.append(vecs[i: i+ chunk_size]) 37 | h5f.close() 38 | return codereprs 39 | 40 | def save_code_reprs(vecs, path): 41 | npvecs=np.array(vecs) 42 | fvec = tables.open_file(path, 'w') 43 | atom = tables.Atom.from_dtype(npvecs.dtype) 44 | filters = tables.Filters(complib='blosc', complevel=5) 45 | ds = fvec.create_carray(fvec.root, 'vecs', atom, npvecs.shape,filters=filters) 46 | ds[:] = npvecs 47 | fvec.close() 48 | 49 | def load_hdf5(vecfile, 
start_offset, chunk_size): 50 | """reads training sentences(list of int array) from a hdf5 file""" 51 | table = tables.open_file(vecfile) 52 | data = table.get_node('/phrases')[:].astype(np.int) 53 | index = table.get_node('/indices')[:] 54 | data_len = index.shape[0] 55 | if chunk_size==-1:#if chunk_size is set to -1, then, load all data 56 | chunk_size=data_len 57 | start_offset = start_offset%data_len 58 | logger.debug("{} entries".format(data_len)) 59 | logger.debug("starting from offset {} to {}".format(start_offset,start_offset+chunk_size)) 60 | sents = [] 61 | for offset in tqdm(range(start_offset, start_offset+chunk_size)): 62 | offset = offset%data_len 63 | len, pos = index[offset]['length'], index[offset]['pos'] 64 | sents.append(data[pos:pos + len]) 65 | table.close() 66 | return sents 67 | -------------------------------------------------------------------------------- /keras/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import traceback 5 | from tensorflow.keras.optimizers import RMSprop, Adam 6 | from scipy.stats import rankdata 7 | import math 8 | import numpy as np 9 | from tqdm import tqdm 10 | import argparse 11 | random.seed(42) 12 | import threading 13 | import configs 14 | import logging 15 | logger = logging.getLogger(__name__) 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") 17 | 18 | from utils import normalize, pad, convert, revert 19 | import models, configs, data_loader 20 | 21 | class SearchEngine: 22 | def __init__(self, args, conf=None): 23 | self.data_path = args.data_path + args.dataset+'/' 24 | self.train_params = conf.get('training_params', dict()) 25 | self.data_params = conf.get('data_params',dict()) 26 | self.model_params = conf.get('model_params',dict()) 27 | 28 | self._eval_sets = None 29 | 30 | self._code_reprs = None 31 | self._codebase = None 32 | self._codebase_chunksize = 2000000 33 | 34 | ##### Model Loading / saving ##### 35 | def save_model(self, model, epoch): 36 | model_path = f"./output/{model.__class__.__name__}/models/" 37 | os.makedirs(model_path, exist_ok=True) 38 | model.save(model_path + f"epo{epoch}_code.h5", model_path + f"epo{epoch}_desc.h5", overwrite=True) 39 | 40 | def load_model(self, model, epoch): 41 | model_path = f"./output/{model.__class__.__name__}/models/" 42 | assert os.path.exists(model_path + f"epo{epoch}_code.h5"),f"Weights at epoch {epoch} not found" 43 | assert os.path.exists(model_path + f"epo{epoch}_desc.h5"),f"Weights at epoch {epoch} not found" 44 | model.load(model_path + f"epo{epoch}_code.h5", model_path + f"epo{epoch}_desc.h5") 45 | 46 | 47 | ##### Training ##### 48 | def train(self, model): 49 | if self.train_params['reload']>0: 50 | self.load_model(model, self.train_params['reload']) 51 | valid_every = self.train_params.get('valid_every', None) 52 | save_every = self.train_params.get('save_every', None) 53 | batch_size = self.train_params.get('batch_size', 128) 54 | nb_epoch = self.train_params.get('nb_epoch', 10) 55 | split = self.train_params.get('validation_split', 0) 56 | 57 | val_loss = {'loss': 1., 'epoch': 0} 58 | chunk_size = self.train_params.get('chunk_size', 100000) 59 | 60 | for i in range(self.train_params['reload']+1, nb_epoch): 61 | print('Epoch %d :: \n' % i, end='') 62 | 63 | logger.debug('loading data chunk..') 64 | offset = (i-1)*self.train_params.get('chunk_size', 100000) 65 | 66 | names = 
data_loader.load_hdf5(self.data_path+self.data_params['train_methname'], offset, chunk_size) 67 | apis = data_loader.load_hdf5(self.data_path+self.data_params['train_apiseq'], offset, chunk_size) 68 | tokens = data_loader.load_hdf5(self.data_path+self.data_params['train_tokens'], offset, chunk_size) 69 | descs = data_loader.load_hdf5(self.data_path+self.data_params['train_desc'], offset, chunk_size) 70 | 71 | logger.debug('padding data..') 72 | methnames = pad(names, self.data_params['methname_len']) 73 | apiseqs = pad(apis, self.data_params['apiseq_len']) 74 | tokens = pad(tokens, self.data_params['tokens_len']) 75 | good_descs = pad(descs,self.data_params['desc_len']) 76 | bad_descs=[desc for desc in descs] 77 | random.shuffle(bad_descs) 78 | bad_descs = pad(bad_descs, self.data_params['desc_len']) 79 | 80 | hist = model.fit([methnames, apiseqs, tokens, good_descs, bad_descs], epochs=1, batch_size=batch_size, validation_split=split) 81 | 82 | if hist.history['val_loss'][0] < val_loss['loss']: 83 | val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i} 84 | print('Best: Loss = {}, Epoch = {}'.format(val_loss['loss'], val_loss['epoch'])) 85 | 86 | if save_every is not None and i % save_every == 0: 87 | self.save_model(model, i) 88 | 89 | if valid_every is not None and i % valid_every == 0: 90 | acc, mrr, map, ndcg = self.valid(model, 1000, 1) 91 | 92 | ##### Evaluation in the develop set ##### 93 | def valid(self, model, poolsize, K): 94 | """ 95 | validate in a code pool. 96 | param: poolsize - size of the code pool, if -1, load the whole test set 97 | """ 98 | def ACC(real,predict): 99 | sum=0.0 100 | for val in real: 101 | try: index=predict.index(val) 102 | except ValueError: index=-1 103 | if index!=-1: sum=sum+1 104 | return sum/float(len(real)) 105 | def MAP(real,predict): 106 | sum=0.0 107 | for id,val in enumerate(real): 108 | try: index=predict.index(val) 109 | except ValueError: index=-1 110 | if index!=-1: sum=sum+(id+1)/float(index+1) 111 | return sum/float(len(real)) 112 | def MRR(real,predict): 113 | sum=0.0 114 | for val in real: 115 | try: index=predict.index(val) 116 | except ValueError: index=-1 117 | if index!=-1: sum=sum+1.0/float(index+1) 118 | return sum/float(len(real)) 119 | def NDCG(real,predict): 120 | dcg=0.0 121 | idcg=IDCG(len(real)) 122 | for i,predictItem in enumerate(predict): 123 | if predictItem in real: 124 | itemRelevance=1 125 | rank = i+1 126 | dcg+=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(rank+1)) 127 | return dcg/float(idcg) 128 | def IDCG(n): 129 | idcg=0 130 | itemRelevance=1 131 | for i in range(n): 132 | idcg+=(math.pow(2, itemRelevance)-1.0)*(math.log(2)/math.log(i+2)) 133 | return idcg 134 | 135 | #load valid dataset 136 | if self._eval_sets is None: 137 | methnames = data_loader.load_hdf5(self.data_path+self.data_params['valid_methname'], 0, poolsize) 138 | apiseqs= data_loader.load_hdf5(self.data_path+self.data_params['valid_apiseq'], 0, poolsize) 139 | tokens = data_loader.load_hdf5(self.data_path+self.data_params['valid_tokens'], 0, poolsize) 140 | descs = data_loader.load_hdf5(self.data_path+self.data_params['valid_desc'], 0, poolsize) 141 | self._eval_sets={'methnames':methnames, 'apiseqs':apiseqs, 'tokens':tokens, 'descs':descs} 142 | 143 | accs,mrrs,maps,ndcgs = [], [], [], [] 144 | data_len = len(self._eval_sets['descs']) 145 | for i in tqdm(range(data_len)): 146 | desc=self._eval_sets['descs'][i]#good desc 147 | descs = pad([desc]*data_len,self.data_params['desc_len']) 148 | methnames = 
pad(self._eval_sets['methnames'],self.data_params['methname_len']) 149 | apiseqs= pad(self._eval_sets['apiseqs'],self.data_params['apiseq_len']) 150 | tokens= pad(self._eval_sets['tokens'],self.data_params['tokens_len']) 151 | n_results = K 152 | sims = model.predict([methnames, apiseqs,tokens, descs], batch_size=data_len).flatten() 153 | negsims= np.negative(sims) 154 | predict = np.argpartition(negsims, kth=n_results-1) 155 | predict = predict[:n_results] 156 | predict = [int(k) for k in predict] 157 | real=[i] 158 | accs.append(ACC(real,predict)) 159 | mrrs.append(MRR(real,predict)) 160 | maps.append(MAP(real,predict)) 161 | ndcgs.append(NDCG(real,predict)) 162 | acc, mrr, map_, ndcg = np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs) 163 | logger.info(f'ACC={acc}, MRR={mrr}, MAP={map_}, nDCG={ndcg}') 164 | return acc,mrr,map_,ndcg 165 | 166 | 167 | ##### Compute Representation ##### 168 | def repr_code(self, model): 169 | logger.info('Loading the use data ..') 170 | methnames = data_loader.load_hdf5(self.data_path+self.data_params['use_methname'],0,-1) 171 | apiseqs = data_loader.load_hdf5(self.data_path+self.data_params['use_apiseq'],0,-1) 172 | tokens = data_loader.load_hdf5(self.data_path+self.data_params['use_tokens'],0,-1) 173 | methnames = pad(methnames, self.data_params['methname_len']) 174 | apiseqs = pad(apiseqs, self.data_params['apiseq_len']) 175 | tokens = pad(tokens, self.data_params['tokens_len']) 176 | 177 | logger.info('Representing code ..') 178 | vecs= model.repr_code([methnames, apiseqs, tokens], batch_size=10000) 179 | vecs= vecs.astype(np.float) 180 | vecs= normalize(vecs) 181 | return vecs 182 | 183 | 184 | def search(self, model, vocab, query, n_results=10): 185 | desc=[convert(vocab, query)]#convert desc sentence to word indices 186 | padded_desc = pad(desc, self.data_params['desc_len']) 187 | desc_repr=model.repr_desc([padded_desc]) 188 | desc_repr=desc_repr.astype(np.float32) 189 | desc_repr = normalize(desc_repr).T # [dim x 1] 190 | codes, sims = [], [] 191 | threads=[] 192 | for i,code_reprs_chunk in enumerate(self._code_reprs): 193 | t = threading.Thread(target=self.search_thread, args = (codes,sims,desc_repr,code_reprs_chunk,i,n_results)) 194 | threads.append(t) 195 | for t in threads: 196 | t.start() 197 | for t in threads:#wait until all sub-threads finish 198 | t.join() 199 | return codes,sims 200 | 201 | def search_thread(self, codes, sims, desc_repr, code_reprs, i, n_results): 202 | #1. compute similarity 203 | chunk_sims=np.dot(code_reprs, desc_repr) # [pool_size x 1] 204 | chunk_sims = np.squeeze(chunk_sims, axis=1) 205 | #2. 
choose top results 206 | negsims=np.negative(chunk_sims) 207 | maxinds = np.argpartition(negsims, kth=n_results-1) 208 | maxinds = maxinds[:n_results] 209 | chunk_codes = [self._codebase[i][k] for k in maxinds] 210 | chunk_sims = chunk_sims[maxinds] 211 | codes.extend(chunk_codes) 212 | sims.extend(chunk_sims) 213 | 214 | def postproc(self,codes_sims): 215 | codes_, sims_ = zip(*codes_sims) 216 | codes= [code for code in codes_] 217 | sims= [sim for sim in sims_] 218 | final_codes=[] 219 | final_sims=[] 220 | n=len(codes_sims) 221 | for i in range(n): 222 | is_dup=False 223 | for j in range(i): 224 | if codes[i][:80]==codes[j][:80] and abs(sims[i]-sims[j])<0.01: 225 | is_dup=True 226 | if not is_dup: 227 | final_codes.append(codes[i]) 228 | final_sims.append(sims[i]) 229 | return zip(final_codes,final_sims) 230 | 231 | 232 | def parse_args(): 233 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 234 | parser.add_argument("--data_path", type=str, default='./data/', help="working directory") 235 | parser.add_argument("--model", type=str, default="JointEmbeddingModel", help="model name") 236 | parser.add_argument("--dataset", type=str, default="github", help="dataset name") 237 | parser.add_argument("--mode", choices=["train","eval","repr_code","search"], default='train', 238 | help="The mode to run. The `train` mode trains a model;" 239 | " the `eval` mode evaluat models in a test set " 240 | " The `repr_code/repr_desc` mode computes vectors" 241 | " for a code snippet or a natural language description with a trained model.") 242 | parser.add_argument("--verbose",action="store_true", default=True, help="Be verbose") 243 | return parser.parse_args() 244 | 245 | 246 | if __name__ == '__main__': 247 | args = parse_args() 248 | config=getattr(configs, 'config_'+args.model)() 249 | engine = SearchEngine(args, config) 250 | 251 | ##### Define model ###### 252 | logger.info('Build Model') 253 | model = getattr(models, args.model)(config)#initialize the model 254 | model.build() 255 | model.summary(export_path = f"./output/{args.model}/") 256 | 257 | optimizer = config.get('training_params', dict()).get('optimizer', 'adam') 258 | model.compile(optimizer=optimizer) 259 | 260 | data_path = args.data_path+args.dataset+'/' 261 | 262 | if args.mode=='train': 263 | engine.train(model) 264 | 265 | elif args.mode=='eval': # evaluate for a specific epoch 266 | assert config['training_params']['reload']>0, "please specify the number of epoch of the optimal checkpoint in config.py" 267 | engine.load_model(model, config['training_params']['reload']) 268 | engine.valid(model, -1, 10) 269 | 270 | elif args.mode=='repr_code': 271 | assert config['training_params']['reload']>0, "please specify the number of epoch of the optimal checkpoint in config.py" 272 | engine.load_model(model, config['training_params']['reload']) 273 | vecs = engine.repr_code(model) 274 | data_loader.save_code_reprs(vecs, data_path+config['data_params']['use_codevecs']) 275 | 276 | elif args.mode=='search': 277 | #search code based on a desc 278 | assert config['training_params']['reload']>0, "please specify the number of epoch of the optimal checkpoint in config.py" 279 | engine.load_model(model, config['training_params']['reload']) 280 | engine._code_reprs = data_loader.load_code_reprs(data_path+config['data_params']['use_codevecs'], engine._codebase_chunksize) 281 | engine._codebase = data_loader.load_codebase(data_path+config['data_params']['use_codebase'], engine._codebase_chunksize) 282 | vocab = 
data_loader.load_pickle(data_path+config['data_params']['vocab_desc']) 283 | while True: 284 | try: 285 | query = input('Input Query: ') 286 | n_results = int(input('How many results? ')) 287 | except Exception: 288 | print("Exception while parsing your input:") 289 | traceback.print_exc() 290 | break 291 | query = query.lower().replace('how to ', '').replace('how do i ', '').replace('how can i ', '').replace('?', '').strip() 292 | codes,sims=engine.search(model, vocab, query, n_results) 293 | zipped=zip(codes,sims) 294 | zipped=sorted(zipped, reverse=True, key=lambda x:x[1]) 295 | zipped=engine.postproc(zipped) 296 | zipped = list(zipped)[:n_results] 297 | results = '\n\n'.join(map(str,zipped)) #combine the result into a returning string 298 | print(results) 299 | -------------------------------------------------------------------------------- /keras/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorflow.keras.layers import Input, Concatenate, Dot, Embedding, Dropout, Lambda, Activation, LSTM, Dense 3 | from tensorflow.keras import backend as K 4 | from tensorflow.keras.models import Model 5 | from tensorflow.keras.utils import plot_model 6 | import numpy as np 7 | import logging 8 | logger = logging.getLogger(__name__) 9 | 10 | class JointEmbeddingModel: 11 | def __init__(self, config): 12 | self.model_params = config.get('model_params', dict()) 13 | self.data_params = config.get('data_params',dict()) 14 | self.methname = Input(shape=(self.data_params['methname_len'],), dtype='int32', name='i_methname') 15 | self.apiseq= Input(shape=(self.data_params['apiseq_len'],),dtype='int32',name='i_apiseq') 16 | self.tokens=Input(shape=(self.data_params['tokens_len'],),dtype='int32',name='i_tokens') 17 | self.desc_good = Input(shape=(self.data_params['desc_len'],), dtype='int32', name='i_desc_good') 18 | self.desc_bad = Input(shape=(self.data_params['desc_len'],), dtype='int32', name='i_desc_bad') 19 | 20 | # initialize a bunch of variables that will be set later 21 | self._code_repr_model=None 22 | self._desc_repr_model=None 23 | self._sim_model = None 24 | self._training_model = None 25 | #self.prediction_model = None 26 | 27 | def build(self): 28 | ''' 29 | 1. Build Code Representation Model 30 | ''' 31 | logger.debug('Building Code Representation Model') 32 | methname = Input(shape=(self.data_params['methname_len'],), dtype='int32', name='methname') 33 | apiseq= Input(shape=(self.data_params['apiseq_len'],),dtype='int32',name='apiseq') 34 | tokens=Input(shape=(self.data_params['tokens_len'],),dtype='int32',name='tokens') 35 | 36 | ## method name representation ## 37 | #1.embedding 38 | init_emb_weights = np.load(self.model_params['init_embed_weights_methname']) if self.model_params['init_embed_weights_methname'] is not None else None 39 | if init_emb_weights is not None: init_emb_weights = [init_emb_weights] 40 | embedding = Embedding(input_dim=self.data_params['n_words'], 41 | output_dim=self.model_params.get('n_embed_dims', 100), 42 | weights=init_emb_weights, 43 | mask_zero=False,#Whether 0 in the input is a special "padding" value that should be masked out. 44 | #If True, all subsequent layers in the model must support masking, otherwise an exception will be raised. 
45 | name='embedding_methname') 46 | methname_embedding = embedding(methname) 47 | dropout = Dropout(0.25,name='dropout_methname_embed') 48 | methname_dropout = dropout(methname_embedding) 49 | #2.rnn 50 | f_rnn = LSTM(self.model_params.get('n_lstm_dims', 128), recurrent_dropout=0.2, 51 | return_sequences=True, name='lstm_methname_f') 52 | 53 | b_rnn = LSTM(self.model_params.get('n_lstm_dims', 128), return_sequences=True, 54 | recurrent_dropout=0.2, name='lstm_methname_b',go_backwards=True) 55 | methname_f_rnn = f_rnn(methname_dropout) 56 | methname_b_rnn = b_rnn(methname_dropout) 57 | dropout = Dropout(0.25,name='dropout_methname_rnn') 58 | methname_f_dropout = dropout(methname_f_rnn) 59 | methname_b_dropout = dropout(methname_b_rnn) 60 | #3.maxpooling 61 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_methname') 62 | methname_pool = Concatenate(name='concat_methname_lstms')([maxpool(methname_f_dropout), maxpool(methname_b_dropout)]) 63 | activation = Activation('tanh',name='active_methname') 64 | methname_repr = activation(methname_pool) 65 | 66 | 67 | ## API Sequence Representation ## 68 | #1.embedding 69 | embedding = Embedding(input_dim=self.data_params['n_words'], 70 | output_dim=self.model_params.get('n_embed_dims', 100), 71 | #weights=weights, 72 | mask_zero=False,#Whether 0 in the input is a special "padding" value that should be masked out. 73 | #If True, all subsequent layers must support masking, otherwise an exception will be raised. 74 | name='embedding_apiseq') 75 | apiseq_embedding = embedding(apiseq) 76 | dropout = Dropout(0.25,name='dropout_apiseq_embed') 77 | apiseq_dropout = dropout(apiseq_embedding) 78 | #2.rnn 79 | f_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2, 80 | name='lstm_apiseq_f') 81 | b_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2, 82 | name='lstm_apiseq_b', go_backwards=True) 83 | apiseq_f_rnn = f_rnn(apiseq_dropout) 84 | apiseq_b_rnn = b_rnn(apiseq_dropout) 85 | dropout = Dropout(0.25,name='dropout_apiseq_rnn') 86 | apiseq_f_dropout = dropout(apiseq_f_rnn) 87 | apiseq_b_dropout = dropout(apiseq_b_rnn) 88 | #3.maxpooling 89 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_apiseq') 90 | apiseq_pool = Concatenate(name='concat_apiseq_lstms')([maxpool(apiseq_f_dropout), maxpool(apiseq_b_dropout)]) 91 | activation = Activation('tanh',name='active_apiseq') 92 | apiseq_repr = activation(apiseq_pool) 93 | 94 | 95 | ## Tokens Representation ## 96 | #1.embedding 97 | init_emb_weights = np.load(self.model_params['init_embed_weights_tokens']) if self.model_params['init_embed_weights_tokens'] is not None else None 98 | if init_emb_weights is not None: init_emb_weights = [init_emb_weights] 99 | embedding = Embedding(input_dim=self.data_params['n_words'], 100 | output_dim=self.model_params.get('n_embed_dims', 100), 101 | weights=init_emb_weights, 102 | #mask_zero=True,#Whether 0 in the input is a special "padding" value that should be masked out. 103 | #If True, all subsequent layers must support masking, otherwise an exception will be raised. 
104 | name='embedding_tokens') 105 | tokens_embedding = embedding(tokens) 106 | dropout = Dropout(0.25,name='dropout_tokens_embed') 107 | tokens_dropout= dropout(tokens_embedding) 108 | 109 | #4.maxpooling 110 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_tokens') 111 | tokens_pool = maxpool(tokens_dropout) 112 | activation = Activation('tanh',name='active_tokens') 113 | tokens_repr= activation(tokens_pool) 114 | 115 | ## concatenate the representation of code ## 116 | merged_methname_api=Concatenate(name='merge_methname_api')([methname_repr,apiseq_repr]) 117 | merged_code_repr=Concatenate(name='merge_coderepr')([merged_methname_api,tokens_repr]) 118 | code_repr=Dense(self.model_params.get('n_hidden',400),activation='tanh',name='dense_coderepr')(merged_code_repr) 119 | 120 | 121 | self._code_repr_model=Model(inputs=[methname,apiseq,tokens],outputs=[code_repr],name='code_repr_model') 122 | 123 | 124 | ''' 125 | 2. Build Desc Representation Model 126 | ''' 127 | ## Desc Representation ## 128 | logger.debug('Building Desc Representation Model') 129 | desc = Input(shape=(self.data_params['desc_len'],), dtype='int32', name='desc') 130 | #1.embedding 131 | init_emb_weights = np.load(self.model_params['init_embed_weights_desc']) if self.model_params['init_embed_weights_desc'] is not None else None 132 | if init_emb_weights is not None: init_emb_weights = [init_emb_weights] 133 | embedding = Embedding(input_dim=self.data_params['n_words'], 134 | output_dim=self.model_params.get('n_embed_dims', 100), 135 | weights=init_emb_weights, 136 | mask_zero=True,#Whether 0 in the input is a special "padding" value that should be masked out. 137 | #If True, all subsequent layers must support masking, otherwise an exception will be raised. 138 | name='embedding_desc') 139 | desc_embedding = embedding(desc) 140 | dropout = Dropout(0.25,name='dropout_desc_embed') 141 | desc_dropout = dropout(desc_embedding) 142 | #2. rnn 143 | f_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2, 144 | name='lstm_desc_f') 145 | b_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2, 146 | name='lstm_desc_b', go_backwards=True) 147 | desc_f_rnn = f_rnn(desc_dropout) 148 | desc_b_rnn = b_rnn(desc_dropout) 149 | dropout = Dropout(0.25,name='dropout_desc_rnn') 150 | desc_f_dropout = dropout(desc_f_rnn) 151 | desc_b_dropout = dropout(desc_b_rnn) 152 | #3. 
maxpooling 153 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_desc') 154 | desc_pool = Concatenate(name='concat_desc_rnns')([maxpool(desc_f_dropout), maxpool(desc_b_dropout)]) 155 | activation = Activation('tanh',name='active_desc') 156 | desc_repr = activation(desc_pool) 157 | 158 | self._desc_repr_model=Model(inputs=[desc],outputs=[desc_repr],name='desc_repr_model') 159 | 160 | """ 161 | 3: calculate the cosine similarity between code and desc 162 | """ 163 | logger.debug('Building similarity model') 164 | code_repr=self._code_repr_model([methname,apiseq,tokens]) 165 | desc_repr=self._desc_repr_model([desc]) 166 | cos_sim=Dot(axes=1, normalize=True, name='cos_sim')([code_repr, desc_repr]) 167 | 168 | sim_model = Model(inputs=[methname,apiseq,tokens,desc], outputs=[cos_sim],name='sim_model') 169 | self._sim_model=sim_model #for model evaluation 170 | 171 | 172 | ''' 173 | 4:Build training model 174 | ''' 175 | good_sim = sim_model([self.methname,self.apiseq,self.tokens, self.desc_good])# similarity of good output 176 | bad_sim = sim_model([self.methname,self.apiseq,self.tokens, self.desc_bad])#similarity of bad output 177 | loss = Lambda(lambda x: K.maximum(1e-6, self.model_params['margin'] - x[0] + x[1]), 178 | output_shape=lambda x: x[0], name='loss')([good_sim, bad_sim]) 179 | 180 | logger.debug('Building training model') 181 | self._training_model=Model(inputs=[self.methname,self.apiseq,self.tokens,self.desc_good,self.desc_bad], 182 | outputs=[loss],name='training_model') 183 | 184 | 185 | def summary(self, export_path): 186 | print('Summary of the code representation model') 187 | self._code_repr_model.summary() 188 | #plot_model(self._code_repr_model, show_shapes=True, to_file= export_path+'code_repr_model.png') 189 | print('Summary of the desc representation model') 190 | self._desc_repr_model.summary() 191 | #plot_model(self._desc_repr_model, show_shapes=True, to_file=export_path+'desc_repr_model.png') 192 | print ("Summary of the similarity model") 193 | self._sim_model.summary() 194 | #plot_model(self._sim_model, show_shapes=True, to_file= export_path+'sim_model.png') 195 | print ('Summary of the training model') 196 | self._training_model.summary() 197 | #plot_model(self._training_model, show_shapes=True, to_file=export_path+'training_model.png') 198 | 199 | 200 | def compile(self, optimizer, **kwargs): 201 | logger.info('compiling models') 202 | self._code_repr_model.compile(loss='cosine_similarity', optimizer=optimizer, **kwargs) 203 | self._desc_repr_model.compile(loss='cosine_similarity', optimizer=optimizer, **kwargs) 204 | self._training_model.compile(loss=lambda y_true, y_pred: y_pred+y_true-y_true, optimizer=optimizer, **kwargs) 205 | #+y_true-y_true is for avoiding an unused input warning, it can be simply +y_true since y_true is always 0 in the training set. 
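#In other words, the training model minimizes the pairwise ranking loss max(margin - cos(code, desc_good) + cos(code, desc_bad), 1e-6), which pushes the matching description closer to the code vector than the randomly paired one.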
206 | self._sim_model.compile(loss='binary_crossentropy', optimizer=optimizer, **kwargs) 207 | 208 | def fit(self, x, **kwargs): 209 | assert self._training_model is not None, 'Must compile the model before fitting data' 210 | y = np.zeros(shape=x[0].shape[:1],dtype=np.float32) 211 | return self._training_model.fit(x, y, **kwargs) 212 | 213 | def repr_code(self, x, **kwargs): 214 | return self._code_repr_model.predict(x, **kwargs) 215 | 216 | def repr_desc(self, x, **kwargs): 217 | return self._desc_repr_model.predict(x, **kwargs) 218 | 219 | def predict(self, x, **kwargs): 220 | return self._sim_model.predict(x, **kwargs) 221 | 222 | def save(self, code_model_file, desc_model_file, **kwargs): 223 | assert self._code_repr_model is not None, 'Must compile the model before saving weights' 224 | self._code_repr_model.save_weights(code_model_file, **kwargs) 225 | assert self._desc_repr_model is not None, 'Must compile the model before saving weights' 226 | self._desc_repr_model.save_weights(desc_model_file, **kwargs) 227 | 228 | def load(self, code_model_file, desc_model_file, **kwargs): 229 | assert self._code_repr_model is not None, 'Must compile the model before loading weights' 230 | self._code_repr_model.load_weights(code_model_file, **kwargs) 231 | assert self._desc_repr_model is not None, 'Must compile the model before loading weights' 232 | self._desc_repr_model.load_weights(desc_model_file, **kwargs) 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /keras/requirements.txt: -------------------------------------------------------------------------------- 1 | keras==2.3.1 2 | tensorflow-gpu==2.0.0 3 | tables 4 | numpy 5 | tqdm 6 | scipy 7 | scikit-learn -------------------------------------------------------------------------------- /keras/results/results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/results/results.xlsx -------------------------------------------------------------------------------- /keras/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utils for similarity computation 3 | 4 | @author: v-xiaodg 5 | ''' 6 | import numpy as np 7 | 8 | def cos_np(data1,data2): 9 | """numpy implementation of cosine similarity for matrix""" 10 | dotted = np.dot(data1,np.transpose(data2)) 11 | norm1 = np.linalg.norm(data1,axis=1) 12 | norm2 = np.linalg.norm(data2,axis=1) 13 | matrix_vector_norms = np.multiply(norm1, norm2) 14 | neighbors = np.divide(dotted, matrix_vector_norms) 15 | return neighbors 16 | 17 | def normalize(data): 18 | """normalize matrix by rows""" 19 | normalized_data = data/np.linalg.norm(data,axis=1).reshape((data.shape[0], 1)) 20 | return normalized_data 21 | 22 | def cos_np_for_normalized(data1,data2): 23 | """cosine similarity for normalized vectors""" 24 | return np.dot(data1,np.transpose(data2)) 25 | 26 | 27 | ##### Converting / reverting ##### 28 | def convert(vocab, words): 29 | """convert words into indices""" 30 | if type(words) == str: 31 | words = words.strip().lower().split(' ') 32 | return [vocab.get(w, 0) for w in words] 33 | def revert(vocab, indices): 34 | """revert indices into words""" 35 | ivocab = dict((v, k) for k, v in vocab.items()) 36 | return [ivocab.get(i, 'UNK') for i in indices] 37 | 38 | ##### Padding ##### 39 | def pad(data, len=None): 40 | from tensorflow.keras.preprocessing.sequence import
pad_sequences 41 | return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0) 42 | 43 | -------------------------------------------------------------------------------- /pytorch/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Xiaodong Gu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pytorch/README.md: -------------------------------------------------------------------------------- 1 | # Deep Code Search 2 | 3 | PyTorch implementation of [Deep Code Search](https://guxd.github.io/papers/deepcs.pdf). 4 | 5 | ⚠️ **Note that the PyTorch version is problematic at present. It might need bugfix or hyperparameter tuning. For those who want to replicate DeepCS as a baseline model, it is highly recommended to check out the Keras version. This could greatly save your time and effort**. 6 | 7 | ## Dependency 8 | > Tested in MacOS 10.12, Ubuntu 16.04 9 | * Python 3.6 10 | * PyTorch 11 | * tqdm 12 | 13 | ``` 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | 18 | ## Code Structures 19 | 20 | - `models`: neural network models for code/desc representation and similarity measure. 21 | - `modules.py`: basic modules for model construction. 22 | - `train.py`: train and validate code/desc representaton models; 23 | - `repr_code.py`: encode code into vectors and store them to a file; 24 | - `search.py`: perform code search; 25 | - `configs.py`: configurations for models defined in the `models` folder. 26 | Each function defines the hyper-parameters for the corresponding model. 27 | - `data_loader.py`: A PyTorch dataset loader. 28 | - `utils.py`: utilities for models and training. 29 | 30 | ## Pretrained Model 31 | 32 | If you want a quick test, [here](https://drive.google.com/file/d/1xpUXsSFbULYEAs8low5zQZWK7-wmqTNO/view?usp=sharing) is a pretrained model. Put it in `./output/JointEmbeder/github/202106140524/models/` and run: 33 | 34 | ``` 35 | python repr_code.py -t 202106140524 --reload_from 4000000 36 | python search.py -t 202106140524 --reload_from 4000000 37 | ``` 38 | 39 | 40 | ## Usage 41 | 42 | ### Data Preparation 43 | The `/data` folder provides a small dummy dataset for quick deployment. 
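If you just want to peek at the dummy data first, the sketch below reads one of the example HDF5 files. It assumes the same `/phrases` + `/indices` layout that `keras/data_loader.py` parses (the `length`/`pos` field names come from there); the file name and path are only an example, adjust them to whichever `.h5` file you want to inspect.

```python
import tables  # PyTables; also used by the Keras data loader

# Quick sanity check on the dummy method-name data (illustrative path)
with tables.open_file('data/example/train.methname.h5') as h5f:
    phrases = h5f.get_node('/phrases')[:]   # flat array of word indices
    indices = h5f.get_node('/indices')[:]   # one (length, pos) record per sentence
    print(f'{indices.shape[0]} sentences, {phrases.shape[0]} tokens in total')
    length, pos = indices[0]['length'], indices[0]['pos']
    print('first sentence (word ids):', phrases[pos:pos + length])
```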
44 | To train and test our model: 45 | 46 | 1) Download and unzip the real dataset from [Google Drive](https://drive.google.com/drive/folders/1GZYLT_lzhlVczXjD6dgwVUvDDPHMB6L7?usp=sharing) or [Baidu Pan](https://pan.baidu.com/s/1U_MtFXqq0C-Qh8WUFAWGvg) for Chinese users. 47 | 48 | 2) Replace each file in the `/data` folder with the corresponding real file. 49 | 50 | ### Configuration 51 | Edit hyper-parameters and settings in `configs.py` 52 | 53 | ### Train 54 | 55 | ```bash 56 | python train.py --model JointEmbeder -v 57 | ``` 58 | 59 | 60 | ### Code Embedding 61 | 62 | ```bash 63 | python repr_code.py --model JointEmbeder -t XXX --reload_from YYY 64 | ``` 65 | where `XXX` stands for the timestamp, and `YYY` represents the iteration with the best model. 66 | 67 | ### Search 68 | 69 | ```bash 70 | python search.py --model JointEmbeder -t XXX --reload_from YYY 71 | ``` 72 | where `XXX` stands for the timestamp, and `YYY` represents the iteration with the best model. 73 | 74 | Here is a screenshot of code search: 75 | 76 | 77 | 78 | 79 | ## Citation 80 | 81 | If you find it useful and would like to cite it, the following would be appropriate: 82 | 83 | ```bibtex 84 | @inproceedings{gu2018deepcs, 85 | title={Deep Code Search}, 86 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun}, 87 | booktitle={Proceedings of the 2018 40th International Conference on Software Engineering (ICSE 2018)}, 88 | year={2018}, 89 | organization={ACM} 90 | } 91 | ``` 92 | -------------------------------------------------------------------------------- /pytorch/automl_config.yaml: -------------------------------------------------------------------------------- 1 | # config.yaml 2 | backend: 3 | type: NSMLBackend 4 | setting: 5 | entry: train.py 6 | dataset: [codesearch] 7 | cpus: 2 8 | gpus: 1 9 | gpu-model: P40 10 | args: "--automl" 11 | tune: 12 | objective: 13 | measure: acc 14 | strategy: maximize 15 | sampler: 16 | name: PBTSampler 17 | setting: 18 | random_seed: 777 19 | all_unique: False 20 | max_attempt: 1000 21 | planner: 22 | name: PopulationPlanner 23 | setting: 24 | num_generations: 5 25 | population: 10 26 | alive_cnt: 2 27 | comparator: 'avg_compare' 28 | probs: 'halving' 29 | max_worker: 30 30 | hyperparams: 31 | learning_rate: 32 | type: log_range 33 | min: 0.00005 34 | max: 0.01 35 | margin: 36 | type: range 37 | min: 0.1 38 | max: 0.9 39 | n_hidden: 40 | type: values 41 | values: [512, 1024] 42 | lstm_dims: 43 | type: values 44 | values: [256, 512, 768, 1024] -------------------------------------------------------------------------------- /pytorch/configs.py: -------------------------------------------------------------------------------- 1 | 2 | def config_JointEmbeder(): 3 | conf = { 4 | # data_params 5 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader 6 | #training data 7 | 'train_name':'train.name.h5', 8 | 'train_api':'train.apiseq.h5', 9 | 'train_tokens':'train.tokens.h5', 10 | 'train_desc':'train.desc.h5', 11 | #test data 12 | 'valid_name':'valid.name.h5', 13 | 'valid_api':'valid.apiseq.h5', 14 | 'valid_tokens':'valid.tokens.h5', 15 | 'valid_desc':'valid.desc.h5', 16 | #use data (computing code vectors) 17 | 'use_codebase':'use.rawcode.txt',#'use.rawcode.h5' 18 | 'use_names':'use.name.h5', 19 | 'use_apis':'use.apiseq.h5', 20 | 'use_tokens':'use.tokens.h5', 21 | #results data(code vectors) 22 | 'use_codevecs':'use.codevecs.h5', 23 | 24 | #parameters 25 | 'name_len': 6, 26 | 'api_len':30, 27 | 'tokens_len':50, 28 | 'desc_len': 30, 29 | 'n_words': 10000, # 
len(vocabulary) + 1 30 | #vocabulary info 31 | 'vocab_name':'vocab.name.json', 32 | 'vocab_api':'vocab.apiseq.json', 33 | 'vocab_tokens':'vocab.tokens.json', 34 | 'vocab_desc':'vocab.desc.json', 35 | 36 | #training_params 37 | 'batch_size': 64, 38 | 'chunk_size':200000, 39 | 'nb_epoch': 15, 40 | #'optimizer': 'adam', 41 | 'learning_rate': 1.34e-4, #2.08e-4, 42 | 'adam_epsilon':1e-8, 43 | 'warmup_steps':5000, 44 | 'fp16': False, 45 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 46 | #"See details at https://nvidia.github.io/apex/amp.html" 47 | 48 | # model_params # best: lstm_dims=512, n_hidden=1024, lr=1.38e-3, margin=0.6454, acc=0.9534 49 | # sub-optimal: lstm_dims=256, n_hidden=512, lr=2.08e-4, margin=0.3986, acc = 0.9348 50 | 'emb_size': 512, 51 | 'n_hidden': 512,#number of hidden dimension of code/desc representation 52 | # recurrent 53 | 'lstm_dims': 1024, #256, # * 2 54 | 'margin': 0.413, #0.3986, 55 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf 56 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization. 57 | } 58 | return conf 59 | -------------------------------------------------------------------------------- /pytorch/data/example/test.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.apiseq.h5 -------------------------------------------------------------------------------- /pytorch/data/example/test.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.desc.h5 -------------------------------------------------------------------------------- /pytorch/data/example/test.meta.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.meta.txt -------------------------------------------------------------------------------- /pytorch/data/example/test.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.methname.h5 -------------------------------------------------------------------------------- /pytorch/data/example/test.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.tokens.h5 -------------------------------------------------------------------------------- /pytorch/data/example/train.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.apiseq.h5 -------------------------------------------------------------------------------- /pytorch/data/example/train.desc.h5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.desc.h5 -------------------------------------------------------------------------------- /pytorch/data/example/train.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.methname.h5 -------------------------------------------------------------------------------- /pytorch/data/example/train.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.tokens.h5 -------------------------------------------------------------------------------- /pytorch/data/example/use.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.apiseq.h5 -------------------------------------------------------------------------------- /pytorch/data/example/use.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.desc.h5 -------------------------------------------------------------------------------- /pytorch/data/example/use.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.methname.h5 -------------------------------------------------------------------------------- /pytorch/data/example/use.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.tokens.h5 -------------------------------------------------------------------------------- /pytorch/data/example/vocab.apiseq.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.apiseq.pkl -------------------------------------------------------------------------------- /pytorch/data/example/vocab.desc.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.desc.pkl -------------------------------------------------------------------------------- /pytorch/data/example/vocab.methname.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.methname.pkl -------------------------------------------------------------------------------- /pytorch/data/example/vocab.tokens.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.tokens.pkl 
-------------------------------------------------------------------------------- /pytorch/data/github/train.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.apiseq.h5 -------------------------------------------------------------------------------- /pytorch/data/github/train.desc.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.desc.h5 -------------------------------------------------------------------------------- /pytorch/data/github/train.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.methname.h5 -------------------------------------------------------------------------------- /pytorch/data/github/train.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.tokens.h5 -------------------------------------------------------------------------------- /pytorch/data/github/use.apiseq.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/use.apiseq.h5 -------------------------------------------------------------------------------- /pytorch/data/github/use.methname.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/use.methname.h5 -------------------------------------------------------------------------------- /pytorch/data/github/use.tokens.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/use.tokens.h5 -------------------------------------------------------------------------------- /pytorch/data_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.utils.data as data 4 | import torch.nn as nn 5 | import tables 6 | import json 7 | import random 8 | import numpy as np 9 | import pickle 10 | from utils import PAD_ID, SOS_ID, EOS_ID, UNK_ID, indexes2sent 11 | 12 | 13 | class CodeSearchDataset(data.Dataset): 14 | """ 15 | Dataset that has only positive samples. 16 | """ 17 | def __init__(self, data_dir, f_name, max_name_len, f_api, max_api_len, 18 | f_tokens, max_tok_len, f_descs=None, max_desc_len=None): 19 | self.max_name_len=max_name_len 20 | self.max_api_len=max_api_len 21 | self.max_tok_len=max_tok_len 22 | self.max_desc_len=max_desc_len 23 | # 1. Initialize file path or list of file names. 
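# Layout assumed by the reads below: each .h5 file stores one flat int array of
# all token sequences under '/phrases', plus a per-sample '/indices' node
# (one entry per example; the assertions further down compare these entry counts
# across methname/api/tokens/desc to check that the modalities stay aligned).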
24 | """read training data(list of int arrays) from a hdf5 file""" 25 | self.training=False 26 | print("loading data...") 27 | table_name = tables.open_file(data_dir+f_name) 28 | self.names = table_name.get_node('/phrases')[:].astype(np.long) 29 | self.idx_names = table_name.get_node('/indices')[:] 30 | table_api = tables.open_file(data_dir+f_api) 31 | self.apis = table_api.get_node('/phrases')[:].astype(np.long) 32 | self.idx_apis = table_api.get_node('/indices')[:] 33 | table_tokens = tables.open_file(data_dir+f_tokens) 34 | self.tokens = table_tokens.get_node('/phrases')[:].astype(np.long) 35 | self.idx_tokens = table_tokens.get_node('/indices')[:] 36 | if f_descs is not None: 37 | self.training=True 38 | table_desc = tables.open_file(data_dir+f_descs) 39 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long) 40 | self.idx_descs = table_desc.get_node('/indices')[:] 41 | 42 | assert self.idx_names.shape[0] == self.idx_apis.shape[0] 43 | assert self.idx_apis.shape[0] == self.idx_tokens.shape[0] 44 | if f_descs is not None: 45 | assert self.idx_names.shape[0]==self.idx_descs.shape[0] 46 | self.data_len = self.idx_names.shape[0] 47 | print("{} entries".format(self.data_len)) 48 | 49 | def pad_seq(self, seq, maxlen): 50 | if len(seq)20: break 134 | print('-------------------------------') 135 | print(indexes2sent(name, vocab_name)) 136 | print(indexes2sent(apiseq, vocab_api)) 137 | print(indexes2sent(tokens, vocab_tokens)) 138 | print(indexes2sent(good_desc, vocab_desc)) 139 | 140 | print('\n\n============ Valid Data ================') 141 | k=0 142 | for batch in valid_data_loader: 143 | batch = tuple([t.numpy() for t in batch]) 144 | name, name_len, apiseq, api_len, tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len = batch 145 | k+=1 146 | if k>20: break 147 | print('-------------------------------') 148 | print(indexes2sent(name, vocab_name)) 149 | print(indexes2sent(apiseq, vocab_api)) 150 | print(indexes2sent(tokens, vocab_tokens)) 151 | print(indexes2sent(good_desc, vocab_desc)) 152 | 153 | print('\n\n============ Use Data ================') 154 | k=0 155 | for batch in use_data_loader: 156 | batch = tuple([t.numpy() for t in batch]) 157 | name, name_len, apiseq, api_len, tokens, tok_len = batch 158 | k+=1 159 | if k>20: break 160 | print('-------------------------------') 161 | print(indexes2sent(name, vocab_name)) 162 | print(indexes2sent(apiseq, vocab_api)) 163 | print(indexes2sent(tokens, vocab_tokens)) 164 | -------------------------------------------------------------------------------- /pytorch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .jointemb import JointEmbeder -------------------------------------------------------------------------------- /pytorch/models/jointemb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | import torch.nn.functional as F 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | parentPath = os.path.abspath("..") 13 | sys.path.insert(0, parentPath)# add parent folder to path so as to import common modules 14 | from modules import SeqEncoder, BOWEncoder 15 | 16 | class JointEmbeder(nn.Module): 17 | """ 18 | References on sentence pair matching models: 19 | https://arxiv.org/pdf/1508.01585.pdf 20 | https://arxiv.org/pdf/1908.10084.pdf 21 | similarity scale classification 
for sentence pairs: https://arxiv.org/pdf/1503.00075.pdf 22 | """ 23 | def __init__(self, config): 24 | super(JointEmbeder, self).__init__() 25 | self.conf = config 26 | self.margin = config['margin'] 27 | 28 | self.name_encoder=SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims']) 29 | self.api_encoder=SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims']) 30 | self.tok_encoder=BOWEncoder(config['n_words'],config['emb_size'],config['n_hidden']) 31 | self.desc_encoder=SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims']) 32 | #self.fuse1=nn.Linear(config['emb_size']+4*config['lstm_dims'], config['n_hidden']) 33 | #self.fuse2 = nn.Sequential( 34 | # nn.Linear(config['emb_size']+4*config['lstm_dims'], config['n_hidden']), 35 | # nn.BatchNorm1d(config['n_hidden'], eps=1e-05, momentum=0.1), 36 | # nn.ReLU(), 37 | # nn.Linear(config['n_hidden'], config['n_hidden']), 38 | #) 39 | self.w_name = nn.Linear(2*config['lstm_dims'], config['n_hidden']) 40 | self.w_api = nn.Linear(2*config['lstm_dims'], config['n_hidden']) 41 | self.w_tok = nn.Linear(config['emb_size'], config['n_hidden']) 42 | self.w_desc = nn.Linear(2*config['lstm_dims'], config['n_hidden']) 43 | self.fuse3 = nn.Linear(config['n_hidden'], config['n_hidden']) 44 | 45 | self.init_weights() 46 | 47 | def init_weights(self):# Initialize Linear Weight 48 | for m in [self.w_name, self.w_api, self.w_tok, self.fuse3]: 49 | m.weight.data.uniform_(-0.1, 0.1)#nn.init.xavier_normal_(m.weight) 50 | nn.init.constant_(m.bias, 0.) 51 | 52 | def code_encoding(self, name, name_len, api, api_len, tokens, tok_len): 53 | name_repr=self.name_encoder(name, name_len) 54 | api_repr=self.api_encoder(api, api_len) 55 | tok_repr=self.tok_encoder(tokens, tok_len) 56 | #code_repr= self.fuse2(torch.cat((name_repr, api_repr, tok_repr),1)) 57 | code_repr = self.fuse3(torch.tanh(self.w_name(name_repr)+self.w_api(api_repr)+self.w_tok(tok_repr))) 58 | return code_repr 59 | 60 | def desc_encoding(self, desc, desc_len): 61 | desc_repr=self.desc_encoder(desc, desc_len) 62 | desc_repr=self.w_desc(desc_repr) 63 | return desc_repr 64 | 65 | def similarity(self, code_vec, desc_vec): 66 | """ 67 | https://arxiv.org/pdf/1508.01585.pdf 68 | """ 69 | assert self.conf['sim_measure'] in ['cos', 'poly', 'euc', 'sigmoid', 'gesd', 'aesd'], "invalid similarity measure" 70 | if self.conf['sim_measure']=='cos': 71 | return F.cosine_similarity(code_vec, desc_vec) 72 | elif self.conf['sim_measure']=='poly': 73 | return (0.5*torch.matmul(code_vec, desc_vec.t()).diag()+1)**2 74 | elif self.conf['sim_measure']=='sigmoid': 75 | return torch.tanh(torch.matmul(code_vec, desc_vec.t()).diag()+1) 76 | elif self.conf['sim_measure'] in ['euc', 'gesd', 'aesd']: 77 | euc_dist = torch.dist(code_vec, desc_vec, 2) # or torch.norm(code_vec-desc_vec,2) 78 | euc_sim = 1 / (1 + euc_dist) 79 | if self.conf['sim_measure']=='euc': return euc_sim 80 | sigmoid_sim = torch.sigmoid(torch.matmul(code_vec, desc_vec.t()).diag()+1) 81 | if self.conf['sim_measure']=='gesd': 82 | return euc_sim * sigmoid_sim 83 | elif self.conf['sim_measure']=='aesd': 84 | return 0.5*(euc_sim+sigmoid_sim) 85 | 86 | def forward(self, name, name_len, apiseq, api_len, tokens, tok_len, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len): 87 | batch_size=name.size(0) 88 | code_repr=self.code_encoding(name, name_len, apiseq, api_len, tokens, tok_len) 89 | desc_anchor_repr=self.desc_encoding(desc_anchor, desc_anchor_len) 90 | desc_neg_repr=self.desc_encoding(desc_neg, desc_neg_len) 91 | 92 | 
anchor_sim = self.similarity(code_repr, desc_anchor_repr) 93 | neg_sim = self.similarity(code_repr, desc_neg_repr) # [batch_sz x 1] 94 | 95 | loss=(self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean() 96 | 97 | return loss -------------------------------------------------------------------------------- /pytorch/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as weight_init 8 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 9 | from torch import optim 10 | import torch.nn.functional as F 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class BOWEncoder(nn.Module): 17 | ''' 18 | https://medium.com/data-from-the-trenches/how-deep-does-your-sentence-embedding-model-need-to-be-cdffa191cb53 19 | https://www.kdnuggets.com/2019/10/beyond-word-embedding-document-embedding.html 20 | https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d#bbe8 21 | ''' 22 | def __init__(self, vocab_size, emb_size, hidden_size): 23 | super(BOWEncoder, self).__init__() 24 | self.emb_size=emb_size 25 | self.hidden_size = hidden_size 26 | self.embedding = nn.Embedding(vocab_size, emb_size) 27 | #self.word_weights = get_word_weights(vocab_size) 28 | self.init_weights() 29 | 30 | def init_weights(self): 31 | nn.init.uniform_(self.embedding.weight, -0.1, 0.1) 32 | nn.init.constant_(self.embedding.weight[0], 0) 33 | 34 | def forward(self, input, input_len=None): 35 | batch_size, seq_len =input.size() 36 | embedded = self.embedding(input) # input: [batch_sz x seq_len x 1] embedded: [batch_sz x seq_len x emb_sz] 37 | embedded= F.dropout(embedded, 0.25, self.training) # [batch_size x seq_len x emb_size] 38 | 39 | # try to use a weighting scheme to summarize bag of word embeddings: 40 | # for example, a smooth inverse frequency weighting algorithm: https://github.com/peter3125/sentence2vec/blob/master/sentence2vec.py 41 | # word_weights = self.word_weights(input) # [batch_size x seq_len x 1] 42 | # embeded = word_weights*embedded 43 | 44 | # max pooling word vectors 45 | maxpooling = nn.MaxPool1d(kernel_size = seq_len, stride=seq_len) 46 | output_pool = maxpooling(embedded.transpose(1,2)).squeeze(2) # [batch_size x emb_size] 47 | encoding = output_pool #torch.tanh(output_pool) 48 | return encoding 49 | 50 | class SeqEncoder(nn.Module): 51 | def __init__(self, vocab_size, emb_size, hidden_size, n_layers=1): 52 | super(SeqEncoder, self).__init__() 53 | self.emb_size = emb_size 54 | self.hidden_size = hidden_size 55 | self.n_layers = n_layers 56 | self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0) 57 | self.lstm = nn.LSTM(emb_size, hidden_size, batch_first=True, bidirectional=True) 58 | self.init_weights() 59 | 60 | def init_weights(self): 61 | nn.init.uniform_(self.embedding.weight, -0.1, 0.1) 62 | nn.init.constant_(self.embedding.weight[0], 0) 63 | for name, param in self.lstm.named_parameters(): # initialize the gate weights 64 | # adopted from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 65 | #if len(param.shape)>1: 66 | # weight_init.orthogonal_(param.data) 67 | #else: 68 | # weight_init.normal_(param.data) 69 | # adopted from fairseq 70 | if 'weight' in name or 'bias' in name: 71 | param.data.uniform_(-0.1, 0.1) 72 | 73 | def forward(self, inputs, input_lens=None): 74 | ''' 75 | input_lens: [batch_size] 76 | ''' 77 | batch_size, 
seq_len=inputs.size() 78 | inputs = self.embedding(inputs) # input: [batch_sz x seq_len] embedded: [batch_sz x seq_len x emb_sz] 79 | inputs = F.dropout(inputs, 0.25, self.training) 80 | 81 | if input_lens is not None:# sort and pack sequence 82 | input_lens_sorted, indices = input_lens.sort(descending=True) 83 | inputs_sorted = inputs.index_select(0, indices) 84 | inputs = pack_padded_sequence(inputs_sorted, input_lens_sorted.data.tolist(), batch_first=True) 85 | 86 | hids, (h_n, c_n) = self.lstm(inputs) 87 | 88 | if input_lens is not None: # reorder and pad 89 | _, inv_indices = indices.sort() 90 | hids, lens = pad_packed_sequence(hids, batch_first=True) # hids:[batch_size x seq_len x (n_dir*hid_sz)](biRNN) 91 | hids = F.dropout(hids, p=0.25, training=self.training) 92 | hids = hids.index_select(0, inv_indices) 93 | h_n = h_n.index_select(1, inv_indices) 94 | h_n = h_n.view(self.n_layers, 2, batch_size, self.hidden_size) #[n_layers x n_dirs x batch_sz x hid_sz] 95 | h_n = h_n[-1] # get the last layer [n_dirs x batch_sz x hid_sz] 96 | ############commenting the following line significantly improves the performance, why? ##################################### 97 | # h_n1 = h_n.transpose(1, 0).contiguous() #[batch_size x n_dirs x hid_sz] 98 | # encoding1 = h_n1.view(batch_size,-1) #[batch_sz x (n_dirs*hid_sz)] 99 | 100 | #https://www.jianshu.com/p/c5b8e02bedbe 101 | #maxpooling = nn.MaxPool1d(kernel_size=hids.size(1), stride=hids.size(1)) 102 | #encoding2 = maxpooling(hids.transpose(1,2)).squeeze(2) # [batch_size x 2*hid_size] 103 | #encoding2 = torch.tanh(encoding2) 104 | 105 | encoding3 = torch.cat((h_n[0], h_n[1]), dim=1) 106 | return encoding3 #, encoding2, encoding3 107 | 108 | 109 | from torch.optim.lr_scheduler import LambdaLR 110 | 111 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): 112 | """ Create a schedule with a learning rate that decreases following the 113 | values of the cosine function between 0 and `pi * cycles` after a warmup 114 | period during which it increases linearly between 0 and 1. 115 | """ 116 | def lr_lambda(current_step): 117 | if current_step < num_warmup_steps: 118 | return float(current_step) / float(max(1, num_warmup_steps)) 119 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 120 | return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress))) 121 | 122 | return LambdaLR(optimizer, lr_lambda, last_epoch) 123 | 124 | 125 | def get_word_weights(vocab_size, padding_idx=0): 126 | '''contruct a word weighting table ''' 127 | def cal_weight(word_idx): 128 | return 1-math.exp(-word_idx) 129 | weight_table = np.array([cal_weight(w) for w in range(vocab_size)]) 130 | if padding_idx is not None: 131 | weight_table[padding_idx] = 0. 
# zero vector for padding dimension 132 | return torch.FloatTensor(weight_table) 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /pytorch/repr_code.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime 4 | import numpy as np 5 | import argparse 6 | from tqdm import tqdm 7 | import logging 8 | logger = logging.getLogger(__name__) 9 | logging.basicConfig(level=logging.INFO, format="%(message)s") 10 | 11 | import torch 12 | from utils import normalize 13 | from data_loader import CodeSearchDataset, save_vecs 14 | import models, configs 15 | 16 | ##### Compute Representation ##### 17 | def repr_code(args): 18 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 19 | config=getattr(configs, 'config_'+args.model)() 20 | 21 | ##### Define model ###### 22 | logger.info('Constructing Model..') 23 | model = getattr(models, args.model)(config)#initialize the model 24 | if args.reload_from>0: 25 | ckpt_path = f'./output/{args.model}/{args.dataset}/{args.timestamp}/models/step{args.reload_from}.h5' 26 | model.load_state_dict(torch.load(ckpt_path, map_location=device)) 27 | model = model.to(device) 28 | model.eval() 29 | 30 | data_path = args.data_path+args.dataset+'/' 31 | use_set = eval(config['dataset_name'])(data_path, config['use_names'], config['name_len'], 32 | config['use_apis'], config['api_len'], 33 | config['use_tokens'], config['tokens_len']) 34 | 35 | data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=args.batch_size, 36 | shuffle=False, drop_last=False, num_workers=1) 37 | 38 | chunk_id = 0 39 | vecs, n_processed = [], 0 40 | for batch in tqdm(data_loader): 41 | batch_gpu = [tensor.to(device) for tensor in batch] 42 | with torch.no_grad(): 43 | reprs = model.code_encoding(*batch_gpu).data.cpu().numpy() 44 | reprs = reprs.astype(np.float32) # [batch x dim] 45 | if config['sim_measure']=='cos': # do normalization for fast cosine computation 46 | reprs = normalize(reprs) 47 | vecs.append(reprs) 48 | n_processed=n_processed+ batch[0].size(0) 49 | if n_processed>= args.chunk_size: 50 | output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5" 51 | save_vecs(np.vstack(vecs), output_path) 52 | chunk_id+=1 53 | vecs, n_processed = [], 0 54 | # save the last chunk (probably incomplete) 55 | output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5" 56 | save_vecs(np.vstack(vecs), output_path) 57 | 58 | def parse_args(): 59 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 60 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 61 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name') 62 | parser.add_argument('-d', '--dataset', type=str, default='github', help='dataset') 63 | parser.add_argument('-t', '--timestamp', type=str, help='time stamp') 64 | parser.add_argument('--reload_from', type=int, default=-1, help='step to reload from') 65 | parser.add_argument('--batch_size', type=int, default=10000, help='how many instances for encoding and normalization at each step') 66 | parser.add_argument('--chunk_size', type=int, default=2000000, help='split code vector into chunks and store them individually. 
'\ 67 | 'Note: should be consistent with the same argument in the search.py') 68 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 69 | return parser.parse_args() 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | repr_code(args) 75 | 76 | 77 | -------------------------------------------------------------------------------- /pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | tables 4 | numpy 5 | scipy 6 | tqdm 7 | tensorboardX 8 | transformers -------------------------------------------------------------------------------- /pytorch/search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | import numpy as np 5 | import argparse 6 | import threading 7 | import codecs 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | logging.basicConfig(level=logging.INFO, format="%(message)s") 11 | 12 | import torch 13 | 14 | from utils import normalize, similarity, sent2indexes 15 | from data_loader import load_dict, load_vecs 16 | import models, configs 17 | 18 | codevecs, codebase = [], [] 19 | 20 | ##### Data Set ##### 21 | def load_codebase(code_path, chunk_size=2000000): 22 | """load codebase 23 | codefile: h5 file that stores raw code 24 | """ 25 | logger.info(f'Loading codebase (chunk size={chunk_size})..') 26 | codebase= [] 27 | codes = codecs.open(code_path, encoding='latin-1').readlines() # use codecs to read in case of encoding problem 28 | for i in range(0, len(codes), chunk_size): 29 | codebase.append(codes[i: i+chunk_size]) 30 | ''' 31 | import subprocess 32 | n_lines = int(subprocess.check_output(["wc", "-l", code_path], universal_newlines=True).split()[0]) 33 | for i in range(1, n_lines+1, chunk_size): 34 | codecs = subprocess.check_output(["sed",'-n',f'{i},{i+chunk_size}p', code_path]).split() 35 | codebase.append(codecs) 36 | ''' 37 | return codebase 38 | 39 | ### Results Data ### 40 | def load_codevecs(vec_path, chunk_size=2000000): 41 | logger.debug(f'Loading code vectors (chunk size={chunk_size})..') 42 | """read vectors (2D numpy array) from a hdf5 file""" 43 | codevecs=[] 44 | chunk_id = 0 45 | chunk_path = f"{vec_path[:-3]}_part{chunk_id}.h5" 46 | while os.path.exists(chunk_path): 47 | reprs = load_vecs(chunk_path) 48 | codevecs.append(reprs) 49 | chunk_id+=1 50 | chunk_path = f"{vec_path[:-3]}_part{chunk_id}.h5" 51 | return codevecs 52 | 53 | def search(config, model, vocab, query, n_results=10): 54 | model.eval() 55 | device = next(model.parameters()).device 56 | desc, desc_len =sent2indexes(query, vocab_desc, config['desc_len'])#convert query into word indices 57 | desc = torch.from_numpy(desc).unsqueeze(0).to(device) 58 | desc_len = torch.from_numpy(desc_len).clamp(max=config['desc_len']).to(device) 59 | with torch.no_grad(): 60 | desc_repr = model.desc_encoding(desc, desc_len).data.cpu().numpy().astype(np.float32) # [1 x dim] 61 | if config['sim_measure']=='cos': # normalizing vector for fast cosine computation 62 | desc_repr = normalize(desc_repr) # [1 x dim] 63 | results =[] 64 | threads = [] 65 | for i, codevecs_chunk in enumerate(codevecs): 66 | t = threading.Thread(target=search_thread, args = (results, desc_repr, codevecs_chunk, i, n_results, config['sim_measure'])) 67 | threads.append(t) 68 | for t in threads: 69 | t.start() 70 | for t in threads:#wait until all sub-threads have completed 71 | t.join() 72 | return results 73 | 74 | def 
search_thread(results, desc_repr, codevecs, i, n_results, sim_measure): 75 | #1. compute code similarities 76 | if sim_measure=='cos': 77 | chunk_sims = np.dot(codevecs, desc_repr.T)[:,0] # [pool_size] 78 | else: 79 | chunk_sims = similarity(codevecs, desc_repr, sim_measure) # [pool_size] 80 | 81 | #2. select the top K results 82 | negsims = np.negative(chunk_sims) 83 | maxinds = np.argpartition(negsims, kth=n_results-1) 84 | maxinds = maxinds[:n_results] 85 | chunk_codes = [codebase[i][k] for k in maxinds] 86 | chunk_sims = chunk_sims[maxinds] 87 | results.extend(zip(chunk_codes, chunk_sims)) 88 | 89 | def postproc(codes_sims): 90 | codes_, sims_ = zip(*codes_sims) 91 | codes = [code for code in codes_] 92 | sims = [sim for sim in sims_] 93 | final_codes = [] 94 | final_sims = [] 95 | n = len(codes_sims) 96 | for i in range(n): 97 | is_dup=False 98 | for j in range(i): 99 | if codes[i][:80]==codes[j][:80] and abs(sims[i]-sims[j])<0.01: 100 | is_dup=True 101 | if not is_dup: 102 | final_codes.append(codes[i]) 103 | final_sims.append(sims[i]) 104 | return zip(final_codes,final_sims) 105 | 106 | def parse_args(): 107 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model") 108 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 109 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name') 110 | parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset.java, python') 111 | parser.add_argument('-t', '--timestamp', type=str, help='time stamp') 112 | parser.add_argument('--reload_from', type=int, default=-1, help='step to reload from') 113 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\ 114 | 'Note: should be consistent with the same argument in the repr_code.py') 115 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 116 | return parser.parse_args() 117 | 118 | 119 | if __name__ == '__main__': 120 | args = parse_args() 121 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 122 | config = getattr(configs, 'config_'+args.model)() 123 | 124 | ##### Define model ###### 125 | logger.info('Constructing Model..') 126 | model = getattr(models, args.model)(config)#initialize the model 127 | ckpt=f'./output/{args.model}/{args.dataset}/{args.timestamp}/models/step{args.reload_from}.h5' 128 | model.load_state_dict(torch.load(ckpt, map_location=device)) 129 | model.eval() 130 | data_path = args.data_path+args.dataset+'/' 131 | 132 | vocab_desc = load_dict(data_path+config['vocab_desc']) 133 | codebase = load_codebase(data_path+config['use_codebase'], args.chunk_size) 134 | codevecs = load_codevecs(data_path+config['use_codevecs'], args.chunk_size) 135 | assert len(codebase)==len(codevecs), \ 136 | "inconsistent number of chunks, check whether the specified files for codebase and code vectors are correct!" 137 | 138 | while True: 139 | try: 140 | query = input('Input Query: ') 141 | n_results = int(input('How many results? 
')) 142 | except Exception: 143 | print("Exception while parsing your input:") 144 | traceback.print_exc() 145 | break 146 | query = query.lower().replace('how to ', '').replace('how do i ', '').replace('how can i ', '').replace('?', '').strip() 147 | results = search(config, model, vocab_desc, query, n_results) 148 | results = sorted(results, reverse=True, key=lambda x:x[1]) 149 | results = postproc(results) 150 | results = list(results)[:n_results] 151 | results = '\n\n'.join(map(str,results)) #combine the result into a returning string 152 | print(results) 153 | 154 | -------------------------------------------------------------------------------- /pytorch/setup.py: -------------------------------------------------------------------------------- 1 | #nsml: nsml/ml:cuda10.1-cudnn7-pytorch1.3keras2.3 2 | from distutils.core import setup 3 | setup( 4 | author='Xiaodong Gu', 5 | author_email='xiaodong.gu@navercorp.com', 6 | name='DeepCS', 7 | version='0.1', 8 | description='Hyperparameter tuning', 9 | install_requires = [ 10 | 'numpy', 11 | 'protobuf', 12 | 'six', 13 | 'tables', 14 | 'tensorboardX', 15 | 'tqdm', 16 | 'transformers', 17 | ] 18 | ) 19 | -------------------------------------------------------------------------------- /pytorch/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | import time 5 | from datetime import datetime 6 | import numpy as np 7 | import math 8 | import argparse 9 | random.seed(42) 10 | from tqdm import tqdm 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 14 | logging.basicConfig(level=logging.INFO, format="%(message)s") 15 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package 16 | 17 | import torch 18 | 19 | import models, configs, data_loader 20 | from modules import get_cosine_schedule_with_warmup 21 | from utils import similarity, normalize 22 | from data_loader import * 23 | 24 | try: 25 | import nsml 26 | from nsml import DATASET_PATH, IS_ON_NSML, SESSION_NAME 27 | except: 28 | IS_ON_NSML = False 29 | 30 | def bind_nsml(model, **kwargs): 31 | if type(model) == torch.nn.DataParallel: model = model.module 32 | def infer(raw_data, **kwargs): 33 | pass 34 | def load(path, *args): 35 | global global_step 36 | state = torch.load(os.path.join(path, 'model.pt')) 37 | model.load_state_dict(state['model']) 38 | global_step = state['step'] 39 | if 'optimizer' in state and optimizer: 40 | optimizer.load_state_dict(state['optimizer']) 41 | logger.info(f'Load checkpoints...!{path}') 42 | def save(path, *args): 43 | global global_step 44 | state = { 45 | 'model': model.state_dict(), 46 | 'step' : global_step 47 | } 48 | torch.save(state, os.path.join(path, 'model.pt')) 49 | logger.info(f'Save checkpoints...!{path}') 50 | # function in function is just used to divide the namespace. 
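# The save/load closures above checkpoint the model weights together with the
# current global_step (and restore the optimizer state when present), so a
# paused NSML session can resume training from where it left off.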
51 | nsml.bind(save=save, load=load, infer=infer) 52 | 53 | 54 | def train(args): 55 | timestamp = datetime.now().strftime('%Y%m%d%H%M') 56 | # make output directory if it doesn't already exist 57 | os.makedirs(f'./output/{args.model}/{args.dataset}/{timestamp}/models', exist_ok=True) 58 | os.makedirs(f'./output/{args.model}/{args.dataset}/{timestamp}/tmp_results', exist_ok=True) 59 | 60 | fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/{timestamp}/logs.txt") 61 | # create file handler which logs even debug messages 62 | logger.addHandler(fh)# add the handlers to the logger 63 | 64 | tb_writer = SummaryWriter(f"./output/{args.model}/{args.dataset}/{timestamp}/logs/" ) if args.visual else None 65 | 66 | random.seed(args.seed) 67 | np.random.seed(args.seed) 68 | torch.manual_seed(args.seed) 69 | torch.cuda.manual_seed(args.seed) 70 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 71 | 72 | config=getattr(configs, 'config_'+args.model)() 73 | if args.automl: 74 | config.update(vars(args)) 75 | print(config) 76 | 77 | ############################################################################### 78 | # Load data 79 | ############################################################################### 80 | data_path = DATASET_PATH+"/train/" if IS_ON_NSML else args.data_path+args.dataset+'/' 81 | train_set = eval(config['dataset_name'])(data_path, config['train_name'], config['name_len'], 82 | config['train_api'], config['api_len'], 83 | config['train_tokens'], config['tokens_len'], 84 | config['train_desc'], config['desc_len']) 85 | valid_set = eval(config['dataset_name'])(data_path, 86 | config['valid_name'], config['name_len'], 87 | config['valid_api'], config['api_len'], 88 | config['valid_tokens'], config['tokens_len'], 89 | config['valid_desc'], config['desc_len']) 90 | data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'], 91 | shuffle=True, drop_last=True, num_workers=1) 92 | 93 | ############################################################################### 94 | # Define Model 95 | ############################################################################### 96 | logger.info('Constructing Model..') 97 | model = getattr(models, args.model)(config)#initialize the model 98 | 99 | def save_model(model, ckpt_path): 100 | torch.save(model.state_dict(), ckpt_path) 101 | 102 | def load_model(model, ckpt_path, to_device): 103 | assert os.path.exists(ckpt_path), f'Weights not found' 104 | model.load_state_dict(torch.load(ckpt_path, map_location=to_device)) 105 | 106 | if args.reload_from>0: 107 | ckpt = f'./output/{args.model}/{args.dataset}/{timestamp}/models/step{args.reload_from}.h5' 108 | load_model(model, ckpt, device) 109 | 110 | if IS_ON_NSML: 111 | bind_nsml(model) 112 | if args.pause: 113 | nsml.paused(locals()) 114 | 115 | model.to(device) 116 | 117 | ############################################################################### 118 | # Prepare the Optimizer 119 | ############################################################################### 120 | 121 | no_decay = ['bias', 'LayerNorm.weight'] 122 | optimizer_grouped_parameters = [ 123 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 124 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 125 | ] 126 | optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=config['adam_epsilon']) 127 
| scheduler = get_cosine_schedule_with_warmup( 128 | optimizer, num_warmup_steps=config['warmup_steps'], 129 | num_training_steps=len(data_loader)*config['nb_epoch']) # do not foget to modify the number when dataset is changed 130 | if config['fp16']: 131 | try: 132 | from apex import amp 133 | except ImportError: 134 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 135 | model, optimizer = amp.initialize(model, optimizer, opt_level=config['fp16_opt_level']) 136 | 137 | ############################################################################### 138 | # Training Process 139 | ############################################################################### 140 | n_iters = len(data_loader) 141 | global global_step 142 | global_step = args.reload_from+1 143 | for epoch in range(int(args.reload_from/n_iters)+1, config['nb_epoch']+1): 144 | itr_start_time = time.time() 145 | losses=[] 146 | for batch in data_loader: 147 | 148 | model.train() 149 | batch_gpu = [tensor.to(device) for tensor in batch] 150 | loss = model(*batch_gpu) 151 | 152 | if config['fp16']: 153 | with amp.scale_loss(loss, optimizer) as scaled_loss: 154 | scaled_loss.backward() 155 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 5.0) 156 | else: 157 | loss.backward() 158 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0) 159 | 160 | optimizer.step() 161 | scheduler.step() 162 | model.zero_grad() 163 | 164 | losses.append(loss.item()) 165 | 166 | if global_step % args.log_every ==0: 167 | elapsed = time.time() - itr_start_time 168 | logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f'% 169 | (epoch, config['nb_epoch'], global_step%n_iters, n_iters, elapsed, np.mean(losses))) 170 | if tb_writer is not None: 171 | tb_writer.add_scalar('loss', np.mean(losses), global_step) 172 | if IS_ON_NSML: 173 | summary = {"summary": True, "scope": locals(), "step": global_step} 174 | summary.update({'loss':np.mean(losses)}) 175 | nsml.report(**summary) 176 | 177 | losses=[] 178 | itr_start_time = time.time() 179 | global_step = global_step + 1 180 | 181 | if global_step % args.valid_every == 0: 182 | logger.info("validating..") 183 | valid_result = validate(valid_set, model, 100000, 1, config['sim_measure']) 184 | logger.info(valid_result) 185 | if tb_writer is not None: 186 | for key, value in valid_result.items(): 187 | tb_writer.add_scalar(key, value, global_step) 188 | if IS_ON_NSML: 189 | summary = {"summary": True, "scope": locals(), "step": global_step} 190 | summary.update(valid_result) 191 | nsml.report(**summary) 192 | 193 | if global_step % args.save_every == 0: 194 | ckpt_path = f'./output/{args.model}/{args.dataset}/{timestamp}/models/step{global_step}.h5' 195 | save_model(model, ckpt_path) 196 | if IS_ON_NSML: 197 | nsml.save(checkpoint=f'model_step{global_step}') 198 | 199 | ##### Evaluation ##### 200 | def validate(valid_set, model, pool_size, K, sim_measure): 201 | """ 202 | simple validation in a code pool. 
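For every description in a pool, all code vectors in the same pool are ranked by similarity, and ACC/MRR/MAP/NDCG are computed against the single true (paired) snippet.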
203 | @param: poolsize - size of the code pool, if -1, load the whole test set 204 | """ 205 | def ACC(real,predict): 206 | sum=0.0 207 | for val in real: 208 | try: index=predict.index(val) 209 | except ValueError: index=-1 210 | if index!=-1: sum=sum+1 211 | return sum/float(len(real)) 212 | def MAP(real,predict): 213 | sum=0.0 214 | for id, val in enumerate(real): 215 | try: index=predict.index(val) 216 | except ValueError: index=-1 217 | if index!=-1: sum=sum+(id+1)/float(index+1) 218 | return sum/float(len(real)) 219 | def MRR(real, predict): 220 | sum=0.0 221 | for val in real: 222 | try: index = predict.index(val) 223 | except ValueError: index=-1 224 | if index!=-1: sum=sum+1.0/float(index+1) 225 | return sum/float(len(real)) 226 | def NDCG(real, predict): 227 | dcg=0.0 228 | idcg=IDCG(len(real)) 229 | for i, predictItem in enumerate(predict): 230 | if predictItem in real: 231 | itemRelevance = 1 232 | rank = i+1 233 | dcg +=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(rank+1)) 234 | return dcg/float(idcg) 235 | def IDCG(n): 236 | idcg=0 237 | itemRelevance=1 238 | for i in range(n): idcg+=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(i+2)) 239 | return idcg 240 | 241 | model.eval() 242 | device = next(model.parameters()).device 243 | 244 | data_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=10000, 245 | shuffle=True, drop_last=True, num_workers=1) 246 | accs, mrrs, maps, ndcgs=[],[],[],[] 247 | code_reprs, desc_reprs = [], [] 248 | n_processed = 0 249 | for batch in tqdm(data_loader): 250 | if len(batch) == 10: # names, name_len, apis, api_len, toks, tok_len, descs, desc_len, bad_descs, bad_desc_len 251 | code_batch = [tensor.to(device) for tensor in batch[:6]] 252 | desc_batch = [tensor.to(device) for tensor in batch[6:8]] 253 | else: # code_ids, type_ids, code_mask, good_ids, good_mask, bad_ids, bad_mask 254 | code_batch = [tensor.to(device) for tensor in batch[:3]] 255 | desc_batch = [tensor.to(device) for tensor in batch[3:5]] 256 | with torch.no_grad(): 257 | code_repr=model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32) 258 | desc_repr=model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size] 259 | if sim_measure=='cos': 260 | code_repr = normalize(code_repr) 261 | desc_repr = normalize(desc_repr) 262 | code_reprs.append(code_repr) 263 | desc_reprs.append(desc_repr) 264 | n_processed += batch[0].size(0) 265 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs) 266 | 267 | for k in tqdm(range(0, n_processed, pool_size)): 268 | code_pool, desc_pool = code_reprs[k:k+pool_size], desc_reprs[k:k+pool_size] 269 | for i in range(min(10000, pool_size)): # for i in range(pool_size): 270 | desc_vec = np.expand_dims(desc_pool[i], axis=0) # [1 x dim] 271 | n_results = K 272 | if sim_measure=='cos': 273 | sims = np.dot(code_pool, desc_vec.T)[:,0] # [pool_size] 274 | else: 275 | sims = similarity(code_pool, desc_vec, sim_measure) # [pool_size] 276 | 277 | negsims=np.negative(sims) 278 | predict = np.argpartition(negsims, kth=n_results-1)#predict=np.argsort(negsims)# 279 | predict = predict[:n_results] 280 | predict = [int(k) for k in predict] 281 | real = [i] 282 | accs.append(ACC(real,predict)) 283 | mrrs.append(MRR(real,predict)) 284 | maps.append(MAP(real,predict)) 285 | ndcgs.append(NDCG(real,predict)) 286 | return {'acc':np.mean(accs), 'mrr': np.mean(mrrs), 'map': np.mean(maps), 'ndcg': np.mean(ndcgs)} 287 | 288 | def parse_args(): 289 | parser = argparse.ArgumentParser("Train 
and Validate The Code Search (Embedding) Model") 290 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus') 291 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name: JointEmbeder, SelfAttnModel') 292 | parser.add_argument('--dataset', type=str, default='github', help='name of dataset.java, python') 293 | parser.add_argument('--reload_from', type=int, default=-1, help='epoch to reload from') 294 | 295 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID') 296 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard") 297 | parser.add_argument('--automl', action='store_true', default=False, help='use automl') 298 | # Training Arguments 299 | parser.add_argument('--log_every', type=int, default=100, help='interval to log autoencoder training results') 300 | parser.add_argument('--valid_every', type=int, default=10000, help='interval to validation') 301 | parser.add_argument('--save_every', type=int, default=50000, help='interval to evaluation to concrete results') 302 | parser.add_argument('--seed', type=int, default=1111, help='random seed') 303 | 304 | parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") 305 | 306 | # Model Hyperparameters for automl tuning 307 | #parser.add_argument('--emb_size', type=int, default=-1, help = 'embedding dim') 308 | parser.add_argument('--n_hidden', type=int, default= -1, help='number of hidden dimension of code/desc representation') 309 | parser.add_argument('--lstm_dims', type=int, default= -1) 310 | parser.add_argument('--margin', type=float, default= -1) 311 | parser.add_argument('--sim_measure', type=str, default = 'cos', help='similarity measure for training') 312 | 313 | parser.add_argument('--learning_rate', type=float, help='learning rate') 314 | #parser.add_argument('--adam_epsilon', type=float) 315 | #parser.add_argument("--weight_decay", type=float, help="Weight deay if we apply some.") 316 | #parser.add_argument('--warmup_steps', type=int) 317 | 318 | # reserved args for automl pbt 319 | parser.add_argument('--pause', default=0, type=int) 320 | parser.add_argument('--iteration', default=0, type=str) 321 | 322 | return parser.parse_args() 323 | 324 | if __name__ == '__main__': 325 | args = parse_args() 326 | 327 | torch.backends.cudnn.benchmark = True # speed up training by using cudnn 328 | torch.backends.cudnn.deterministic = True # fix the random seed in cudnn 329 | 330 | train(args) 331 | 332 | -------------------------------------------------------------------------------- /pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | PAD_ID, SOS_ID, EOS_ID, UNK_ID = [0, 1, 2, 3] 8 | 9 | def cos_approx(data1,data2): 10 | """numpy implementation of cosine similarity for matrix""" 11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 12 | dotted = np.dot(data1,np.transpose(data2)) 13 | norm1 = np.linalg.norm(data1,axis=1) 14 | norm2 = np.linalg.norm(data2,axis=1) 15 | matrix_vector_norms = np.multiply(norm1, norm2) 16 | neighbors = np.divide(dotted, matrix_vector_norms) 17 | return neighbors 18 | 19 | def normalize(data): 20 | """normalize matrix by rows""" 21 | return 
data/np.linalg.norm(data,axis=1,keepdims=True) 22 | 23 | def dot_np(data1,data2): 24 | """cosine similarity for normalized vectors""" 25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.") 26 | return np.dot(data1, data2.T) 27 | 28 | def sigmoid(x): 29 | return 1/(1 + np.exp(-x)) 30 | 31 | def similarity(vec1, vec2, measure='cos'): 32 | if measure=='cos': 33 | vec1_norm = normalize(vec1) 34 | vec2_norm = normalize(vec2) 35 | return np.dot(vec1_norm, vec2_norm.T)[:,0] 36 | elif measure=='poly': 37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2 38 | elif measure=='sigmoid': 39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1) 40 | elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf 41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1) 42 | euc_sim = 1 / (1 + euc_dist) 43 | if measure=='euc': return euc_sim 44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1) 45 | if measure == 'gesd': return euc_sim * sigmoid_sim 46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim) 47 | 48 | ####################################################################### 49 | 50 | def asMinutes(s): 51 | m = math.floor(s / 60) 52 | s -= m * 60 53 | return '%d:%d'% (m, s) 54 | 55 | def timeSince(since, percent): 56 | now = time.time() 57 | s = now - since 58 | es = s / (percent) 59 | rs = es - s 60 | return '%s<%s'%(asMinutes(s), asMinutes(rs)) 61 | 62 | ####################################################################### 63 | import nltk 64 | try: nltk.word_tokenize("hello world") 65 | except LookupError: nltk.download('punkt') 66 | 67 | def sent2indexes(sentence, vocab, maxlen): 68 | '''sentence: a string or list of string 69 | return: a numpy array of word indices 70 | ''' 71 | def convert_sent(sent, vocab, maxlen): 72 | idxes = np.zeros(maxlen, dtype=np.int64) 73 | idxes.fill(PAD_ID) 74 | tokens = nltk.word_tokenize(sent.strip()) 75 | idx_len = min(len(tokens), maxlen) 76 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID) 77 | return idxes, idx_len 78 | if type(sentence) is list: 79 | inds, lens = [], [] 80 | for sent in sentence: 81 | idxes, idx_len = convert_sent(sent, vocab, maxlen) 82 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len]) 83 | inds.append(idxes) 84 | lens.append(idx_len) 85 | return np.vstack(inds), np.vstack(lens) 86 | else: 87 | inds, lens = sent2indexes([sentence], vocab, maxlen) 88 | return inds[0], lens[0] 89 | 90 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID): 91 | '''indexes: numpy array''' 92 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID): 93 | indexes=filter(lambda i: i!=ignore_tok, indexes) 94 | toks, length = [], 0 95 | for idx in indexes: 96 | toks.append(ivocab.get(idx, '')) 97 | length+=1 98 | if idx == EOS_ID: 99 | break 100 | return ' '.join(toks), length 101 | 102 | ivocab = {v: k for k, v in vocab.items()} 103 | if indexes.ndim==1:# one sentence 104 | return revert_sent(indexes, ivocab, ignore_tok) 105 | else:# dim>1 106 | sentences, lens =[], [] # a batch of sentences 107 | for inds in indexes: 108 | sentence, length = revert_sent(inds, ivocab, ignore_tok) 109 | sentences.append(sentence) 110 | lens.append(length) 111 | return sentences, lens 112 | 113 | ######################################################################## 114 | --------------------------------------------------------------------------------
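A minimal round-trip sketch for the tokenization helpers in `pytorch/utils.py` above (the toy vocabulary and the special-token strings are made up for illustration; real vocabularies are the `vocab.*.json` files loaded by `search.py`):

```python
# Run from the pytorch/ folder so that `utils` resolves to the file above.
from utils import sent2indexes, indexes2sent

# Hypothetical toy vocabulary; indices 0-3 follow PAD/SOS/EOS/UNK in utils.py.
vocab = {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, 'read': 4, 'file': 5}

inds, lens = sent2indexes('read a file', vocab, maxlen=6)
print(inds, lens)                 # [4 3 5 0 0 0] [3]   ('a' is out-of-vocab -> UNK)
print(indexes2sent(inds, vocab))  # ('read <unk> file', 3)
```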