├── LICENSE.md
├── README.md
├── keras
│   ├── LICENSE.md
│   ├── README.md
│   ├── configs.py
│   ├── data
│   │   ├── example
│   │   │   ├── test.apiseq.h5
│   │   │   ├── test.apiseq.txt
│   │   │   ├── test.desc.h5
│   │   │   ├── test.desc.txt
│   │   │   ├── test.meta.txt
│   │   │   ├── test.methname.h5
│   │   │   ├── test.methname.txt
│   │   │   ├── test.tokens.h5
│   │   │   ├── test.tokens.txt
│   │   │   ├── train.apiseq.h5
│   │   │   ├── train.apiseq.txt
│   │   │   ├── train.desc.h5
│   │   │   ├── train.desc.txt
│   │   │   ├── train.methname.h5
│   │   │   ├── train.methname.txt
│   │   │   ├── train.tokens.h5
│   │   │   ├── train.tokens.txt
│   │   │   ├── use.apiseq.h5
│   │   │   ├── use.apiseq.txt
│   │   │   ├── use.desc.h5
│   │   │   ├── use.desc.txt
│   │   │   ├── use.methname.h5
│   │   │   ├── use.methname.txt
│   │   │   ├── use.tokens.h5
│   │   │   ├── use.tokens.txt
│   │   │   ├── vocab.apiseq.pkl
│   │   │   ├── vocab.desc.pkl
│   │   │   ├── vocab.methname.pkl
│   │   │   └── vocab.tokens.pkl
│   │   └── github
│   │       ├── test.apiseq.h5
│   │       ├── test.desc.h5
│   │       ├── test.methname.h5
│   │       ├── test.rawcode.txt
│   │       ├── test.tokens.h5
│   │       ├── train.apiseq.h5
│   │       ├── train.desc.h5
│   │       ├── train.methname.h5
│   │       ├── train.tokens.h5
│   │       ├── use.apiseq.h5
│   │       ├── use.codevecs.normalized.h5
│   │       ├── use.methname.h5
│   │       ├── use.rawcode.txt
│   │       ├── use.tokens.h5
│   │       ├── vocab.apiseq.pkl
│   │       ├── vocab.desc.pkl
│   │       ├── vocab.methname.pkl
│   │       └── vocab.tokens.pkl
│   ├── data_loader.py
│   ├── main.py
│   ├── models.py
│   ├── requirements.txt
│   ├── results
│   │   └── results.xlsx
│   └── utils.py
└── pytorch
    ├── LICENSE.md
    ├── README.md
    ├── automl_config.yaml
    ├── configs.py
    ├── data
    │   ├── example
    │   │   ├── test.apiseq.h5
    │   │   ├── test.apiseq.txt
    │   │   ├── test.desc.h5
    │   │   ├── test.desc.txt
    │   │   ├── test.meta.txt
    │   │   ├── test.methname.h5
    │   │   ├── test.methname.txt
    │   │   ├── test.tokens.h5
    │   │   ├── test.tokens.txt
    │   │   ├── train.apiseq.h5
    │   │   ├── train.apiseq.txt
    │   │   ├── train.desc.h5
    │   │   ├── train.desc.txt
    │   │   ├── train.methname.h5
    │   │   ├── train.methname.txt
    │   │   ├── train.tokens.h5
    │   │   ├── train.tokens.txt
    │   │   ├── use.apiseq.h5
    │   │   ├── use.apiseq.txt
    │   │   ├── use.desc.h5
    │   │   ├── use.desc.txt
    │   │   ├── use.methname.h5
    │   │   ├── use.methname.txt
    │   │   ├── use.tokens.h5
    │   │   ├── use.tokens.txt
    │   │   ├── vocab.apiseq.pkl
    │   │   ├── vocab.desc.pkl
    │   │   ├── vocab.methname.pkl
    │   │   └── vocab.tokens.pkl
    │   └── github
    │       ├── train.apiseq.h5
    │       ├── train.desc.h5
    │       ├── train.methname.h5
    │       ├── train.tokens.h5
    │       ├── use.apiseq.h5
    │       ├── use.methname.h5
    │       ├── use.rawcode.txt
    │       ├── use.tokens.h5
    │       ├── vocab.apiseq.json
    │       ├── vocab.desc.json
    │       ├── vocab.name.json
    │       └── vocab.tokens.json
    ├── data_loader.py
    ├── models
    │   ├── __init__.py
    │   └── jointemb.py
    ├── modules.py
    ├── repr_code.py
    ├── requirements.txt
    ├── search.py
    ├── setup.py
    ├── train.py
    └── utils.py
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Xiaodong Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deep Code Search
2 |
3 | Code for the ICSE 2018 paper [Deep Code Search](https://guxd.github.io/papers/deepcs.pdf).
4 |
5 | ## Two Versions
6 | We release both ```Keras``` and ```PyTorch``` code of our approach, in the ```keras``` and ```pytorch``` folders, respectively.
7 |
8 | - The ```Keras``` folder contains the code to run the experiments presented in the paper. The code is frozen to what it was when we originally wrote the paper (NOTE: we modified some deprecated API invocations to work with the latest Keras and Theano).
9 |
10 | - The ```PyTorch``` folder is the bleeding-edge version, where we have packaged up the code, improved its quality, and added some features.
11 |
12 | ⚠️ **Note that the PyTorch version is problematic at present. For those who want to replicate DeepCS as a baseline model, it is highly recommended to check out the Keras version, which can save you considerable time and effort**.
13 |
14 | 🤗 Nevertheless, if you are interested in using and improving DeepCS, check out the PyTorch version and feel free to contribute.
15 |
16 | For more information, please refer to the README files under the directory of each component.
17 |
18 |
19 |
20 | ## Tool Demo
21 |
22 | An online tool demo was available at http://211.249.63.55:81/ (currently unavailable due to budget constraints).
23 |
24 | ## Citation
25 | If you find it useful and would like to cite it, the following would be appropriate:
26 | ```bibtex
27 | @inproceedings{gu2018deepcs,
28 | title={Deep Code Search},
29 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun},
30 | booktitle={Proceedings of the 2018 40th International Conference on Software Engineering (ICSE 2018)},
31 | year={2018},
32 | organization={ACM}
33 | }
34 | ```
35 |
--------------------------------------------------------------------------------
/keras/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Xiaodong Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/keras/README.md:
--------------------------------------------------------------------------------
1 | # Deep Code Search
2 | A Keras implementation of the paper [Deep Code Search](https://guxd.github.io/papers/deepcs.pdf).
3 |
4 | ## Dependency
5 | > Tested in Ubuntu 16.04
6 | * Python 3.6
7 | * Keras 2.3.1 or newer
8 | * Tensorflow 2.0.0 or Theano 0.8.0~0.9.1
9 |
10 | ## Code Structures
11 |
12 | - `models.py`: Neural network models for code/desc representation and similarity measure.
13 |
14 | - `main.py`: The main entry for code search, including four sub-tasks:
15 | 1) Train: train the code/desc representation models;
16 | 2) Eval: evaluate the learnt code/desc representation models;
17 | 3) Code Embedding: encode code into vectors and store them to a file;
18 | 4) Search: search relevant code for a given query.
19 |
20 | - `configs.py`: Configurations for the models defined in `models.py`.
21 |   Each function returns the hyper-parameter dictionary for the corresponding model (see the sketch below).
22 |
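As a rough sketch of how these pieces fit together (mirroring what `main.py` does), a model is built from its config like this:

```python
import configs, models

conf = configs.config_JointEmbeddingModel()   # nested dict of data/training/model params
model = models.JointEmbeddingModel(conf)      # the model reads its input sizes from the config
model.build()
model.compile(optimizer=conf['training_params'].get('optimizer', 'adam'))
```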
23 |
24 | ## Usage
25 |
26 | ### Data Preparation
27 | The `/data` folder provides a small dummy dataset for quick deployment.
28 | To train and test our model:
29 |
30 | 1) Download and unzip the real dataset from [Google Drive](https://drive.google.com/drive/folders/1GZYLT_lzhlVczXjD6dgwVUvDDPHMB6L7?usp=sharing) or [Baidu Pan](https://pan.baidu.com/s/1U_MtFXqq0C-Qh8WUFAWGvg) (for users in China).
31 |
32 | 2) Replace each file in the `/data` folder with the corresponding real file.
33 |
34 | ### Configuration
35 |
36 | Edit hyper-parameters and settings in `configs.py`.
37 |
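For instance, the entries that drive the workflow below live under `training_params` (values shown are the defaults shipped in this repository):

```python
'training_params': {
    'batch_size': 128,
    'nb_epoch': 2000,
    'valid_every': 5,   # run validation every 5 epochs
    'save_every': 10,   # write a checkpoint every 10 epochs
    'reload': -1,       # epoch to resume/load from; <= 0 trains from scratch
    # ...
},
```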
38 | ### Train
39 |
40 | ```bash
41 | python main.py --mode train
42 | ```
43 |
44 | ### Code Embedding
45 |
46 | First, set `reload` in `configs.py` to the epoch number of the optimal checkpoint, e.g., 500
47 |
48 | Then, run
49 | ```bash
50 | python main.py --mode repr_code
51 | ```
52 |
53 | ### Search
54 |
55 | First, set `reload` in `configs.py` to the epoch number of the optimal checkpoint, e.g., 500
56 |
57 | Then, run
58 | ```bash
59 | python main.py --mode search
60 | ```
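The search mode then enters an interactive loop (see `main.py`): it asks for a query and the number of results, and prints the matching snippets from `use.rawcode.txt` together with their similarity scores. A session starts roughly like this (the query is only an illustration):

```
Input Query: convert an inputstream to a string
How many results? 5
```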
61 |
62 | ## Tool Demo
63 |
64 | An online tool demo can be found at http://211.249.63.55:81/ (currently unavailable)
65 |
66 | ## Citation
67 | If you find it useful and would like to cite it, the following would be appropriate:
68 | ```bibtex
69 | @inproceedings{gu2018deepcs,
70 | title={Deep Code Search},
71 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun},
72 | booktitle={Proceedings of the 2018 40th International Conference on Software Engineering (ICSE 2018)},
73 | year={2018},
74 | organization={ACM}
75 | }
76 | ```
77 |
--------------------------------------------------------------------------------
/keras/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_JointEmbeddingModel():
3 | config = {
4 | 'data_params':{
5 | #training data
6 | 'train_methname':'train.methname.h5',
7 | 'train_apiseq':'train.apiseq.h5',
8 | 'train_tokens':'train.tokens.h5',
9 | 'train_desc':'train.desc.h5',
10 | #valid data
11 | 'valid_methname':'test.methname.h5',
12 | 'valid_apiseq':'test.apiseq.h5',
13 | 'valid_tokens':'test.tokens.h5',
14 | 'valid_desc':'test.desc.h5',
15 | #use data (computing code vectors)
16 | 'use_codebase':'use.rawcode.txt',#'use.rawcode.h5'
17 | 'use_methname':'use.methname.h5',
18 | 'use_apiseq':'use.apiseq.h5',
19 | 'use_tokens':'use.tokens.h5',
20 | #results data(code vectors)
21 | 'use_codevecs':'use.codevecs.normalized.h5',#'use.codevecs.h5',
22 |
23 | #parameters
24 | 'methname_len': 6,
25 | 'apiseq_len':30,
26 | 'tokens_len':50,
27 | 'desc_len': 30,
28 | 'n_words': 10000, # len(vocabulary) + 1
29 | #vocabulary info
30 | 'vocab_methname':'vocab.methname.pkl',
31 | 'vocab_apiseq':'vocab.apiseq.pkl',
32 | 'vocab_tokens':'vocab.tokens.pkl',
33 | 'vocab_desc':'vocab.desc.pkl',
34 | },
35 | 'training_params': {
36 | 'batch_size': 128,
37 | 'chunk_size':100000,
38 | 'nb_epoch': 2000,
39 | 'validation_split': 0.2,
40 | 'optimizer': 'adam',
41 | # 'optimizer': Adam(clip_norm=0.1),
42 | 'valid_every': 5,
43 | 'n_eval': 100,
44 | 'evaluate_all_threshold': {
45 | 'mode': 'all',
46 | 'top1': 0.4,
47 | },
48 | 'save_every': 10,
49 |         'reload':-1, # epoch to reload the model from; if <= 0, train from scratch
50 | },
51 |
52 | 'model_params': {
53 | 'n_embed_dims': 100,
54 | 'n_hidden': 400,#number of hidden dimension of code/desc representation
55 | # recurrent
56 | 'n_lstm_dims': 200, # * 2
57 | 'init_embed_weights_methname': None,#'word2vec_100_methname.h5',
58 | 'init_embed_weights_tokens': None,#'word2vec_100_tokens.h5',
59 | 'init_embed_weights_desc': None,#'word2vec_100_desc.h5',
60 | 'margin': 0.05,
61 | 'sim_measure':'cos',#similarity measure: gesd, cos, aesd
62 | }
63 | }
64 | return config
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/keras/data/example/test.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.apiseq.h5
--------------------------------------------------------------------------------
/keras/data/example/test.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.desc.h5
--------------------------------------------------------------------------------
/keras/data/example/test.meta.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.meta.txt
--------------------------------------------------------------------------------
/keras/data/example/test.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.methname.h5
--------------------------------------------------------------------------------
/keras/data/example/test.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/test.tokens.h5
--------------------------------------------------------------------------------
/keras/data/example/train.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.apiseq.h5
--------------------------------------------------------------------------------
/keras/data/example/train.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.desc.h5
--------------------------------------------------------------------------------
/keras/data/example/train.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.methname.h5
--------------------------------------------------------------------------------
/keras/data/example/train.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/train.tokens.h5
--------------------------------------------------------------------------------
/keras/data/example/use.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.apiseq.h5
--------------------------------------------------------------------------------
/keras/data/example/use.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.desc.h5
--------------------------------------------------------------------------------
/keras/data/example/use.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.methname.h5
--------------------------------------------------------------------------------
/keras/data/example/use.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/use.tokens.h5
--------------------------------------------------------------------------------
/keras/data/example/vocab.apiseq.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.apiseq.pkl
--------------------------------------------------------------------------------
/keras/data/example/vocab.desc.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.desc.pkl
--------------------------------------------------------------------------------
/keras/data/example/vocab.methname.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.methname.pkl
--------------------------------------------------------------------------------
/keras/data/example/vocab.tokens.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/example/vocab.tokens.pkl
--------------------------------------------------------------------------------
/keras/data/github/test.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.apiseq.h5
--------------------------------------------------------------------------------
/keras/data/github/test.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.desc.h5
--------------------------------------------------------------------------------
/keras/data/github/test.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.methname.h5
--------------------------------------------------------------------------------
/keras/data/github/test.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/test.tokens.h5
--------------------------------------------------------------------------------
/keras/data/github/train.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.apiseq.h5
--------------------------------------------------------------------------------
/keras/data/github/train.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.desc.h5
--------------------------------------------------------------------------------
/keras/data/github/train.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.methname.h5
--------------------------------------------------------------------------------
/keras/data/github/train.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/train.tokens.h5
--------------------------------------------------------------------------------
/keras/data/github/use.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.apiseq.h5
--------------------------------------------------------------------------------
/keras/data/github/use.codevecs.normalized.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.codevecs.normalized.h5
--------------------------------------------------------------------------------
/keras/data/github/use.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.methname.h5
--------------------------------------------------------------------------------
/keras/data/github/use.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/use.tokens.h5
--------------------------------------------------------------------------------
/keras/data/github/vocab.apiseq.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.apiseq.pkl
--------------------------------------------------------------------------------
/keras/data/github/vocab.desc.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.desc.pkl
--------------------------------------------------------------------------------
/keras/data/github/vocab.methname.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.methname.pkl
--------------------------------------------------------------------------------
/keras/data/github/vocab.tokens.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/data/github/vocab.tokens.pkl
--------------------------------------------------------------------------------
/keras/data_loader.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import codecs
3 | import tables
4 | import numpy as np
5 | from tqdm import tqdm
6 | import logging
7 | logger = logging.getLogger(__name__)
8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
9 |
10 |
11 | def load_pickle(filename):
12 | return pickle.load(open(filename, 'rb'))
13 |
14 | ##### Data Set #####
15 | def load_codebase(path, chunk_size):
16 | """load codebase
17 | codefile: h5 file that stores raw code
18 | """
19 | logger.info('Loading codebase (chunk size={})..'.format(chunk_size))
20 | codebase=[]
21 | #codes=codecs.open(self.path+self.data_params['use_codebase']).readlines()
22 | codes=codecs.open(path, encoding='utf8',errors='replace').readlines()
23 | #use codecs to read in case of encoding problem
24 | for i in tqdm(range(0,len(codes), chunk_size)):
25 | codebase.append(codes[i:i+chunk_size])
26 | return codebase
27 |
28 | ### Results Data ###
29 | def load_code_reprs(path, chunk_size):
30 |     """reads code vectors (2D numpy array) from an hdf5 file"""
31 |     logger.debug(f'Loading code vectors (chunk size={chunk_size})..')
32 | codereprs=[]
33 | h5f = tables.open_file(path)
34 | vecs = h5f.root.vecs
35 | for i in range(0, len(vecs), chunk_size):
36 | codereprs.append(vecs[i: i+ chunk_size])
37 | h5f.close()
38 | return codereprs
39 |
40 | def save_code_reprs(vecs, path):
41 | npvecs=np.array(vecs)
42 | fvec = tables.open_file(path, 'w')
43 | atom = tables.Atom.from_dtype(npvecs.dtype)
44 | filters = tables.Filters(complib='blosc', complevel=5)
45 | ds = fvec.create_carray(fvec.root, 'vecs', atom, npvecs.shape,filters=filters)
46 | ds[:] = npvecs
47 | fvec.close()
48 |
49 | def load_hdf5(vecfile, start_offset, chunk_size):
50 | """reads training sentences(list of int array) from a hdf5 file"""
51 | table = tables.open_file(vecfile)
52 |     data = table.get_node('/phrases')[:].astype(np.int64)
53 | index = table.get_node('/indices')[:]
54 | data_len = index.shape[0]
55 | if chunk_size==-1:#if chunk_size is set to -1, then, load all data
56 | chunk_size=data_len
57 | start_offset = start_offset%data_len
58 | logger.debug("{} entries".format(data_len))
59 | logger.debug("starting from offset {} to {}".format(start_offset,start_offset+chunk_size))
60 | sents = []
61 | for offset in tqdm(range(start_offset, start_offset+chunk_size)):
62 | offset = offset%data_len
63 |         length, pos = index[offset]['length'], index[offset]['pos']
64 |         sents.append(data[pos:pos + length])
65 | table.close()
66 | return sents
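
# --- Illustrative helper (not part of the original repository) ---------------
# load_hdf5() above expects an HDF5 file with two nodes: '/phrases', a flat int
# array that stores all sentences back to back, and '/indices', a table whose
# rows give the 'length' and 'pos' of each sentence inside '/phrases'.
# A minimal writer sketch under that assumption:
def save_hdf5(sents, filename):
    """write a list of int sequences in the layout that load_hdf5 expects (sketch)"""
    f = tables.open_file(filename, 'w')
    phrases = f.create_earray(f.root, 'phrases', tables.Int64Atom(), shape=(0,))
    class Index(tables.IsDescription):
        pos = tables.Int64Col()
        length = tables.Int64Col()
    indices = f.create_table(f.root, 'indices', Index)
    pos = 0
    for sent in sents:
        phrases.append(np.asarray(sent, dtype=np.int64))
        row = indices.row
        row['pos'], row['length'] = pos, len(sent)
        row.append()
        pos += len(sent)
    f.close()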
67 |
--------------------------------------------------------------------------------
/keras/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import random
4 | import traceback
5 | from tensorflow.keras.optimizers import RMSprop, Adam
6 | from scipy.stats import rankdata
7 | import math
8 | import numpy as np
9 | from tqdm import tqdm
10 | import argparse
11 | random.seed(42)
12 | import threading
13 | import configs
14 | import logging
15 | logger = logging.getLogger(__name__)
16 | logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
17 |
18 | from utils import normalize, pad, convert, revert
19 | import models, configs, data_loader
20 |
21 | class SearchEngine:
22 | def __init__(self, args, conf=None):
23 | self.data_path = args.data_path + args.dataset+'/'
24 | self.train_params = conf.get('training_params', dict())
25 | self.data_params = conf.get('data_params',dict())
26 | self.model_params = conf.get('model_params',dict())
27 |
28 | self._eval_sets = None
29 |
30 | self._code_reprs = None
31 | self._codebase = None
32 | self._codebase_chunksize = 2000000
33 |
34 | ##### Model Loading / saving #####
35 | def save_model(self, model, epoch):
36 | model_path = f"./output/{model.__class__.__name__}/models/"
37 | os.makedirs(model_path, exist_ok=True)
38 | model.save(model_path + f"epo{epoch}_code.h5", model_path + f"epo{epoch}_desc.h5", overwrite=True)
39 |
40 | def load_model(self, model, epoch):
41 | model_path = f"./output/{model.__class__.__name__}/models/"
42 | assert os.path.exists(model_path + f"epo{epoch}_code.h5"),f"Weights at epoch {epoch} not found"
43 | assert os.path.exists(model_path + f"epo{epoch}_desc.h5"),f"Weights at epoch {epoch} not found"
44 | model.load(model_path + f"epo{epoch}_code.h5", model_path + f"epo{epoch}_desc.h5")
45 |
46 |
47 | ##### Training #####
48 | def train(self, model):
49 | if self.train_params['reload']>0:
50 | self.load_model(model, self.train_params['reload'])
51 | valid_every = self.train_params.get('valid_every', None)
52 | save_every = self.train_params.get('save_every', None)
53 | batch_size = self.train_params.get('batch_size', 128)
54 | nb_epoch = self.train_params.get('nb_epoch', 10)
55 | split = self.train_params.get('validation_split', 0)
56 |
57 | val_loss = {'loss': 1., 'epoch': 0}
58 | chunk_size = self.train_params.get('chunk_size', 100000)
59 |
60 | for i in range(self.train_params['reload']+1, nb_epoch):
61 | print('Epoch %d :: \n' % i, end='')
62 |
63 | logger.debug('loading data chunk..')
64 | offset = (i-1)*self.train_params.get('chunk_size', 100000)
65 |
66 | names = data_loader.load_hdf5(self.data_path+self.data_params['train_methname'], offset, chunk_size)
67 | apis = data_loader.load_hdf5(self.data_path+self.data_params['train_apiseq'], offset, chunk_size)
68 | tokens = data_loader.load_hdf5(self.data_path+self.data_params['train_tokens'], offset, chunk_size)
69 | descs = data_loader.load_hdf5(self.data_path+self.data_params['train_desc'], offset, chunk_size)
70 |
71 | logger.debug('padding data..')
72 | methnames = pad(names, self.data_params['methname_len'])
73 | apiseqs = pad(apis, self.data_params['apiseq_len'])
74 | tokens = pad(tokens, self.data_params['tokens_len'])
75 | good_descs = pad(descs,self.data_params['desc_len'])
76 | bad_descs=[desc for desc in descs]
77 | random.shuffle(bad_descs)
78 | bad_descs = pad(bad_descs, self.data_params['desc_len'])
79 |
80 | hist = model.fit([methnames, apiseqs, tokens, good_descs, bad_descs], epochs=1, batch_size=batch_size, validation_split=split)
81 |
82 | if hist.history['val_loss'][0] < val_loss['loss']:
83 | val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i}
84 | print('Best: Loss = {}, Epoch = {}'.format(val_loss['loss'], val_loss['epoch']))
85 |
86 | if save_every is not None and i % save_every == 0:
87 | self.save_model(model, i)
88 |
89 | if valid_every is not None and i % valid_every == 0:
90 |                 acc, mrr, map_, ndcg = self.valid(model, 1000, 1)
91 |
92 | ##### Evaluation in the develop set #####
93 | def valid(self, model, poolsize, K):
94 | """
95 | validate in a code pool.
96 | param: poolsize - size of the code pool, if -1, load the whole test set
97 | """
98 | def ACC(real,predict):
99 | sum=0.0
100 | for val in real:
101 | try: index=predict.index(val)
102 | except ValueError: index=-1
103 | if index!=-1: sum=sum+1
104 | return sum/float(len(real))
105 | def MAP(real,predict):
106 | sum=0.0
107 | for id,val in enumerate(real):
108 | try: index=predict.index(val)
109 | except ValueError: index=-1
110 | if index!=-1: sum=sum+(id+1)/float(index+1)
111 | return sum/float(len(real))
112 | def MRR(real,predict):
113 | sum=0.0
114 | for val in real:
115 | try: index=predict.index(val)
116 | except ValueError: index=-1
117 | if index!=-1: sum=sum+1.0/float(index+1)
118 | return sum/float(len(real))
119 | def NDCG(real,predict):
120 | dcg=0.0
121 | idcg=IDCG(len(real))
122 | for i,predictItem in enumerate(predict):
123 | if predictItem in real:
124 | itemRelevance=1
125 | rank = i+1
126 | dcg+=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(rank+1))
127 | return dcg/float(idcg)
128 | def IDCG(n):
129 | idcg=0
130 | itemRelevance=1
131 | for i in range(n):
132 | idcg+=(math.pow(2, itemRelevance)-1.0)*(math.log(2)/math.log(i+2))
133 | return idcg
134 |
135 | #load valid dataset
136 | if self._eval_sets is None:
137 | methnames = data_loader.load_hdf5(self.data_path+self.data_params['valid_methname'], 0, poolsize)
138 | apiseqs= data_loader.load_hdf5(self.data_path+self.data_params['valid_apiseq'], 0, poolsize)
139 | tokens = data_loader.load_hdf5(self.data_path+self.data_params['valid_tokens'], 0, poolsize)
140 | descs = data_loader.load_hdf5(self.data_path+self.data_params['valid_desc'], 0, poolsize)
141 | self._eval_sets={'methnames':methnames, 'apiseqs':apiseqs, 'tokens':tokens, 'descs':descs}
142 |
143 | accs,mrrs,maps,ndcgs = [], [], [], []
144 | data_len = len(self._eval_sets['descs'])
145 | for i in tqdm(range(data_len)):
146 | desc=self._eval_sets['descs'][i]#good desc
147 | descs = pad([desc]*data_len,self.data_params['desc_len'])
148 | methnames = pad(self._eval_sets['methnames'],self.data_params['methname_len'])
149 | apiseqs= pad(self._eval_sets['apiseqs'],self.data_params['apiseq_len'])
150 | tokens= pad(self._eval_sets['tokens'],self.data_params['tokens_len'])
151 | n_results = K
152 | sims = model.predict([methnames, apiseqs,tokens, descs], batch_size=data_len).flatten()
153 | negsims= np.negative(sims)
154 | predict = np.argpartition(negsims, kth=n_results-1)
155 | predict = predict[:n_results]
156 | predict = [int(k) for k in predict]
157 | real=[i]
158 | accs.append(ACC(real,predict))
159 | mrrs.append(MRR(real,predict))
160 | maps.append(MAP(real,predict))
161 | ndcgs.append(NDCG(real,predict))
162 | acc, mrr, map_, ndcg = np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
163 | logger.info(f'ACC={acc}, MRR={mrr}, MAP={map_}, nDCG={ndcg}')
164 | return acc,mrr,map_,ndcg
165 |
166 |
167 | ##### Compute Representation #####
168 | def repr_code(self, model):
169 | logger.info('Loading the use data ..')
170 | methnames = data_loader.load_hdf5(self.data_path+self.data_params['use_methname'],0,-1)
171 | apiseqs = data_loader.load_hdf5(self.data_path+self.data_params['use_apiseq'],0,-1)
172 | tokens = data_loader.load_hdf5(self.data_path+self.data_params['use_tokens'],0,-1)
173 | methnames = pad(methnames, self.data_params['methname_len'])
174 | apiseqs = pad(apiseqs, self.data_params['apiseq_len'])
175 | tokens = pad(tokens, self.data_params['tokens_len'])
176 |
177 | logger.info('Representing code ..')
178 | vecs= model.repr_code([methnames, apiseqs, tokens], batch_size=10000)
179 |         vecs= vecs.astype(np.float32)
180 | vecs= normalize(vecs)
181 | return vecs
182 |
183 |
184 | def search(self, model, vocab, query, n_results=10):
185 | desc=[convert(vocab, query)]#convert desc sentence to word indices
186 | padded_desc = pad(desc, self.data_params['desc_len'])
187 | desc_repr=model.repr_desc([padded_desc])
188 | desc_repr=desc_repr.astype(np.float32)
189 | desc_repr = normalize(desc_repr).T # [dim x 1]
190 | codes, sims = [], []
191 | threads=[]
192 | for i,code_reprs_chunk in enumerate(self._code_reprs):
193 | t = threading.Thread(target=self.search_thread, args = (codes,sims,desc_repr,code_reprs_chunk,i,n_results))
194 | threads.append(t)
195 | for t in threads:
196 | t.start()
197 | for t in threads:#wait until all sub-threads finish
198 | t.join()
199 | return codes,sims
200 |
201 | def search_thread(self, codes, sims, desc_repr, code_reprs, i, n_results):
202 | #1. compute similarity
203 | chunk_sims=np.dot(code_reprs, desc_repr) # [pool_size x 1]
204 | chunk_sims = np.squeeze(chunk_sims, axis=1)
205 | #2. choose top results
206 | negsims=np.negative(chunk_sims)
207 | maxinds = np.argpartition(negsims, kth=n_results-1)
208 | maxinds = maxinds[:n_results]
209 | chunk_codes = [self._codebase[i][k] for k in maxinds]
210 | chunk_sims = chunk_sims[maxinds]
211 | codes.extend(chunk_codes)
212 | sims.extend(chunk_sims)
213 |
214 | def postproc(self,codes_sims):
215 | codes_, sims_ = zip(*codes_sims)
216 | codes= [code for code in codes_]
217 | sims= [sim for sim in sims_]
218 | final_codes=[]
219 | final_sims=[]
220 | n=len(codes_sims)
221 | for i in range(n):
222 | is_dup=False
223 | for j in range(i):
224 | if codes[i][:80]==codes[j][:80] and abs(sims[i]-sims[j])<0.01:
225 | is_dup=True
226 | if not is_dup:
227 | final_codes.append(codes[i])
228 | final_sims.append(sims[i])
229 | return zip(final_codes,final_sims)
230 |
231 |
232 | def parse_args():
233 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
234 | parser.add_argument("--data_path", type=str, default='./data/', help="working directory")
235 | parser.add_argument("--model", type=str, default="JointEmbeddingModel", help="model name")
236 | parser.add_argument("--dataset", type=str, default="github", help="dataset name")
237 | parser.add_argument("--mode", choices=["train","eval","repr_code","search"], default='train',
238 |                         help="The mode to run. The `train` mode trains a model;"
239 |                              " the `eval` mode evaluates the model on a test set;"
240 |                              " the `repr_code` mode computes vectors for code snippets with a trained model;"
241 |                              " the `search` mode retrieves code relevant to a natural language query.")
242 | parser.add_argument("--verbose",action="store_true", default=True, help="Be verbose")
243 | return parser.parse_args()
244 |
245 |
246 | if __name__ == '__main__':
247 | args = parse_args()
248 | config=getattr(configs, 'config_'+args.model)()
249 | engine = SearchEngine(args, config)
250 |
251 | ##### Define model ######
252 | logger.info('Build Model')
253 | model = getattr(models, args.model)(config)#initialize the model
254 | model.build()
255 | model.summary(export_path = f"./output/{args.model}/")
256 |
257 | optimizer = config.get('training_params', dict()).get('optimizer', 'adam')
258 | model.compile(optimizer=optimizer)
259 |
260 | data_path = args.data_path+args.dataset+'/'
261 |
262 | if args.mode=='train':
263 | engine.train(model)
264 |
265 | elif args.mode=='eval': # evaluate for a specific epoch
266 |         assert config['training_params']['reload']>0, "please specify the epoch number of the optimal checkpoint via `reload` in configs.py"
267 | engine.load_model(model, config['training_params']['reload'])
268 | engine.valid(model, -1, 10)
269 |
270 | elif args.mode=='repr_code':
271 |         assert config['training_params']['reload']>0, "please specify the epoch number of the optimal checkpoint via `reload` in configs.py"
272 | engine.load_model(model, config['training_params']['reload'])
273 | vecs = engine.repr_code(model)
274 | data_loader.save_code_reprs(vecs, data_path+config['data_params']['use_codevecs'])
275 |
276 | elif args.mode=='search':
277 | #search code based on a desc
278 |         assert config['training_params']['reload']>0, "please specify the epoch number of the optimal checkpoint via `reload` in configs.py"
279 | engine.load_model(model, config['training_params']['reload'])
280 | engine._code_reprs = data_loader.load_code_reprs(data_path+config['data_params']['use_codevecs'], engine._codebase_chunksize)
281 | engine._codebase = data_loader.load_codebase(data_path+config['data_params']['use_codebase'], engine._codebase_chunksize)
282 | vocab = data_loader.load_pickle(data_path+config['data_params']['vocab_desc'])
283 | while True:
284 | try:
285 | query = input('Input Query: ')
286 | n_results = int(input('How many results? '))
287 | except Exception:
288 | print("Exception while parsing your input:")
289 | traceback.print_exc()
290 | break
291 | query = query.lower().replace('how to ', '').replace('how do i ', '').replace('how can i ', '').replace('?', '').strip()
292 | codes,sims=engine.search(model, vocab, query, n_results)
293 | zipped=zip(codes,sims)
294 | zipped=sorted(zipped, reverse=True, key=lambda x:x[1])
295 | zipped=engine.postproc(zipped)
296 | zipped = list(zipped)[:n_results]
297 | results = '\n\n'.join(map(str,zipped)) #combine the result into a returning string
298 | print(results)
299 |
--------------------------------------------------------------------------------
/keras/models.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tensorflow.keras.layers import Input, Concatenate, Dot, Embedding, Dropout, Lambda, Activation, LSTM, Dense
3 | from tensorflow.keras import backend as K
4 | from tensorflow.keras.models import Model
5 | from tensorflow.keras.utils import plot_model
6 | import numpy as np
7 | import logging
8 | logger = logging.getLogger(__name__)
9 |
10 | class JointEmbeddingModel:
11 | def __init__(self, config):
12 | self.model_params = config.get('model_params', dict())
13 | self.data_params = config.get('data_params',dict())
14 | self.methname = Input(shape=(self.data_params['methname_len'],), dtype='int32', name='i_methname')
15 | self.apiseq= Input(shape=(self.data_params['apiseq_len'],),dtype='int32',name='i_apiseq')
16 | self.tokens=Input(shape=(self.data_params['tokens_len'],),dtype='int32',name='i_tokens')
17 | self.desc_good = Input(shape=(self.data_params['desc_len'],), dtype='int32', name='i_desc_good')
18 | self.desc_bad = Input(shape=(self.data_params['desc_len'],), dtype='int32', name='i_desc_bad')
19 |
20 | # initialize a bunch of variables that will be set later
21 | self._code_repr_model=None
22 | self._desc_repr_model=None
23 | self._sim_model = None
24 | self._training_model = None
25 | #self.prediction_model = None
26 |
27 | def build(self):
28 | '''
29 | 1. Build Code Representation Model
30 | '''
31 | logger.debug('Building Code Representation Model')
32 | methname = Input(shape=(self.data_params['methname_len'],), dtype='int32', name='methname')
33 | apiseq= Input(shape=(self.data_params['apiseq_len'],),dtype='int32',name='apiseq')
34 | tokens=Input(shape=(self.data_params['tokens_len'],),dtype='int32',name='tokens')
35 |
36 | ## method name representation ##
37 | #1.embedding
38 | init_emb_weights = np.load(self.model_params['init_embed_weights_methname']) if self.model_params['init_embed_weights_methname'] is not None else None
39 | if init_emb_weights is not None: init_emb_weights = [init_emb_weights]
40 | embedding = Embedding(input_dim=self.data_params['n_words'],
41 | output_dim=self.model_params.get('n_embed_dims', 100),
42 | weights=init_emb_weights,
43 | mask_zero=False,#Whether 0 in the input is a special "padding" value that should be masked out.
44 | #If True, all subsequent layers in the model must support masking, otherwise an exception will be raised.
45 | name='embedding_methname')
46 | methname_embedding = embedding(methname)
47 | dropout = Dropout(0.25,name='dropout_methname_embed')
48 | methname_dropout = dropout(methname_embedding)
49 | #2.rnn
50 | f_rnn = LSTM(self.model_params.get('n_lstm_dims', 128), recurrent_dropout=0.2,
51 | return_sequences=True, name='lstm_methname_f')
52 |
53 | b_rnn = LSTM(self.model_params.get('n_lstm_dims', 128), return_sequences=True,
54 | recurrent_dropout=0.2, name='lstm_methname_b',go_backwards=True)
55 | methname_f_rnn = f_rnn(methname_dropout)
56 | methname_b_rnn = b_rnn(methname_dropout)
57 | dropout = Dropout(0.25,name='dropout_methname_rnn')
58 | methname_f_dropout = dropout(methname_f_rnn)
59 | methname_b_dropout = dropout(methname_b_rnn)
60 | #3.maxpooling
61 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_methname')
62 | methname_pool = Concatenate(name='concat_methname_lstms')([maxpool(methname_f_dropout), maxpool(methname_b_dropout)])
63 | activation = Activation('tanh',name='active_methname')
64 | methname_repr = activation(methname_pool)
65 |
66 |
67 | ## API Sequence Representation ##
68 | #1.embedding
69 | embedding = Embedding(input_dim=self.data_params['n_words'],
70 | output_dim=self.model_params.get('n_embed_dims', 100),
71 | #weights=weights,
72 | mask_zero=False,#Whether 0 in the input is a special "padding" value that should be masked out.
73 | #If True, all subsequent layers must support masking, otherwise an exception will be raised.
74 | name='embedding_apiseq')
75 | apiseq_embedding = embedding(apiseq)
76 | dropout = Dropout(0.25,name='dropout_apiseq_embed')
77 | apiseq_dropout = dropout(apiseq_embedding)
78 | #2.rnn
79 | f_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2,
80 | name='lstm_apiseq_f')
81 | b_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2,
82 | name='lstm_apiseq_b', go_backwards=True)
83 | apiseq_f_rnn = f_rnn(apiseq_dropout)
84 | apiseq_b_rnn = b_rnn(apiseq_dropout)
85 | dropout = Dropout(0.25,name='dropout_apiseq_rnn')
86 | apiseq_f_dropout = dropout(apiseq_f_rnn)
87 | apiseq_b_dropout = dropout(apiseq_b_rnn)
88 | #3.maxpooling
89 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_apiseq')
90 | apiseq_pool = Concatenate(name='concat_apiseq_lstms')([maxpool(apiseq_f_dropout), maxpool(apiseq_b_dropout)])
91 | activation = Activation('tanh',name='active_apiseq')
92 | apiseq_repr = activation(apiseq_pool)
93 |
94 |
95 | ## Tokens Representation ##
96 | #1.embedding
97 | init_emb_weights = np.load(self.model_params['init_embed_weights_tokens']) if self.model_params['init_embed_weights_tokens'] is not None else None
98 | if init_emb_weights is not None: init_emb_weights = [init_emb_weights]
99 | embedding = Embedding(input_dim=self.data_params['n_words'],
100 | output_dim=self.model_params.get('n_embed_dims', 100),
101 | weights=init_emb_weights,
102 | #mask_zero=True,#Whether 0 in the input is a special "padding" value that should be masked out.
103 | #If True, all subsequent layers must support masking, otherwise an exception will be raised.
104 | name='embedding_tokens')
105 | tokens_embedding = embedding(tokens)
106 | dropout = Dropout(0.25,name='dropout_tokens_embed')
107 | tokens_dropout= dropout(tokens_embedding)
108 |
109 | #4.maxpooling
110 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_tokens')
111 | tokens_pool = maxpool(tokens_dropout)
112 | activation = Activation('tanh',name='active_tokens')
113 | tokens_repr= activation(tokens_pool)
114 |
115 | ## concatenate the representation of code ##
116 | merged_methname_api=Concatenate(name='merge_methname_api')([methname_repr,apiseq_repr])
117 | merged_code_repr=Concatenate(name='merge_coderepr')([merged_methname_api,tokens_repr])
118 | code_repr=Dense(self.model_params.get('n_hidden',400),activation='tanh',name='dense_coderepr')(merged_code_repr)
119 |
120 |
121 | self._code_repr_model=Model(inputs=[methname,apiseq,tokens],outputs=[code_repr],name='code_repr_model')
122 |
123 |
124 | '''
125 | 2. Build Desc Representation Model
126 | '''
127 | ## Desc Representation ##
128 | logger.debug('Building Desc Representation Model')
129 | desc = Input(shape=(self.data_params['desc_len'],), dtype='int32', name='desc')
130 | #1.embedding
131 | init_emb_weights = np.load(self.model_params['init_embed_weights_desc']) if self.model_params['init_embed_weights_desc'] is not None else None
132 | if init_emb_weights is not None: init_emb_weights = [init_emb_weights]
133 | embedding = Embedding(input_dim=self.data_params['n_words'],
134 | output_dim=self.model_params.get('n_embed_dims', 100),
135 | weights=init_emb_weights,
136 | mask_zero=True,#Whether 0 in the input is a special "padding" value that should be masked out.
137 | #If True, all subsequent layers must support masking, otherwise an exception will be raised.
138 | name='embedding_desc')
139 | desc_embedding = embedding(desc)
140 | dropout = Dropout(0.25,name='dropout_desc_embed')
141 | desc_dropout = dropout(desc_embedding)
142 | #2. rnn
143 | f_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2,
144 | name='lstm_desc_f')
145 | b_rnn = LSTM(self.model_params.get('n_lstm_dims', 100), return_sequences=True, recurrent_dropout=0.2,
146 | name='lstm_desc_b', go_backwards=True)
147 | desc_f_rnn = f_rnn(desc_dropout)
148 | desc_b_rnn = b_rnn(desc_dropout)
149 | dropout = Dropout(0.25,name='dropout_desc_rnn')
150 | desc_f_dropout = dropout(desc_f_rnn)
151 | desc_b_dropout = dropout(desc_b_rnn)
152 | #3. maxpooling
153 | maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]),name='maxpool_desc')
154 | desc_pool = Concatenate(name='concat_desc_rnns')([maxpool(desc_f_dropout), maxpool(desc_b_dropout)])
155 | activation = Activation('tanh',name='active_desc')
156 | desc_repr = activation(desc_pool)
157 |
158 | self._desc_repr_model=Model(inputs=[desc],outputs=[desc_repr],name='desc_repr_model')
159 |
160 | """
161 | 3: calculate the cosine similarity between code and desc
162 | """
163 | logger.debug('Building similarity model')
164 | code_repr=self._code_repr_model([methname,apiseq,tokens])
165 | desc_repr=self._desc_repr_model([desc])
166 | cos_sim=Dot(axes=1, normalize=True, name='cos_sim')([code_repr, desc_repr])
167 |
168 | sim_model = Model(inputs=[methname,apiseq,tokens,desc], outputs=[cos_sim],name='sim_model')
169 | self._sim_model=sim_model #for model evaluation
170 |
171 |
172 | '''
173 | 4:Build training model
174 | '''
175 | good_sim = sim_model([self.methname,self.apiseq,self.tokens, self.desc_good])# similarity of good output
176 | bad_sim = sim_model([self.methname,self.apiseq,self.tokens, self.desc_bad])#similarity of bad output
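        # The Lambda below implements a margin ranking loss, max(1e-6, margin - good_sim + bad_sim):
        # it pushes the similarity of the true description above that of a shuffled one by at least `margin`.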
177 | loss = Lambda(lambda x: K.maximum(1e-6, self.model_params['margin'] - x[0] + x[1]),
178 | output_shape=lambda x: x[0], name='loss')([good_sim, bad_sim])
179 |
180 | logger.debug('Building training model')
181 | self._training_model=Model(inputs=[self.methname,self.apiseq,self.tokens,self.desc_good,self.desc_bad],
182 | outputs=[loss],name='training_model')
183 |
184 |
185 | def summary(self, export_path):
186 | print('Summary of the code representation model')
187 | self._code_repr_model.summary()
188 | #plot_model(self._code_repr_model, show_shapes=True, to_file= export_path+'code_repr_model.png')
189 | print('Summary of the desc representation model')
190 | self._desc_repr_model.summary()
191 | #plot_model(self._desc_repr_model, show_shapes=True, to_file=export_path+'desc_repr_model.png')
192 | print ("Summary of the similarity model")
193 | self._sim_model.summary()
194 | #plot_model(self._sim_model, show_shapes=True, to_file= export_path+'sim_model.png')
195 | print ('Summary of the training model')
196 | self._training_model.summary()
197 | #plot_model(self._training_model, show_shapes=True, to_file=export_path+'training_model.png')
198 |
199 |
200 | def compile(self, optimizer, **kwargs):
201 | logger.info('compiling models')
202 | self._code_repr_model.compile(loss='cosine_similarity', optimizer=optimizer, **kwargs)
203 | self._desc_repr_model.compile(loss='cosine_similarity', optimizer=optimizer, **kwargs)
204 | self._training_model.compile(loss=lambda y_true, y_pred: y_pred+y_true-y_true, optimizer=optimizer, **kwargs)
205 | #+y_true-y_true is for avoiding an unused input warning, it can be simply +y_true since y_true is always 0 in the training set.
206 | self._sim_model.compile(loss='binary_crossentropy', optimizer=optimizer, **kwargs)
207 |
208 | def fit(self, x, **kwargs):
209 | assert self._training_model is not None, 'Must compile the model before fitting data'
210 | y = np.zeros(shape=x[0].shape[:1],dtype=np.float32)
211 | return self._training_model.fit(x, y, **kwargs)
212 |
213 | def repr_code(self, x, **kwargs):
214 | return self._code_repr_model.predict(x, **kwargs)
215 |
216 | def repr_desc(self, x, **kwargs):
217 | return self._desc_repr_model.predict(x, **kwargs)
218 |
219 | def predict(self, x, **kwargs):
220 | return self._sim_model.predict(x, **kwargs)
221 |
222 | def save(self, code_model_file, desc_model_file, **kwargs):
223 | assert self._code_repr_model is not None, 'Must compile the model before saving weights'
224 | self._code_repr_model.save_weights(code_model_file, **kwargs)
225 | assert self._desc_repr_model is not None, 'Must compile the model before saving weights'
226 | self._desc_repr_model.save_weights(desc_model_file, **kwargs)
227 |
228 | def load(self, code_model_file, desc_model_file, **kwargs):
229 |         assert self._code_repr_model is not None, 'Must compile the model before loading weights'
230 |         self._code_repr_model.load_weights(code_model_file, **kwargs)
231 |         assert self._desc_repr_model is not None, 'Must compile the model before loading weights'
232 |         self._desc_repr_model.load_weights(desc_model_file, **kwargs)
233 |
234 |
235 |
236 |
237 |
--------------------------------------------------------------------------------
/keras/requirements.txt:
--------------------------------------------------------------------------------
1 | keras==2.3.1
2 | tensorflow-gpu==2.0.0
3 | tables
4 | numpy
5 | tqdm
6 | scipy
7 | scikit-learn
--------------------------------------------------------------------------------
/keras/results/results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/keras/results/results.xlsx
--------------------------------------------------------------------------------
/keras/utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Utils for similarity computation
3 |
4 | @author: v-xiaodg
5 | '''
6 | import numpy as np
7 |
8 | def cos_np(data1,data2):
9 | """numpy implementation of cosine similarity for matrix"""
10 | dotted = np.dot(data1,np.transpose(data2))
11 | norm1 = np.linalg.norm(data1,axis=1)
12 | norm2 = np.linalg.norm(data2,axis=1)
13 | matrix_vector_norms = np.multiply(norm1, norm2)
14 | neighbors = np.divide(dotted, matrix_vector_norms)
15 | return neighbors
16 |
17 | def normalize(data):
18 | """normalize matrix by rows"""
19 | normalized_data = data/np.linalg.norm(data,axis=1).reshape((data.shape[0], 1))
20 | return normalized_data
21 |
22 | def cos_np_for_normalized(data1,data2):
23 | """cosine similarity for normalized vectors"""
24 | return np.dot(data1,np.transpose(data2))
25 |
26 |
27 | ##### Converting / reverting #####
28 | def convert(vocab, words):
29 | """convert words into indices"""
30 | if type(words) == str:
31 | words = words.strip().lower().split(' ')
32 | return [vocab.get(w, 0) for w in words]
33 | def revert(vocab, indices):
34 | """revert indices into words"""
35 | ivocab = dict((v, k) for k, v in vocab.items())
36 | return [ivocab.get(i, 'UNK') for i in indices]
37 |
38 | ##### Padding #####
39 | def pad(data, maxlen=None):
40 |     from tensorflow.keras.preprocessing.sequence import pad_sequences
41 |     return pad_sequences(data, maxlen=maxlen, padding='post', truncating='post', value=0)
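
# Illustrative usage (hypothetical 2-word vocab; unknown words map to index 0):
#   vocab = {'read': 1, 'file': 2}
#   convert(vocab, 'read a file')  ->  [1, 0, 2]
#   pad([[1, 0, 2]], 5)            ->  array([[1, 0, 2, 0, 0]], dtype=int32)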
42 |
43 |
--------------------------------------------------------------------------------
/pytorch/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Xiaodong Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pytorch/README.md:
--------------------------------------------------------------------------------
1 | # Deep Code Search
2 |
3 | PyTorch implementation of [Deep Code Search](https://guxd.github.io/papers/deepcs.pdf).
4 |
5 | ⚠️ **Note that the PyTorch version is problematic at present. It may need bug fixes or hyper-parameter tuning. For those who want to replicate DeepCS as a baseline model, it is highly recommended to check out the Keras version, which can save you considerable time and effort**.
6 |
7 | ## Dependency
8 | > Tested on macOS 10.12 and Ubuntu 16.04
9 | * Python 3.6
10 | * PyTorch
11 | * tqdm
12 |
13 | ```
14 | pip install -r requirements.txt
15 | ```
16 |
17 |
18 | ## Code Structures
19 |
20 | - `models`: neural network models for code/desc representation and similarity measure.
21 | - `modules.py`: basic modules for model construction.
22 | - `train.py`: train and validate code/desc representation models;
23 | - `repr_code.py`: encode code into vectors and store them to a file;
24 | - `search.py`: perform code search;
25 | - `configs.py`: configurations for models defined in the `models` folder.
26 | Each function defines the hyper-parameters for the corresponding model.
27 | - `data_loader.py`: A PyTorch dataset loader.
28 | - `utils.py`: utilities for models and training.
29 |
30 | ## Pretrained Model
31 |
32 | If you want a quick test, [here](https://drive.google.com/file/d/1xpUXsSFbULYEAs8low5zQZWK7-wmqTNO/view?usp=sharing) is a pretrained model. Put it in `./output/JointEmbeder/github/202106140524/models/` and run:
33 |
34 | ```
35 | python repr_code.py -t 202106140524 --reload_from 4000000
36 | python search.py -t 202106140524 --reload_from 4000000
37 | ```
38 |
39 |
40 | ## Usage
41 |
42 | ### Data Preparation
43 | The `/data` folder provides a small dummy dataset for quick deployment.
44 | To train and test our model:
45 |
46 | 1) Download and unzip the real dataset from [Google Drive](https://drive.google.com/drive/folders/1GZYLT_lzhlVczXjD6dgwVUvDDPHMB6L7?usp=sharing) or [Baidu Pan](https://pan.baidu.com/s/1U_MtFXqq0C-Qh8WUFAWGvg) (for users in China).
47 |
48 | 2) Replace each file in the `/data` folder with the corresponding real file.
49 |
50 | ### Configuration
51 | Edit hyper-parameters and settings in `configs.py`.
52 |
53 | ### Train
54 |
55 | ```bash
56 | python train.py --model JointEmbeder -v
57 | ```
58 |
59 |
60 | ### Code Embedding
61 |
62 | ```bash
63 | python repr_code.py --model JointEmbeder -t XXX --reload_from YYY
64 | ```
65 | where `XXX` stands for the timestamp, and `YYY` represents the iteration with the best model.
66 |
67 | ### Search
68 |
69 | ```bash
70 | python search.py --model JointEmbeder -t XXX --reload_from YYY
71 | ```
72 | where `XXX` stands for the timestamp, and `YYY` represents the iteration with the best model.
73 |
74 | Here is a screenshot of code search:
75 |
76 |
77 |
78 |
79 | ## Citation
80 |
81 | If you find it useful and would like to cite it, the following would be appropriate:
82 |
83 | ```bibtex
84 | @inproceedings{gu2018deepcs,
85 | title={Deep Code Search},
86 | author={Gu, Xiaodong and Zhang, Hongyu and Kim, Sunghun},
87 |   booktitle={Proceedings of the 40th International Conference on Software Engineering (ICSE 2018)},
88 | year={2018},
89 | organization={ACM}
90 | }
91 | ```
92 |
--------------------------------------------------------------------------------
/pytorch/automl_config.yaml:
--------------------------------------------------------------------------------
1 | # config.yaml
2 | backend:
3 | type: NSMLBackend
4 | setting:
5 | entry: train.py
6 | dataset: [codesearch]
7 | cpus: 2
8 | gpus: 1
9 | gpu-model: P40
10 | args: "--automl"
11 | tune:
12 | objective:
13 | measure: acc
14 | strategy: maximize
15 | sampler:
16 | name: PBTSampler
17 | setting:
18 | random_seed: 777
19 | all_unique: False
20 | max_attempt: 1000
21 | planner:
22 | name: PopulationPlanner
23 | setting:
24 | num_generations: 5
25 | population: 10
26 | alive_cnt: 2
27 | comparator: 'avg_compare'
28 | probs: 'halving'
29 | max_worker: 30
30 | hyperparams:
31 | learning_rate:
32 | type: log_range
33 | min: 0.00005
34 | max: 0.01
35 | margin:
36 | type: range
37 | min: 0.1
38 | max: 0.9
39 | n_hidden:
40 | type: values
41 | values: [512, 1024]
42 | lstm_dims:
43 | type: values
44 | values: [256, 512, 768, 1024]
--------------------------------------------------------------------------------
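The `hyperparams` block above defines the PBT search space: `learning_rate` is drawn on a log scale, `margin` uniformly, and `n_hidden`/`lstm_dims` from discrete candidate lists. A minimal sketch of drawing one configuration from that space with plain random sampling (not the NSML `PBTSampler` used here):

```python
import math
import random

random.seed(777)  # same seed as random_seed in the config above

def sample_config():
    """Draw one configuration from the search space in automl_config.yaml."""
    return {
        # log_range: sample uniformly in log space between min and max
        'learning_rate': math.exp(random.uniform(math.log(0.00005), math.log(0.01))),
        # range: sample uniformly between min and max
        'margin': random.uniform(0.1, 0.9),
        # values: pick one of the listed candidates
        'n_hidden': random.choice([512, 1024]),
        'lstm_dims': random.choice([256, 512, 768, 1024]),
    }

print(sample_config())
```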
/pytorch/configs.py:
--------------------------------------------------------------------------------
1 |
2 | def config_JointEmbeder():
3 | conf = {
4 | # data_params
5 | 'dataset_name':'CodeSearchDataset', # name of dataset to specify a data loader
6 | #training data
7 | 'train_name':'train.name.h5',
8 | 'train_api':'train.apiseq.h5',
9 | 'train_tokens':'train.tokens.h5',
10 | 'train_desc':'train.desc.h5',
11 | #test data
12 | 'valid_name':'valid.name.h5',
13 | 'valid_api':'valid.apiseq.h5',
14 | 'valid_tokens':'valid.tokens.h5',
15 | 'valid_desc':'valid.desc.h5',
16 | #use data (computing code vectors)
17 | 'use_codebase':'use.rawcode.txt',#'use.rawcode.h5'
18 | 'use_names':'use.name.h5',
19 | 'use_apis':'use.apiseq.h5',
20 | 'use_tokens':'use.tokens.h5',
21 | #results data(code vectors)
22 | 'use_codevecs':'use.codevecs.h5',
23 |
24 | #parameters
25 | 'name_len': 6,
26 | 'api_len':30,
27 | 'tokens_len':50,
28 | 'desc_len': 30,
29 | 'n_words': 10000, # len(vocabulary) + 1
30 | #vocabulary info
31 | 'vocab_name':'vocab.name.json',
32 | 'vocab_api':'vocab.apiseq.json',
33 | 'vocab_tokens':'vocab.tokens.json',
34 | 'vocab_desc':'vocab.desc.json',
35 |
36 | #training_params
37 | 'batch_size': 64,
38 | 'chunk_size':200000,
39 | 'nb_epoch': 15,
40 | #'optimizer': 'adam',
41 | 'learning_rate': 1.34e-4, #2.08e-4,
42 | 'adam_epsilon':1e-8,
43 | 'warmup_steps':5000,
44 | 'fp16': False,
45 | 'fp16_opt_level': 'O1', #For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].
46 | #"See details at https://nvidia.github.io/apex/amp.html"
47 |
48 | # model_params # best: lstm_dims=512, n_hidden=1024, lr=1.38e-3, margin=0.6454, acc=0.9534
49 | # sub-optimal: lstm_dims=256, n_hidden=512, lr=2.08e-4, margin=0.3986, acc = 0.9348
50 | 'emb_size': 512,
51 | 'n_hidden': 512,#number of hidden dimension of code/desc representation
52 | # recurrent
53 | 'lstm_dims': 1024, #256, # * 2
54 | 'margin': 0.413, #0.3986,
55 | 'sim_measure':'cos',#similarity measure: cos, poly, sigmoid, euc, gesd, aesd. see https://arxiv.org/pdf/1508.01585.pdf
56 | #cos, poly and sigmoid are fast with simple dot, while euc, gesd and aesd are slow with vector normalization.
57 | }
58 | return conf
59 |
--------------------------------------------------------------------------------
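The function above returns a plain Python dict; `train.py` loads it via `getattr(configs, 'config_'+args.model)()` and, when `--automl` is set, overwrites entries with command-line values. A minimal sketch of overriding a few entries before training (the override values here are illustrative, not the tuned settings noted in the comments above):

```python
import configs

# Load the default hyperparameters for JointEmbeder (a plain dict).
conf = configs.config_JointEmbeder()

# Override a few entries, e.g. for a quick local experiment.
overrides = {'learning_rate': 5e-4, 'batch_size': 32, 'margin': 0.6}
conf.update({k: v for k, v in overrides.items() if k in conf})

print(conf['learning_rate'], conf['batch_size'], conf['margin'])
```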
/pytorch/data/example/test.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.apiseq.h5
--------------------------------------------------------------------------------
/pytorch/data/example/test.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.desc.h5
--------------------------------------------------------------------------------
/pytorch/data/example/test.meta.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.meta.txt
--------------------------------------------------------------------------------
/pytorch/data/example/test.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.methname.h5
--------------------------------------------------------------------------------
/pytorch/data/example/test.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/test.tokens.h5
--------------------------------------------------------------------------------
/pytorch/data/example/train.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.apiseq.h5
--------------------------------------------------------------------------------
/pytorch/data/example/train.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.desc.h5
--------------------------------------------------------------------------------
/pytorch/data/example/train.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.methname.h5
--------------------------------------------------------------------------------
/pytorch/data/example/train.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/train.tokens.h5
--------------------------------------------------------------------------------
/pytorch/data/example/use.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.apiseq.h5
--------------------------------------------------------------------------------
/pytorch/data/example/use.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.desc.h5
--------------------------------------------------------------------------------
/pytorch/data/example/use.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.methname.h5
--------------------------------------------------------------------------------
/pytorch/data/example/use.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/use.tokens.h5
--------------------------------------------------------------------------------
/pytorch/data/example/vocab.apiseq.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.apiseq.pkl
--------------------------------------------------------------------------------
/pytorch/data/example/vocab.desc.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.desc.pkl
--------------------------------------------------------------------------------
/pytorch/data/example/vocab.methname.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.methname.pkl
--------------------------------------------------------------------------------
/pytorch/data/example/vocab.tokens.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/example/vocab.tokens.pkl
--------------------------------------------------------------------------------
/pytorch/data/github/train.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.apiseq.h5
--------------------------------------------------------------------------------
/pytorch/data/github/train.desc.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.desc.h5
--------------------------------------------------------------------------------
/pytorch/data/github/train.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.methname.h5
--------------------------------------------------------------------------------
/pytorch/data/github/train.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/train.tokens.h5
--------------------------------------------------------------------------------
/pytorch/data/github/use.apiseq.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/use.apiseq.h5
--------------------------------------------------------------------------------
/pytorch/data/github/use.methname.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/use.methname.h5
--------------------------------------------------------------------------------
/pytorch/data/github/use.tokens.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guxd/deep-code-search/06c9de63d97308b5b0adcb20f38402e5d29c8704/pytorch/data/github/use.tokens.h5
--------------------------------------------------------------------------------
/pytorch/data_loader.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import torch.utils.data as data
4 | import torch.nn as nn
5 | import tables
6 | import json
7 | import random
8 | import numpy as np
9 | import pickle
10 | from utils import PAD_ID, SOS_ID, EOS_ID, UNK_ID, indexes2sent
11 |
12 |
13 | class CodeSearchDataset(data.Dataset):
14 | """
15 | Dataset that has only positive samples.
16 | """
17 | def __init__(self, data_dir, f_name, max_name_len, f_api, max_api_len,
18 | f_tokens, max_tok_len, f_descs=None, max_desc_len=None):
19 | self.max_name_len=max_name_len
20 | self.max_api_len=max_api_len
21 | self.max_tok_len=max_tok_len
22 | self.max_desc_len=max_desc_len
23 | # 1. Initialize file path or list of file names.
24 | """read training data(list of int arrays) from a hdf5 file"""
25 | self.training=False
26 | print("loading data...")
27 | table_name = tables.open_file(data_dir+f_name)
28 | self.names = table_name.get_node('/phrases')[:].astype(np.long)
29 | self.idx_names = table_name.get_node('/indices')[:]
30 | table_api = tables.open_file(data_dir+f_api)
31 | self.apis = table_api.get_node('/phrases')[:].astype(np.long)
32 | self.idx_apis = table_api.get_node('/indices')[:]
33 | table_tokens = tables.open_file(data_dir+f_tokens)
34 | self.tokens = table_tokens.get_node('/phrases')[:].astype(np.long)
35 | self.idx_tokens = table_tokens.get_node('/indices')[:]
36 | if f_descs is not None:
37 | self.training=True
38 | table_desc = tables.open_file(data_dir+f_descs)
39 | self.descs = table_desc.get_node('/phrases')[:].astype(np.long)
40 | self.idx_descs = table_desc.get_node('/indices')[:]
41 |
42 | assert self.idx_names.shape[0] == self.idx_apis.shape[0]
43 | assert self.idx_apis.shape[0] == self.idx_tokens.shape[0]
44 | if f_descs is not None:
45 | assert self.idx_names.shape[0]==self.idx_descs.shape[0]
46 | self.data_len = self.idx_names.shape[0]
47 | print("{} entries".format(self.data_len))
48 |
49 | def pad_seq(self, seq, maxlen):
50 |         if len(seq)<maxlen:
133 |         if k>20: break
134 | print('-------------------------------')
135 | print(indexes2sent(name, vocab_name))
136 | print(indexes2sent(apiseq, vocab_api))
137 | print(indexes2sent(tokens, vocab_tokens))
138 | print(indexes2sent(good_desc, vocab_desc))
139 |
140 | print('\n\n============ Valid Data ================')
141 | k=0
142 | for batch in valid_data_loader:
143 | batch = tuple([t.numpy() for t in batch])
144 | name, name_len, apiseq, api_len, tokens, tok_len, good_desc, good_desc_len, bad_desc, bad_desc_len = batch
145 | k+=1
146 | if k>20: break
147 | print('-------------------------------')
148 | print(indexes2sent(name, vocab_name))
149 | print(indexes2sent(apiseq, vocab_api))
150 | print(indexes2sent(tokens, vocab_tokens))
151 | print(indexes2sent(good_desc, vocab_desc))
152 |
153 | print('\n\n============ Use Data ================')
154 | k=0
155 | for batch in use_data_loader:
156 | batch = tuple([t.numpy() for t in batch])
157 | name, name_len, apiseq, api_len, tokens, tok_len = batch
158 | k+=1
159 | if k>20: break
160 | print('-------------------------------')
161 | print(indexes2sent(name, vocab_name))
162 | print(indexes2sent(apiseq, vocab_api))
163 | print(indexes2sent(tokens, vocab_tokens))
164 |
--------------------------------------------------------------------------------
/pytorch/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .jointemb import JointEmbeder
--------------------------------------------------------------------------------
/pytorch/models/jointemb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import numpy as np
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.init as weight_init
8 | import torch.nn.functional as F
9 |
10 | import logging
11 | logger = logging.getLogger(__name__)
12 | parentPath = os.path.abspath("..")
13 | sys.path.insert(0, parentPath)# add parent folder to path so as to import common modules
14 | from modules import SeqEncoder, BOWEncoder
15 |
16 | class JointEmbeder(nn.Module):
17 | """
18 | References on sentence pair matching models:
19 | https://arxiv.org/pdf/1508.01585.pdf
20 | https://arxiv.org/pdf/1908.10084.pdf
21 | similarity scale classification for sentence pairs: https://arxiv.org/pdf/1503.00075.pdf
22 | """
23 | def __init__(self, config):
24 | super(JointEmbeder, self).__init__()
25 | self.conf = config
26 | self.margin = config['margin']
27 |
28 | self.name_encoder=SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims'])
29 | self.api_encoder=SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims'])
30 | self.tok_encoder=BOWEncoder(config['n_words'],config['emb_size'],config['n_hidden'])
31 | self.desc_encoder=SeqEncoder(config['n_words'],config['emb_size'],config['lstm_dims'])
32 | #self.fuse1=nn.Linear(config['emb_size']+4*config['lstm_dims'], config['n_hidden'])
33 | #self.fuse2 = nn.Sequential(
34 | # nn.Linear(config['emb_size']+4*config['lstm_dims'], config['n_hidden']),
35 | # nn.BatchNorm1d(config['n_hidden'], eps=1e-05, momentum=0.1),
36 | # nn.ReLU(),
37 | # nn.Linear(config['n_hidden'], config['n_hidden']),
38 | #)
39 | self.w_name = nn.Linear(2*config['lstm_dims'], config['n_hidden'])
40 | self.w_api = nn.Linear(2*config['lstm_dims'], config['n_hidden'])
41 | self.w_tok = nn.Linear(config['emb_size'], config['n_hidden'])
42 | self.w_desc = nn.Linear(2*config['lstm_dims'], config['n_hidden'])
43 | self.fuse3 = nn.Linear(config['n_hidden'], config['n_hidden'])
44 |
45 | self.init_weights()
46 |
47 | def init_weights(self):# Initialize Linear Weight
48 | for m in [self.w_name, self.w_api, self.w_tok, self.fuse3]:
49 | m.weight.data.uniform_(-0.1, 0.1)#nn.init.xavier_normal_(m.weight)
50 | nn.init.constant_(m.bias, 0.)
51 |
52 | def code_encoding(self, name, name_len, api, api_len, tokens, tok_len):
53 | name_repr=self.name_encoder(name, name_len)
54 | api_repr=self.api_encoder(api, api_len)
55 | tok_repr=self.tok_encoder(tokens, tok_len)
56 | #code_repr= self.fuse2(torch.cat((name_repr, api_repr, tok_repr),1))
57 | code_repr = self.fuse3(torch.tanh(self.w_name(name_repr)+self.w_api(api_repr)+self.w_tok(tok_repr)))
58 | return code_repr
59 |
60 | def desc_encoding(self, desc, desc_len):
61 | desc_repr=self.desc_encoder(desc, desc_len)
62 | desc_repr=self.w_desc(desc_repr)
63 | return desc_repr
64 |
65 | def similarity(self, code_vec, desc_vec):
66 | """
67 | https://arxiv.org/pdf/1508.01585.pdf
68 | """
69 | assert self.conf['sim_measure'] in ['cos', 'poly', 'euc', 'sigmoid', 'gesd', 'aesd'], "invalid similarity measure"
70 | if self.conf['sim_measure']=='cos':
71 | return F.cosine_similarity(code_vec, desc_vec)
72 | elif self.conf['sim_measure']=='poly':
73 | return (0.5*torch.matmul(code_vec, desc_vec.t()).diag()+1)**2
74 | elif self.conf['sim_measure']=='sigmoid':
75 | return torch.tanh(torch.matmul(code_vec, desc_vec.t()).diag()+1)
76 | elif self.conf['sim_measure'] in ['euc', 'gesd', 'aesd']:
77 | euc_dist = torch.dist(code_vec, desc_vec, 2) # or torch.norm(code_vec-desc_vec,2)
78 | euc_sim = 1 / (1 + euc_dist)
79 | if self.conf['sim_measure']=='euc': return euc_sim
80 | sigmoid_sim = torch.sigmoid(torch.matmul(code_vec, desc_vec.t()).diag()+1)
81 | if self.conf['sim_measure']=='gesd':
82 | return euc_sim * sigmoid_sim
83 | elif self.conf['sim_measure']=='aesd':
84 | return 0.5*(euc_sim+sigmoid_sim)
85 |
86 | def forward(self, name, name_len, apiseq, api_len, tokens, tok_len, desc_anchor, desc_anchor_len, desc_neg, desc_neg_len):
87 | batch_size=name.size(0)
88 | code_repr=self.code_encoding(name, name_len, apiseq, api_len, tokens, tok_len)
89 | desc_anchor_repr=self.desc_encoding(desc_anchor, desc_anchor_len)
90 | desc_neg_repr=self.desc_encoding(desc_neg, desc_neg_len)
91 |
92 | anchor_sim = self.similarity(code_repr, desc_anchor_repr)
93 | neg_sim = self.similarity(code_repr, desc_neg_repr) # [batch_sz x 1]
94 |
95 | loss=(self.margin-anchor_sim+neg_sim).clamp(min=1e-6).mean()
96 |
97 | return loss
--------------------------------------------------------------------------------
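`JointEmbeder.forward` computes a margin ranking loss over (code, positive description, negative description) triples: `mean(clamp(margin - sim(code, desc+) + sim(code, desc-), min=1e-6))`. A small self-contained check of that formula on toy vectors with cosine similarity (independent of the encoders above):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
margin = 0.413  # same default as configs.py

code = torch.randn(4, 8)                    # [batch x dim] code representations
desc_pos = code + 0.1 * torch.randn(4, 8)   # positive descriptions, close to their code
desc_neg = torch.randn(4, 8)                # random negative descriptions

anchor_sim = F.cosine_similarity(code, desc_pos)   # [batch]
neg_sim = F.cosine_similarity(code, desc_neg)      # [batch]

# same ranking loss as JointEmbeder.forward, clamped at a small epsilon
loss = (margin - anchor_sim + neg_sim).clamp(min=1e-6).mean()
print(loss.item())
```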
/pytorch/modules.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import math
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.init as weight_init
8 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
9 | from torch import optim
10 | import torch.nn.functional as F
11 |
12 | import logging
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | class BOWEncoder(nn.Module):
17 | '''
18 | https://medium.com/data-from-the-trenches/how-deep-does-your-sentence-embedding-model-need-to-be-cdffa191cb53
19 | https://www.kdnuggets.com/2019/10/beyond-word-embedding-document-embedding.html
20 | https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d#bbe8
21 | '''
22 | def __init__(self, vocab_size, emb_size, hidden_size):
23 | super(BOWEncoder, self).__init__()
24 | self.emb_size=emb_size
25 | self.hidden_size = hidden_size
26 | self.embedding = nn.Embedding(vocab_size, emb_size)
27 | #self.word_weights = get_word_weights(vocab_size)
28 | self.init_weights()
29 |
30 | def init_weights(self):
31 | nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
32 | nn.init.constant_(self.embedding.weight[0], 0)
33 |
34 | def forward(self, input, input_len=None):
35 | batch_size, seq_len =input.size()
36 | embedded = self.embedding(input) # input: [batch_sz x seq_len x 1] embedded: [batch_sz x seq_len x emb_sz]
37 | embedded= F.dropout(embedded, 0.25, self.training) # [batch_size x seq_len x emb_size]
38 |
39 | # try to use a weighting scheme to summarize bag of word embeddings:
40 | # for example, a smooth inverse frequency weighting algorithm: https://github.com/peter3125/sentence2vec/blob/master/sentence2vec.py
41 | # word_weights = self.word_weights(input) # [batch_size x seq_len x 1]
42 | # embeded = word_weights*embedded
43 |
44 | # max pooling word vectors
45 | maxpooling = nn.MaxPool1d(kernel_size = seq_len, stride=seq_len)
46 | output_pool = maxpooling(embedded.transpose(1,2)).squeeze(2) # [batch_size x emb_size]
47 | encoding = output_pool #torch.tanh(output_pool)
48 | return encoding
49 |
50 | class SeqEncoder(nn.Module):
51 | def __init__(self, vocab_size, emb_size, hidden_size, n_layers=1):
52 | super(SeqEncoder, self).__init__()
53 | self.emb_size = emb_size
54 | self.hidden_size = hidden_size
55 | self.n_layers = n_layers
56 | self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
57 | self.lstm = nn.LSTM(emb_size, hidden_size, batch_first=True, bidirectional=True)
58 | self.init_weights()
59 |
60 | def init_weights(self):
61 | nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
62 | nn.init.constant_(self.embedding.weight[0], 0)
63 | for name, param in self.lstm.named_parameters(): # initialize the gate weights
64 | # adopted from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5
65 | #if len(param.shape)>1:
66 | # weight_init.orthogonal_(param.data)
67 | #else:
68 | # weight_init.normal_(param.data)
69 | # adopted from fairseq
70 | if 'weight' in name or 'bias' in name:
71 | param.data.uniform_(-0.1, 0.1)
72 |
73 | def forward(self, inputs, input_lens=None):
74 | '''
75 | input_lens: [batch_size]
76 | '''
77 | batch_size, seq_len=inputs.size()
78 | inputs = self.embedding(inputs) # input: [batch_sz x seq_len] embedded: [batch_sz x seq_len x emb_sz]
79 | inputs = F.dropout(inputs, 0.25, self.training)
80 |
81 | if input_lens is not None:# sort and pack sequence
82 | input_lens_sorted, indices = input_lens.sort(descending=True)
83 | inputs_sorted = inputs.index_select(0, indices)
84 | inputs = pack_padded_sequence(inputs_sorted, input_lens_sorted.data.tolist(), batch_first=True)
85 |
86 | hids, (h_n, c_n) = self.lstm(inputs)
87 |
88 | if input_lens is not None: # reorder and pad
89 | _, inv_indices = indices.sort()
90 | hids, lens = pad_packed_sequence(hids, batch_first=True) # hids:[batch_size x seq_len x (n_dir*hid_sz)](biRNN)
91 | hids = F.dropout(hids, p=0.25, training=self.training)
92 | hids = hids.index_select(0, inv_indices)
93 | h_n = h_n.index_select(1, inv_indices)
94 | h_n = h_n.view(self.n_layers, 2, batch_size, self.hidden_size) #[n_layers x n_dirs x batch_sz x hid_sz]
95 | h_n = h_n[-1] # get the last layer [n_dirs x batch_sz x hid_sz]
96 | ############commenting the following line significantly improves the performance, why? #####################################
97 | # h_n1 = h_n.transpose(1, 0).contiguous() #[batch_size x n_dirs x hid_sz]
98 | # encoding1 = h_n1.view(batch_size,-1) #[batch_sz x (n_dirs*hid_sz)]
99 |
100 | #https://www.jianshu.com/p/c5b8e02bedbe
101 | #maxpooling = nn.MaxPool1d(kernel_size=hids.size(1), stride=hids.size(1))
102 | #encoding2 = maxpooling(hids.transpose(1,2)).squeeze(2) # [batch_size x 2*hid_size]
103 | #encoding2 = torch.tanh(encoding2)
104 |
105 | encoding3 = torch.cat((h_n[0], h_n[1]), dim=1)
106 | return encoding3 #, encoding2, encoding3
107 |
108 |
109 | from torch.optim.lr_scheduler import LambdaLR
110 |
111 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
112 | """ Create a schedule with a learning rate that decreases following the
113 | values of the cosine function between 0 and `pi * cycles` after a warmup
114 | period during which it increases linearly between 0 and 1.
115 | """
116 | def lr_lambda(current_step):
117 | if current_step < num_warmup_steps:
118 | return float(current_step) / float(max(1, num_warmup_steps))
119 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
120 | return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
121 |
122 | return LambdaLR(optimizer, lr_lambda, last_epoch)
123 |
124 |
125 | def get_word_weights(vocab_size, padding_idx=0):
126 |     '''construct a word weighting table'''
127 | def cal_weight(word_idx):
128 | return 1-math.exp(-word_idx)
129 | weight_table = np.array([cal_weight(w) for w in range(vocab_size)])
130 | if padding_idx is not None:
131 | weight_table[padding_idx] = 0. # zero vector for padding dimension
132 | return torch.FloatTensor(weight_table)
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
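`get_cosine_schedule_with_warmup` increases the learning rate linearly during warmup and then decays it along a cosine curve toward zero. A quick sketch that drives the scheduler with a dummy one-parameter optimizer and prints the learning rate at a few steps (the step counts are illustrative, smaller than the real training schedule):

```python
import torch
from modules import get_cosine_schedule_with_warmup

# A dummy one-parameter optimizer just to drive the scheduler.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1.34e-4)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000,
                                            num_training_steps=10000)
for step in range(10000):
    optimizer.step()      # no gradients here, so this is effectively a no-op
    scheduler.step()
    if step in (0, 999, 5000, 9999):
        # lr rises linearly during warmup, then decays along a cosine toward 0
        print(step, optimizer.param_groups[0]['lr'])
```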
/pytorch/repr_code.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from datetime import datetime
4 | import numpy as np
5 | import argparse
6 | from tqdm import tqdm
7 | import logging
8 | logger = logging.getLogger(__name__)
9 | logging.basicConfig(level=logging.INFO, format="%(message)s")
10 |
11 | import torch
12 | from utils import normalize
13 | from data_loader import CodeSearchDataset, save_vecs
14 | import models, configs
15 |
16 | ##### Compute Representation #####
17 | def repr_code(args):
18 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
19 | config=getattr(configs, 'config_'+args.model)()
20 |
21 | ##### Define model ######
22 | logger.info('Constructing Model..')
23 | model = getattr(models, args.model)(config)#initialize the model
24 | if args.reload_from>0:
25 | ckpt_path = f'./output/{args.model}/{args.dataset}/{args.timestamp}/models/step{args.reload_from}.h5'
26 | model.load_state_dict(torch.load(ckpt_path, map_location=device))
27 | model = model.to(device)
28 | model.eval()
29 |
30 | data_path = args.data_path+args.dataset+'/'
31 | use_set = eval(config['dataset_name'])(data_path, config['use_names'], config['name_len'],
32 | config['use_apis'], config['api_len'],
33 | config['use_tokens'], config['tokens_len'])
34 |
35 | data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=args.batch_size,
36 | shuffle=False, drop_last=False, num_workers=1)
37 |
38 | chunk_id = 0
39 | vecs, n_processed = [], 0
40 | for batch in tqdm(data_loader):
41 | batch_gpu = [tensor.to(device) for tensor in batch]
42 | with torch.no_grad():
43 | reprs = model.code_encoding(*batch_gpu).data.cpu().numpy()
44 | reprs = reprs.astype(np.float32) # [batch x dim]
45 | if config['sim_measure']=='cos': # do normalization for fast cosine computation
46 | reprs = normalize(reprs)
47 | vecs.append(reprs)
48 | n_processed=n_processed+ batch[0].size(0)
49 | if n_processed>= args.chunk_size:
50 | output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5"
51 | save_vecs(np.vstack(vecs), output_path)
52 | chunk_id+=1
53 | vecs, n_processed = [], 0
54 | # save the last chunk (probably incomplete)
55 | output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5"
56 | save_vecs(np.vstack(vecs), output_path)
57 |
58 | def parse_args():
59 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
60 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
61 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name')
62 | parser.add_argument('-d', '--dataset', type=str, default='github', help='dataset')
63 | parser.add_argument('-t', '--timestamp', type=str, help='time stamp')
64 | parser.add_argument('--reload_from', type=int, default=-1, help='step to reload from')
65 | parser.add_argument('--batch_size', type=int, default=10000, help='how many instances for encoding and normalization at each step')
66 | parser.add_argument('--chunk_size', type=int, default=2000000, help='split code vector into chunks and store them individually. '\
67 | 'Note: should be consistent with the same argument in the search.py')
68 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
69 | return parser.parse_args()
70 |
71 |
72 | if __name__ == '__main__':
73 | args = parse_args()
74 | repr_code(args)
75 |
76 |
77 |
--------------------------------------------------------------------------------
/pytorch/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | tables
4 | numpy
5 | scipy
6 | tqdm
7 | tensorboardX
8 | transformers
9 | nltk
--------------------------------------------------------------------------------
/pytorch/search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | import numpy as np
5 | import argparse
6 | import threading
7 | import codecs
8 | import logging
9 | logger = logging.getLogger(__name__)
10 | logging.basicConfig(level=logging.INFO, format="%(message)s")
11 |
12 | import torch
13 |
14 | from utils import normalize, similarity, sent2indexes
15 | from data_loader import load_dict, load_vecs
16 | import models, configs
17 |
18 | codevecs, codebase = [], []
19 |
20 | ##### Data Set #####
21 | def load_codebase(code_path, chunk_size=2000000):
22 | """load codebase
23 |     code_path: text file that stores raw code
24 | """
25 | logger.info(f'Loading codebase (chunk size={chunk_size})..')
26 | codebase= []
27 | codes = codecs.open(code_path, encoding='latin-1').readlines() # use codecs to read in case of encoding problem
28 | for i in range(0, len(codes), chunk_size):
29 | codebase.append(codes[i: i+chunk_size])
30 | '''
31 | import subprocess
32 | n_lines = int(subprocess.check_output(["wc", "-l", code_path], universal_newlines=True).split()[0])
33 | for i in range(1, n_lines+1, chunk_size):
34 | codecs = subprocess.check_output(["sed",'-n',f'{i},{i+chunk_size}p', code_path]).split()
35 | codebase.append(codecs)
36 | '''
37 | return codebase
38 |
39 | ### Results Data ###
40 | def load_codevecs(vec_path, chunk_size=2000000):
41 | logger.debug(f'Loading code vectors (chunk size={chunk_size})..')
42 | """read vectors (2D numpy array) from a hdf5 file"""
43 | codevecs=[]
44 | chunk_id = 0
45 | chunk_path = f"{vec_path[:-3]}_part{chunk_id}.h5"
46 | while os.path.exists(chunk_path):
47 | reprs = load_vecs(chunk_path)
48 | codevecs.append(reprs)
49 | chunk_id+=1
50 | chunk_path = f"{vec_path[:-3]}_part{chunk_id}.h5"
51 | return codevecs
52 |
53 | def search(config, model, vocab, query, n_results=10):
54 | model.eval()
55 | device = next(model.parameters()).device
56 |     desc, desc_len = sent2indexes(query, vocab, config['desc_len'])#convert query into word indices
57 | desc = torch.from_numpy(desc).unsqueeze(0).to(device)
58 | desc_len = torch.from_numpy(desc_len).clamp(max=config['desc_len']).to(device)
59 | with torch.no_grad():
60 | desc_repr = model.desc_encoding(desc, desc_len).data.cpu().numpy().astype(np.float32) # [1 x dim]
61 | if config['sim_measure']=='cos': # normalizing vector for fast cosine computation
62 | desc_repr = normalize(desc_repr) # [1 x dim]
63 | results =[]
64 | threads = []
65 | for i, codevecs_chunk in enumerate(codevecs):
66 | t = threading.Thread(target=search_thread, args = (results, desc_repr, codevecs_chunk, i, n_results, config['sim_measure']))
67 | threads.append(t)
68 | for t in threads:
69 | t.start()
70 | for t in threads:#wait until all sub-threads have completed
71 | t.join()
72 | return results
73 |
74 | def search_thread(results, desc_repr, codevecs, i, n_results, sim_measure):
75 | #1. compute code similarities
76 | if sim_measure=='cos':
77 | chunk_sims = np.dot(codevecs, desc_repr.T)[:,0] # [pool_size]
78 | else:
79 | chunk_sims = similarity(codevecs, desc_repr, sim_measure) # [pool_size]
80 |
81 | #2. select the top K results
82 | negsims = np.negative(chunk_sims)
83 | maxinds = np.argpartition(negsims, kth=n_results-1)
84 | maxinds = maxinds[:n_results]
85 | chunk_codes = [codebase[i][k] for k in maxinds]
86 | chunk_sims = chunk_sims[maxinds]
87 | results.extend(zip(chunk_codes, chunk_sims))
88 |
89 | def postproc(codes_sims):
90 | codes_, sims_ = zip(*codes_sims)
91 | codes = [code for code in codes_]
92 | sims = [sim for sim in sims_]
93 | final_codes = []
94 | final_sims = []
95 | n = len(codes_sims)
96 | for i in range(n):
97 | is_dup=False
98 | for j in range(i):
99 | if codes[i][:80]==codes[j][:80] and abs(sims[i]-sims[j])<0.01:
100 | is_dup=True
101 | if not is_dup:
102 | final_codes.append(codes[i])
103 | final_sims.append(sims[i])
104 | return zip(final_codes,final_sims)
105 |
106 | def parse_args():
107 | parser = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
108 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
109 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name')
110 |     parser.add_argument('-d', '--dataset', type=str, default='github', help='name of dataset: java, python')
111 | parser.add_argument('-t', '--timestamp', type=str, help='time stamp')
112 | parser.add_argument('--reload_from', type=int, default=-1, help='step to reload from')
113 | parser.add_argument('--chunk_size', type=int, default=2000000, help='codebase and code vector are stored in many chunks. '\
114 | 'Note: should be consistent with the same argument in the repr_code.py')
115 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
116 | return parser.parse_args()
117 |
118 |
119 | if __name__ == '__main__':
120 | args = parse_args()
121 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
122 | config = getattr(configs, 'config_'+args.model)()
123 |
124 | ##### Define model ######
125 | logger.info('Constructing Model..')
126 | model = getattr(models, args.model)(config)#initialize the model
127 | ckpt=f'./output/{args.model}/{args.dataset}/{args.timestamp}/models/step{args.reload_from}.h5'
128 | model.load_state_dict(torch.load(ckpt, map_location=device))
129 | model.eval()
130 | data_path = args.data_path+args.dataset+'/'
131 |
132 | vocab_desc = load_dict(data_path+config['vocab_desc'])
133 | codebase = load_codebase(data_path+config['use_codebase'], args.chunk_size)
134 | codevecs = load_codevecs(data_path+config['use_codevecs'], args.chunk_size)
135 | assert len(codebase)==len(codevecs), \
136 | "inconsistent number of chunks, check whether the specified files for codebase and code vectors are correct!"
137 |
138 | while True:
139 | try:
140 | query = input('Input Query: ')
141 | n_results = int(input('How many results? '))
142 | except Exception:
143 | print("Exception while parsing your input:")
144 | traceback.print_exc()
145 | break
146 | query = query.lower().replace('how to ', '').replace('how do i ', '').replace('how can i ', '').replace('?', '').strip()
147 | results = search(config, model, vocab_desc, query, n_results)
148 | results = sorted(results, reverse=True, key=lambda x:x[1])
149 | results = postproc(results)
150 | results = list(results)[:n_results]
151 | results = '\n\n'.join(map(str,results)) #combine the result into a returning string
152 | print(results)
153 |
154 |
--------------------------------------------------------------------------------
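`search_thread` ranks a chunk of normalized code vectors against the query vector with a dot product and picks the top K via `np.argpartition` on the negated similarities. A self-contained sketch of that selection step on random vectors (pool size and dimensions are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
codevecs = rng.standard_normal((1000, 8)).astype(np.float32)   # toy code-vector chunk
desc_repr = rng.standard_normal((1, 8)).astype(np.float32)     # toy query vector

# normalize rows so that a dot product equals cosine similarity (as repr_code.py does)
codevecs /= np.linalg.norm(codevecs, axis=1, keepdims=True)
desc_repr /= np.linalg.norm(desc_repr, axis=1, keepdims=True)

n_results = 5
chunk_sims = np.dot(codevecs, desc_repr.T)[:, 0]                        # [pool_size]
maxinds = np.argpartition(-chunk_sims, kth=n_results - 1)[:n_results]   # unordered top K
maxinds = maxinds[np.argsort(-chunk_sims[maxinds])]                     # order them by similarity
print(maxinds, chunk_sims[maxinds])
```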
/pytorch/setup.py:
--------------------------------------------------------------------------------
1 | #nsml: nsml/ml:cuda10.1-cudnn7-pytorch1.3keras2.3
2 | from distutils.core import setup
3 | setup(
4 | author='Xiaodong Gu',
5 | author_email='xiaodong.gu@navercorp.com',
6 | name='DeepCS',
7 | version='0.1',
8 | description='Hyperparameter tuning',
9 | install_requires = [
10 | 'numpy',
11 | 'protobuf',
12 | 'six',
13 | 'tables',
14 | 'tensorboardX',
15 | 'tqdm',
16 | 'transformers',
17 | ]
18 | )
19 |
--------------------------------------------------------------------------------
/pytorch/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import random
4 | import time
5 | from datetime import datetime
6 | import numpy as np
7 | import math
8 | import argparse
9 | random.seed(42)
10 | from tqdm import tqdm
11 |
12 | import logging
13 | logger = logging.getLogger(__name__)
14 | logging.basicConfig(level=logging.INFO, format="%(message)s")
15 | from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package
16 |
17 | import torch
18 |
19 | import models, configs, data_loader
20 | from modules import get_cosine_schedule_with_warmup
21 | from utils import similarity, normalize
22 | from data_loader import *
23 |
24 | try:
25 | import nsml
26 | from nsml import DATASET_PATH, IS_ON_NSML, SESSION_NAME
27 | except:
28 | IS_ON_NSML = False
29 |
30 | def bind_nsml(model, **kwargs):
31 | if type(model) == torch.nn.DataParallel: model = model.module
32 | def infer(raw_data, **kwargs):
33 | pass
34 | def load(path, *args):
35 | global global_step
36 | state = torch.load(os.path.join(path, 'model.pt'))
37 | model.load_state_dict(state['model'])
38 | global_step = state['step']
39 | if 'optimizer' in state and optimizer:
40 | optimizer.load_state_dict(state['optimizer'])
41 | logger.info(f'Load checkpoints...!{path}')
42 | def save(path, *args):
43 | global global_step
44 | state = {
45 | 'model': model.state_dict(),
46 | 'step' : global_step
47 | }
48 | torch.save(state, os.path.join(path, 'model.pt'))
49 | logger.info(f'Save checkpoints...!{path}')
50 | # function in function is just used to divide the namespace.
51 | nsml.bind(save=save, load=load, infer=infer)
52 |
53 |
54 | def train(args):
55 | timestamp = datetime.now().strftime('%Y%m%d%H%M')
56 | # make output directory if it doesn't already exist
57 | os.makedirs(f'./output/{args.model}/{args.dataset}/{timestamp}/models', exist_ok=True)
58 | os.makedirs(f'./output/{args.model}/{args.dataset}/{timestamp}/tmp_results', exist_ok=True)
59 |
60 | fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/{timestamp}/logs.txt")
61 | # create file handler which logs even debug messages
62 | logger.addHandler(fh)# add the handlers to the logger
63 |
64 | tb_writer = SummaryWriter(f"./output/{args.model}/{args.dataset}/{timestamp}/logs/" ) if args.visual else None
65 |
66 | random.seed(args.seed)
67 | np.random.seed(args.seed)
68 | torch.manual_seed(args.seed)
69 | torch.cuda.manual_seed(args.seed)
70 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
71 |
72 | config=getattr(configs, 'config_'+args.model)()
73 | if args.automl:
74 | config.update(vars(args))
75 | print(config)
76 |
77 | ###############################################################################
78 | # Load data
79 | ###############################################################################
80 | data_path = DATASET_PATH+"/train/" if IS_ON_NSML else args.data_path+args.dataset+'/'
81 | train_set = eval(config['dataset_name'])(data_path, config['train_name'], config['name_len'],
82 | config['train_api'], config['api_len'],
83 | config['train_tokens'], config['tokens_len'],
84 | config['train_desc'], config['desc_len'])
85 | valid_set = eval(config['dataset_name'])(data_path,
86 | config['valid_name'], config['name_len'],
87 | config['valid_api'], config['api_len'],
88 | config['valid_tokens'], config['tokens_len'],
89 | config['valid_desc'], config['desc_len'])
90 | data_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
91 | shuffle=True, drop_last=True, num_workers=1)
92 |
93 | ###############################################################################
94 | # Define Model
95 | ###############################################################################
96 | logger.info('Constructing Model..')
97 | model = getattr(models, args.model)(config)#initialize the model
98 |
99 | def save_model(model, ckpt_path):
100 | torch.save(model.state_dict(), ckpt_path)
101 |
102 | def load_model(model, ckpt_path, to_device):
103 |         assert os.path.exists(ckpt_path), f'Weights not found at {ckpt_path}'
104 | model.load_state_dict(torch.load(ckpt_path, map_location=to_device))
105 |
106 | if args.reload_from>0:
107 | ckpt = f'./output/{args.model}/{args.dataset}/{timestamp}/models/step{args.reload_from}.h5'
108 | load_model(model, ckpt, device)
109 |
110 | if IS_ON_NSML:
111 | bind_nsml(model)
112 | if args.pause:
113 | nsml.paused(locals())
114 |
115 | model.to(device)
116 |
117 | ###############################################################################
118 | # Prepare the Optimizer
119 | ###############################################################################
120 |
121 | no_decay = ['bias', 'LayerNorm.weight']
122 | optimizer_grouped_parameters = [
123 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
124 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
125 | ]
126 | optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=config['adam_epsilon'])
127 | scheduler = get_cosine_schedule_with_warmup(
128 | optimizer, num_warmup_steps=config['warmup_steps'],
129 |             num_training_steps=len(data_loader)*config['nb_epoch']) # do not forget to update this when the dataset changes
130 | if config['fp16']:
131 | try:
132 | from apex import amp
133 | except ImportError:
134 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
135 | model, optimizer = amp.initialize(model, optimizer, opt_level=config['fp16_opt_level'])
136 |
137 | ###############################################################################
138 | # Training Process
139 | ###############################################################################
140 | n_iters = len(data_loader)
141 | global global_step
142 | global_step = args.reload_from+1
143 | for epoch in range(int(args.reload_from/n_iters)+1, config['nb_epoch']+1):
144 | itr_start_time = time.time()
145 | losses=[]
146 | for batch in data_loader:
147 |
148 | model.train()
149 | batch_gpu = [tensor.to(device) for tensor in batch]
150 | loss = model(*batch_gpu)
151 |
152 | if config['fp16']:
153 | with amp.scale_loss(loss, optimizer) as scaled_loss:
154 | scaled_loss.backward()
155 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 5.0)
156 | else:
157 | loss.backward()
158 | torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
159 |
160 | optimizer.step()
161 | scheduler.step()
162 | model.zero_grad()
163 |
164 | losses.append(loss.item())
165 |
166 | if global_step % args.log_every ==0:
167 | elapsed = time.time() - itr_start_time
168 | logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f'%
169 | (epoch, config['nb_epoch'], global_step%n_iters, n_iters, elapsed, np.mean(losses)))
170 | if tb_writer is not None:
171 | tb_writer.add_scalar('loss', np.mean(losses), global_step)
172 | if IS_ON_NSML:
173 | summary = {"summary": True, "scope": locals(), "step": global_step}
174 | summary.update({'loss':np.mean(losses)})
175 | nsml.report(**summary)
176 |
177 | losses=[]
178 | itr_start_time = time.time()
179 | global_step = global_step + 1
180 |
181 | if global_step % args.valid_every == 0:
182 | logger.info("validating..")
183 | valid_result = validate(valid_set, model, 100000, 1, config['sim_measure'])
184 | logger.info(valid_result)
185 | if tb_writer is not None:
186 | for key, value in valid_result.items():
187 | tb_writer.add_scalar(key, value, global_step)
188 | if IS_ON_NSML:
189 | summary = {"summary": True, "scope": locals(), "step": global_step}
190 | summary.update(valid_result)
191 | nsml.report(**summary)
192 |
193 | if global_step % args.save_every == 0:
194 | ckpt_path = f'./output/{args.model}/{args.dataset}/{timestamp}/models/step{global_step}.h5'
195 | save_model(model, ckpt_path)
196 | if IS_ON_NSML:
197 | nsml.save(checkpoint=f'model_step{global_step}')
198 |
199 | ##### Evaluation #####
200 | def validate(valid_set, model, pool_size, K, sim_measure):
201 | """
202 | simple validation in a code pool.
203 | @param: poolsize - size of the code pool, if -1, load the whole test set
204 | """
205 | def ACC(real,predict):
206 | sum=0.0
207 | for val in real:
208 | try: index=predict.index(val)
209 | except ValueError: index=-1
210 | if index!=-1: sum=sum+1
211 | return sum/float(len(real))
212 | def MAP(real,predict):
213 | sum=0.0
214 | for id, val in enumerate(real):
215 | try: index=predict.index(val)
216 | except ValueError: index=-1
217 | if index!=-1: sum=sum+(id+1)/float(index+1)
218 | return sum/float(len(real))
219 | def MRR(real, predict):
220 | sum=0.0
221 | for val in real:
222 | try: index = predict.index(val)
223 | except ValueError: index=-1
224 | if index!=-1: sum=sum+1.0/float(index+1)
225 | return sum/float(len(real))
226 | def NDCG(real, predict):
227 | dcg=0.0
228 | idcg=IDCG(len(real))
229 | for i, predictItem in enumerate(predict):
230 | if predictItem in real:
231 | itemRelevance = 1
232 | rank = i+1
233 | dcg +=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(rank+1))
234 | return dcg/float(idcg)
235 | def IDCG(n):
236 | idcg=0
237 | itemRelevance=1
238 | for i in range(n): idcg+=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(i+2))
239 | return idcg
240 |
241 | model.eval()
242 | device = next(model.parameters()).device
243 |
244 | data_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=10000,
245 | shuffle=True, drop_last=True, num_workers=1)
246 | accs, mrrs, maps, ndcgs=[],[],[],[]
247 | code_reprs, desc_reprs = [], []
248 | n_processed = 0
249 | for batch in tqdm(data_loader):
250 | if len(batch) == 10: # names, name_len, apis, api_len, toks, tok_len, descs, desc_len, bad_descs, bad_desc_len
251 | code_batch = [tensor.to(device) for tensor in batch[:6]]
252 | desc_batch = [tensor.to(device) for tensor in batch[6:8]]
253 | else: # code_ids, type_ids, code_mask, good_ids, good_mask, bad_ids, bad_mask
254 | code_batch = [tensor.to(device) for tensor in batch[:3]]
255 | desc_batch = [tensor.to(device) for tensor in batch[3:5]]
256 | with torch.no_grad():
257 | code_repr=model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
258 | desc_repr=model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32) # [poolsize x hid_size]
259 | if sim_measure=='cos':
260 | code_repr = normalize(code_repr)
261 | desc_repr = normalize(desc_repr)
262 | code_reprs.append(code_repr)
263 | desc_reprs.append(desc_repr)
264 | n_processed += batch[0].size(0)
265 | code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
266 |
267 | for k in tqdm(range(0, n_processed, pool_size)):
268 | code_pool, desc_pool = code_reprs[k:k+pool_size], desc_reprs[k:k+pool_size]
269 | for i in range(min(10000, pool_size)): # for i in range(pool_size):
270 | desc_vec = np.expand_dims(desc_pool[i], axis=0) # [1 x dim]
271 | n_results = K
272 | if sim_measure=='cos':
273 | sims = np.dot(code_pool, desc_vec.T)[:,0] # [pool_size]
274 | else:
275 | sims = similarity(code_pool, desc_vec, sim_measure) # [pool_size]
276 |
277 | negsims=np.negative(sims)
278 | predict = np.argpartition(negsims, kth=n_results-1)#predict=np.argsort(negsims)#
279 | predict = predict[:n_results]
280 | predict = [int(k) for k in predict]
281 | real = [i]
282 | accs.append(ACC(real,predict))
283 | mrrs.append(MRR(real,predict))
284 | maps.append(MAP(real,predict))
285 | ndcgs.append(NDCG(real,predict))
286 | return {'acc':np.mean(accs), 'mrr': np.mean(mrrs), 'map': np.mean(maps), 'ndcg': np.mean(ndcgs)}
287 |
288 | def parse_args():
289 | parser = argparse.ArgumentParser("Train and Validate The Code Search (Embedding) Model")
290 | parser.add_argument('--data_path', type=str, default='./data/', help='location of the data corpus')
291 | parser.add_argument('--model', type=str, default='JointEmbeder', help='model name: JointEmbeder, SelfAttnModel')
292 |     parser.add_argument('--dataset', type=str, default='github', help='name of dataset: java, python')
293 | parser.add_argument('--reload_from', type=int, default=-1, help='epoch to reload from')
294 |
295 | parser.add_argument('-g', '--gpu_id', type=int, default=0, help='GPU ID')
296 | parser.add_argument('-v', "--visual",action="store_true", default=False, help="Visualize training status in tensorboard")
297 | parser.add_argument('--automl', action='store_true', default=False, help='use automl')
298 | # Training Arguments
299 |     parser.add_argument('--log_every', type=int, default=100, help='interval to log training results')
300 |     parser.add_argument('--valid_every', type=int, default=10000, help='interval to run validation')
301 |     parser.add_argument('--save_every', type=int, default=50000, help='interval to save model checkpoints')
302 | parser.add_argument('--seed', type=int, default=1111, help='random seed')
303 |
304 | parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
305 |
306 | # Model Hyperparameters for automl tuning
307 | #parser.add_argument('--emb_size', type=int, default=-1, help = 'embedding dim')
308 | parser.add_argument('--n_hidden', type=int, default= -1, help='number of hidden dimension of code/desc representation')
309 | parser.add_argument('--lstm_dims', type=int, default= -1)
310 | parser.add_argument('--margin', type=float, default= -1)
311 | parser.add_argument('--sim_measure', type=str, default = 'cos', help='similarity measure for training')
312 |
313 | parser.add_argument('--learning_rate', type=float, help='learning rate')
314 | #parser.add_argument('--adam_epsilon', type=float)
315 | #parser.add_argument("--weight_decay", type=float, help="Weight deay if we apply some.")
316 | #parser.add_argument('--warmup_steps', type=int)
317 |
318 | # reserved args for automl pbt
319 | parser.add_argument('--pause', default=0, type=int)
320 | parser.add_argument('--iteration', default=0, type=str)
321 |
322 | return parser.parse_args()
323 |
324 | if __name__ == '__main__':
325 | args = parse_args()
326 |
327 | torch.backends.cudnn.benchmark = True # speed up training by using cudnn
328 | torch.backends.cudnn.deterministic = True # fix the random seed in cudnn
329 |
330 | train(args)
331 |
332 |
--------------------------------------------------------------------------------
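In `validate`, every description queries a pool of code vectors and the metrics check where its own snippet lands in the top-K list (`real = [i]`). For a single query, MRR reduces to `1/rank` when the snippet is retrieved and 0 otherwise; a tiny check restating the `MRR` helper from `validate` above:

```python
def MRR(real, predict):
    """Mean reciprocal rank, restated from validate() above."""
    total = 0.0
    for val in real:
        try:
            index = predict.index(val)
        except ValueError:
            index = -1
        if index != -1:
            total += 1.0 / float(index + 1)
    return total / float(len(real))

print(MRR([7], [3, 7, 11]))   # own snippet ranked 2nd  -> 0.5
print(MRR([7], [3, 11, 42]))  # own snippet not in top K -> 0.0
```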
/pytorch/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import torch
5 | from torch.nn import functional as F
6 |
7 | PAD_ID, SOS_ID, EOS_ID, UNK_ID = [0, 1, 2, 3]
8 |
9 | def cos_approx(data1,data2):
10 | """numpy implementation of cosine similarity for matrix"""
11 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
12 | dotted = np.dot(data1,np.transpose(data2))
13 | norm1 = np.linalg.norm(data1,axis=1)
14 | norm2 = np.linalg.norm(data2,axis=1)
15 | matrix_vector_norms = np.multiply(norm1, norm2)
16 | neighbors = np.divide(dotted, matrix_vector_norms)
17 | return neighbors
18 |
19 | def normalize(data):
20 | """normalize matrix by rows"""
21 | return data/np.linalg.norm(data,axis=1,keepdims=True)
22 |
23 | def dot_np(data1,data2):
24 | """cosine similarity for normalized vectors"""
25 | #print("warning: the second matrix will be transposed, so try to put the simpler matrix as the second argument in order to save time.")
26 | return np.dot(data1, data2.T)
27 |
28 | def sigmoid(x):
29 | return 1/(1 + np.exp(-x))
30 |
31 | def similarity(vec1, vec2, measure='cos'):
32 | if measure=='cos':
33 | vec1_norm = normalize(vec1)
34 | vec2_norm = normalize(vec2)
35 | return np.dot(vec1_norm, vec2_norm.T)[:,0]
36 | elif measure=='poly':
37 | return (0.5*np.dot(vec1, vec2.T).diagonal()+1)**2
38 | elif measure=='sigmoid':
39 | return np.tanh(np.dot(vec1, vec2.T).diagonal()+1)
40 | elif measure in ['euc', 'gesd', 'aesd']: #https://arxiv.org/pdf/1508.01585.pdf
41 | euc_dist = np.linalg.norm(vec1-vec2, axis=1)
42 | euc_sim = 1 / (1 + euc_dist)
43 | if measure=='euc': return euc_sim
44 | sigmoid_sim = sigmoid(np.dot(vec1, vec2.T).diagonal()+1)
45 | if measure == 'gesd': return euc_sim * sigmoid_sim
46 | elif measure == 'aesd': return 0.5*(euc_sim+sigmoid_sim)
47 |
48 | #######################################################################
49 |
50 | def asMinutes(s):
51 | m = math.floor(s / 60)
52 | s -= m * 60
53 | return '%d:%d'% (m, s)
54 |
55 | def timeSince(since, percent):
56 | now = time.time()
57 | s = now - since
58 | es = s / (percent)
59 | rs = es - s
60 | return '%s<%s'%(asMinutes(s), asMinutes(rs))
61 |
62 | #######################################################################
63 | import nltk
64 | try: nltk.word_tokenize("hello world")
65 | except LookupError: nltk.download('punkt')
66 |
67 | def sent2indexes(sentence, vocab, maxlen):
68 | '''sentence: a string or list of string
69 | return: a numpy array of word indices
70 | '''
71 | def convert_sent(sent, vocab, maxlen):
72 | idxes = np.zeros(maxlen, dtype=np.int64)
73 | idxes.fill(PAD_ID)
74 | tokens = nltk.word_tokenize(sent.strip())
75 | idx_len = min(len(tokens), maxlen)
76 | for i in range(idx_len): idxes[i] = vocab.get(tokens[i], UNK_ID)
77 | return idxes, idx_len
78 | if type(sentence) is list:
79 | inds, lens = [], []
80 | for sent in sentence:
81 | idxes, idx_len = convert_sent(sent, vocab, maxlen)
82 | #idxes, idx_len = np.expand_dims(idxes, 0), np.array([idx_len])
83 | inds.append(idxes)
84 | lens.append(idx_len)
85 | return np.vstack(inds), np.vstack(lens)
86 | else:
87 | inds, lens = sent2indexes([sentence], vocab, maxlen)
88 | return inds[0], lens[0]
89 |
90 | def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
91 | '''indexes: numpy array'''
92 | def revert_sent(indexes, ivocab, ignore_tok=PAD_ID):
93 | indexes=filter(lambda i: i!=ignore_tok, indexes)
94 | toks, length = [], 0
95 | for idx in indexes:
96 | toks.append(ivocab.get(idx, ''))
97 | length+=1
98 | if idx == EOS_ID:
99 | break
100 | return ' '.join(toks), length
101 |
102 | ivocab = {v: k for k, v in vocab.items()}
103 | if indexes.ndim==1:# one sentence
104 | return revert_sent(indexes, ivocab, ignore_tok)
105 | else:# dim>1
106 | sentences, lens =[], [] # a batch of sentences
107 | for inds in indexes:
108 | sentence, length = revert_sent(inds, ivocab, ignore_tok)
109 | sentences.append(sentence)
110 | lens.append(length)
111 | return sentences, lens
112 |
113 | ########################################################################
114 |
--------------------------------------------------------------------------------
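`sent2indexes` and `indexes2sent` map between token strings and fixed-length index arrays. A small round-trip sketch with a toy vocabulary (the vocabulary dict is made up for illustration, and nltk must be installed since `utils.py` imports it):

```python
from utils import sent2indexes, indexes2sent

# toy vocabulary; index 0 is PAD_ID and index 3 is UNK_ID, as in utils.py
vocab = {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3,
         'read': 4, 'file': 5, 'to': 6, 'string': 7}

inds, length = sent2indexes('read file to string', vocab, maxlen=6)
print(inds, length)        # [4 5 6 7 0 0] plus the token count; padded with PAD_ID

sent, n_toks = indexes2sent(inds, vocab)
print(sent, n_toks)        # 'read file to string' 4
```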