├── .gitignore
├── LICENSE
├── README.md
├── dataset
│   ├── README.md
│   ├── g4_128.npy
│   └── googlejam4.tar.gz
└── dcsim
    ├── .gitignore
    ├── .idea
    │   ├── codeStyleSettings.xml
    │   ├── codeStyles
    │   │   └── codeStyleConfig.xml
    │   ├── inspectionProfiles
    │   │   └── Project_Default.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── other.xml
    │   ├── preferred-vcs.xml
    │   ├── samples.iml
    │   ├── vcs.xml
    │   ├── workspace (zg的MacBook Pro's conflicted copy 2017-08-28).xml
    │   └── workspace.xml
    ├── README.md
    ├── classification.py
    ├── classification_bigbench_keras.py
    ├── encoding
    │   ├── .idea
    │   │   ├── artifacts
    │   │   │   └── SourceCodeSimilarity_jar.xml
    │   │   ├── misc.xml
    │   │   ├── modules.xml
    │   │   └── workspace.xml
    │   ├── SourceCodeSimilarity.iml
    │   ├── bin
    │   │   └── META-INF
    │   │       └── MANIFEST.MF
    │   ├── encoding.jar
    │   └── src
    │       ├── DefaultExclusions.txt
    │       ├── EclipseDefaultInclusions.txt
    │       ├── EmptyExclusion.txt
    │       └── Encoder.java
    ├── graph_mat_data.py
    ├── preprocessing.py
    ├── preprocessing_bigbench.py
    ├── sda_base.py
    └── sda_unsup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

*.ckpt

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 zhaogang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DeepSim

This project is a prototype implementation of DeepSim, a deep learning-based approach to measuring code functional similarity. If you find the tool useful in your work, please cite our FSE 2018 paper:

**"DeepSim: Deep Learning Code Functional Similarity"**. In Proceedings of the 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE), 2018.

## Setup

*We haven't created a setup script yet. For now, please install the dependency packages manually.*

We tested DeepSim on Ubuntu 16.04.3 64-bit. Our hardware environment includes:
- Intel i7 6700K, 4.0GHz
- NVIDIA GTX 1080, 8GB
- DDR4 3000MHz, 48GB

The dependency packages are listed below:
- Python 2.7
- TensorFlow 1.3 (higher versions should be fine)
- Keras 2.x
- All the other packages required by the above packages

These can be easily installed through [Conda](https://anaconda.org/), following the instructions on [TensorFlow](https://www.tensorflow.org). For the encoding part, we have already included the WALA jar package.

## How to Run It

To run the tool, you first need to generate encoded matrices from Java bytecode files. We provide the executable jar file `encoding.jar` in the `encoding` folder. Once you have compiled your Java source code files into a jar package, run the command below to generate the matrices:

```bash
./encoding.jar your-bytecode-jar-path.jar
```

The generated matrices will be stored in the folder *data* under your current working directory. (We have already tested this tool on a set of Java projects and it works well. If you find any crashes/errors, please post an issue here.)

*NOTE: the default matrix size is 128. If you want to change this, just change the value of the variable `fixedSize` in the `Encoder.java` source code file.*

If you just want to give the tool a quick try, we also provide the matrices we generated for the GCJ dataset used in our paper. They are in the `dataset` folder. In particular, the matrices are stored using numpy's dump function, so you can read them directly with the code below:

```Python
import numpy as np

file_path = "path-to-the-datafile/g4_128.npy"
dataset = np.load(open(file_path, 'rb'))
X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int)
```

Each sample here is stored in a sparse format: for each 88-dimensional feature vector, we only store the indices at which the value is 1. If you want to visualize a sample (like the one in our paper), just convert it back to a dense matrix; a minimal sketch of this conversion is shown below.

After getting the matrices, you can run `classification.py` to train the model. By default it runs a 10-fold cross-validation experiment.
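As mentioned above, here is a minimal sketch of that sparse-to-dense conversion (assuming `numpy`; it mirrors the `from_sparse_arr` helper in `dcsim/classification.py`, and `to_dense` is a hypothetical name used only for illustration):

```Python
import numpy as np

dim = 128         # matrix size (rows and columns)
bin_vec_dim = 88  # length of each binary feature vector

def to_dense(sparse_sample):
    # Each sparse sample is a list of (row, column, feature-index) triples
    # marking the positions whose value is 1.
    mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32)
    for (i, j, k) in sparse_sample:
        mat[i, j, k] = 1
    return mat
```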
*You may need to change some paths to your desired folders, since we haven't cleaned the code yet.* Feel free to tweak the hyperparameters (batch size, learning rate, layer size, class weights, etc.).

In our environment, each run of the 10-fold cross-validation takes nearly 3.75 hours. If you run it on a weaker GPU, expect a longer time to finish. If you use a larger batch size, make sure that you have enough memory, since each dense sample contains 128*128*88 elements. If the results you get differ from those reported in the paper, change the hyperparameters to the values presented in the paper (if you are running on the GCJ dataset), or write a simple script to find the best parameter setting for your dataset.

Running the other two baseline models is similar.

## NOTE

We are working on a set of improved models, some of which try to address
the limitations of this work. We hope to finish and release them soon.

In addition, we will probably include a simple web project in this repo for collecting larger
and more comprehensive training samples (though we will not host it on our server).

--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
# dcsim_dataset
Dataset (projects collected from the Google Code Jam competition) for the DCSim project.

--------------------------------------------------------------------------------
/dataset/g4_128.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parasol-aser/deepsim/6134ac9593806121e7541d1c6c52f5533c38f728/dataset/g4_128.npy

--------------------------------------------------------------------------------
/dataset/googlejam4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parasol-aser/deepsim/6134ac9593806121e7541d1c6c52f5533c38f728/dataset/googlejam4.tar.gz

--------------------------------------------------------------------------------
/dcsim/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# output folder
out/
idea/
encoding/idea/

--------------------------------------------------------------------------------
/dcsim/.idea/codeStyleSettings.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/other.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/preferred-vcs.xml:
--------------------------------------------------------------------------------
ApexVCS
(remaining XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/samples.iml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/vcs.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/README.md:
--------------------------------------------------------------------------------
# dcsim
A deep-learning-powered model for measuring code similarity.
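A minimal usage sketch (illustrative only; it mirrors the `__main__` block of `classification.py` and assumes the GCJ matrices are available at `../dataset/g4_128.npy`, as the training code expects):

```Python
# Illustrative quick start: these are the same two steps that
# classification.py runs in its __main__ block.
import classification

# 10-fold cross-validation training; per-fold metrics are written to
# result/10_fold_balanced.txt and checkpoints to 10_fold_balanced/<fold>/.
classification.train_10_fold_balanced()

# Restore a saved fold's checkpoint and evaluate on all sample pairs.
classification.predict_on_full_dataset()
```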
--------------------------------------------------------------------------------
/dcsim/classification.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time

import matplotlib
from matplotlib.ticker import NullFormatter
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

sns.set(style='white')
# matplotlib.rcParams['font.family']=''
matplotlib.rcParams['font.weight'] = 'bold'

import graph_mat_data

bin_vec_dim = 88
embedding_dim = 6
dim = 128
keep_prob = 0.75

batch_size = 256
test_size = 256

beta = 0.00003
# beta = 0.00001  # for model with batch normalization
reg_term = None

# disable tensorflow debugging information
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

logdir = '/tmp/tf_logs'


def _to_tensor(x, dtype):
    """Convert the input `x` to a tensor of type `dtype`.
    # Arguments
        x: An object to be converted (numpy array, list, tensors).
        dtype: The destination type.
    # Returns
        A tensor.
    """
    x = tf.convert_to_tensor(x)
    if x.dtype != dtype:
        x = tf.cast(x, dtype)
    return x


def relu(x, alpha=0., max_value=None):
    """Rectified linear unit.
    With default values, it returns element-wise `max(x, 0)`.
    # Arguments
        x: A tensor or variable.
        alpha: A scalar, slope of negative section (default=`0.`).
        max_value: Saturation threshold.
    # Returns
        A tensor.
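    # Example
        An illustrative call (not part of the original docstring):
        `relu(x, alpha=0.1, max_value=6.)` acts as a leaky ReLU with slope
        0.1 on the negative side, saturated at 6 on the positive side.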
64 | """ 65 | if alpha != 0.: 66 | negative_part = tf.nn.relu(-x) 67 | x = tf.nn.relu(x) 68 | if max_value is not None: 69 | max_value = _to_tensor(max_value, x.dtype.base_dtype) 70 | zero = _to_tensor(0., x.dtype.base_dtype) 71 | x = tf.clip_by_value(x, zero, max_value) 72 | if alpha != 0.: 73 | alpha = _to_tensor(alpha, x.dtype.base_dtype) 74 | x -= alpha * negative_part 75 | return x 76 | 77 | 78 | def batch_act(h, act, phase, scope): 79 | with tf.variable_scope(scope): 80 | return act(h) 81 | 82 | 83 | def variable_summaries(var): 84 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 85 | with tf.name_scope('summaries'): 86 | mean = tf.reduce_mean(var) 87 | tf.summary.scalar('mean', mean) 88 | with tf.name_scope('stddev'): 89 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 90 | tf.summary.scalar('stddev', stddev) 91 | tf.summary.scalar('max', tf.reduce_max(var)) 92 | tf.summary.scalar('min', tf.reduce_min(var)) 93 | tf.summary.histogram('histogram', var) 94 | 95 | 96 | def init_weights(shape, name): 97 | return tf.get_variable(name=name, shape=shape, dtype=tf.float32, 98 | initializer=tf.contrib.layers.variance_scaling_initializer( 99 | factor=1.0, mode='FAN_AVG', uniform=True)) 100 | 101 | def init_bias(shape, name): 102 | if len(shape) > 1: 103 | raise Exception('Bias should be a vector.') 104 | return tf.get_variable(name=name, shape=shape, dtype=tf.float32, 105 | initializer=tf.constant_initializer( 106 | 0.01)) 107 | 108 | def model(X, dropout, phase): 109 | global reg_term 110 | num = tf.shape(X)[0] 111 | with tf.name_scope('emb_layer'): 112 | wf = init_weights([bin_vec_dim, embedding_dim], 'wf') 113 | reg_term = tf.nn.l2_loss(wf) 114 | variable_summaries(wf) 115 | bf = init_bias([embedding_dim], 'bf') 116 | variable_summaries(bf) 117 | X = tf.reshape(X, [num * dim * dim, bin_vec_dim]) 118 | h0 = tf.nn.bias_add(tf.matmul(X, wf), bf) 119 | h0 = batch_act(h0, phase=phase, act=tf.nn.elu, scope='emb_layer_bn') 120 | h0 = tf.reshape(h0, [num * dim, dim * embedding_dim]) 121 | h0 = tf.nn.dropout(h0, dropout) 122 | with tf.name_scope('row_fc_layer1'): 123 | wr1 = init_weights([embedding_dim * dim, 256], 'wr1') # 128 124 | reg_term += tf.nn.l2_loss(wr1) 125 | br1 = init_bias([256], 'br1') 126 | h1 = tf.nn.bias_add(tf.matmul(h0, wr1), br1) 127 | h1 = batch_act(h1, phase=phase, act=tf.nn.elu, scope='row_fc_layer1_bn') 128 | h1 = tf.nn.dropout(h1, dropout) 129 | with tf.name_scope('row_fc_layer2'): 130 | wr2 = init_weights([256, 64], 'wr2') # 32 131 | reg_term += tf.nn.l2_loss(wr2) 132 | br2 = init_bias([64], 'br2') 133 | h2 = tf.nn.bias_add(tf.matmul(h1, wr2), br2) 134 | h2 = batch_act(h2, phase=phase, act=tf.nn.elu, scope='row_fc_layer2_bn') 135 | h2 = tf.reshape(h2, [num, dim, 64]) # 32 136 | with tf.name_scope('avg_pooling'): 137 | h3 = tf.reduce_mean(h2, 1) 138 | return h3 139 | 140 | 141 | def classification(X1, X2, dropout, phase): 142 | global reg_term 143 | with tf.variable_scope('encoding') as scope: 144 | h31 = model(X1, dropout, phase) 145 | scope.reuse_variables() 146 | h32 = model(X2, dropout, phase) 147 | h41 = tf.concat(values=[h31, h32], axis=1) 148 | with tf.name_scope('fc_layer1_1'): 149 | w5 = init_weights([128, 32], 'w5') # 64 16 150 | reg_term += tf.nn.l2_loss(w5) 151 | b5 = init_bias([32], 'b5') 152 | h5_1 = tf.nn.bias_add(tf.matmul(h41, w5), b5) 153 | h5_1 = batch_act(h5_1, phase=phase, act=tf.nn.elu, 154 | scope='fc_layer1_1_bn') 155 | h42 = tf.concat(values=[h32, h31], axis=1) 156 | with tf.name_scope('fc_layer1_2'): 157 | h5_2 
= tf.nn.bias_add(tf.matmul(h42, w5), b5) 158 | h5_2 = batch_act(h5_2, phase=phase, act=tf.nn.elu, 159 | scope='fc_layer1_2_bn') 160 | h5 = (h5_1 + h5_2) / 2. 161 | with tf.name_scope('sm_layer'): 162 | w7 = init_weights([32, 2], 'w7') 163 | reg_term += tf.nn.l2_loss(w7) 164 | variable_summaries(w7) 165 | o = tf.matmul(h5, w7) 166 | return o 167 | 168 | 169 | def classification_predict(hl, hr, dropout, phase): 170 | h41 = tf.concat(values=[hl, hr], axis=1) 171 | with tf.name_scope('fc_layer1_1'): 172 | w5 = init_weights([128, 32], 'w5') # 64 16 173 | b5 = init_bias([32], 'b5') 174 | h5_1 = tf.nn.bias_add(tf.matmul(h41, w5), b5) 175 | h5_1 = batch_act(h5_1, phase=phase, act=tf.nn.elu, 176 | scope='fc_layer1_1_bn') 177 | h42 = tf.concat(values=[hr, hl], axis=1) 178 | with tf.name_scope('fc_layer1_2'): 179 | h5_2 = tf.nn.bias_add(tf.matmul(h42, w5), b5) 180 | h5_2 = batch_act(h5_2, phase=phase, act=tf.nn.elu, 181 | scope='fc_layer1_2_bn') 182 | h5 = (h5_1 + h5_2) / 2. 183 | with tf.name_scope('sm_layer'): 184 | w7 = init_weights([32, 2], 'w7') 185 | variable_summaries(w7) 186 | o = tf.matmul(h5, w7) 187 | return o 188 | 189 | 190 | def emb_transform(X): 191 | with tf.variable_scope('encoding'): 192 | wf = init_weights([bin_vec_dim, embedding_dim], 'wf') 193 | bf = init_bias([embedding_dim], 'bf') 194 | emb = tf.nn.bias_add(tf.matmul(X, wf), bf) 195 | emb = tf.nn.elu(emb) 196 | return emb 197 | 198 | 199 | def from_sparse_arr(sparse_arr): 200 | mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32) 201 | for (i, j, k) in sparse_arr: 202 | mat[i, j, k] = 1 203 | return mat 204 | 205 | 206 | def from_sparse_arrs(sparse_arrs): 207 | mats = [] 208 | for sparse_arr in sparse_arrs: 209 | mats.append(from_sparse_arr(sparse_arr)) 210 | mats = np.array(mats, dtype=np.float32) 211 | return mats 212 | 213 | 214 | def train(): 215 | global reg_term 216 | with tf.name_scope('input'): 217 | X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 218 | X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 219 | Y = tf.placeholder(tf.float32, [None, 2]) 220 | dropout = tf.placeholder(tf.float32) 221 | phase = tf.placeholder(tf.bool, name='phase') 222 | 223 | py_x = classification(X_left, X_right, dropout, phase) 224 | cost = tf.reduce_mean( 225 | tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y)) 226 | tf.summary.scalar('cost', cost) 227 | cost = tf.reduce_mean(cost + beta * reg_term) 228 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 229 | with tf.control_dependencies(update_ops): 230 | train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost) 231 | predict_op = tf.argmax(py_x, 1) 232 | 233 | train_X_left, train_X_right, train_Y, test_X_left, test_X_right, test_Y = graph_mat_data.load_googlejam_data_newencoding( 234 | neg_ratio=1.3, pos_ratio=1.0) 235 | t_beg = time.clock() 236 | with tf.Session() as sess: 237 | merged = tf.summary.merge_all() 238 | train_writer = tf.summary.FileWriter(logdir, 239 | sess.graph) 240 | tf.global_variables_initializer().run() 241 | saver = tf.train.Saver() 242 | 243 | for epoch in xrange(4): 244 | dense_test_X_left = from_sparse_arrs(test_X_left[0:test_size]) 245 | dense_test_X_right = from_sparse_arrs(test_X_right[0:test_size]) 246 | iter = 0 247 | for start, end in zip( 248 | range(0, np.shape(train_X_left)[0], batch_size), 249 | range(batch_size, np.shape(train_X_left)[0] + 1, 250 | batch_size)): 251 | dense_train_X_left = from_sparse_arrs(train_X_left[start:end]) 252 | dense_train_X_right = 
from_sparse_arrs(train_X_right[start:end]) 253 | summary, _ = sess.run([merged, train_op], 254 | feed_dict={X_left: dense_train_X_left, 255 | X_right: dense_train_X_right, 256 | Y: train_Y[start:end], 257 | dropout: keep_prob, phase: 1}) 258 | train_writer.add_summary(summary, iter) 259 | print('epoch %d, iteration %d\n' % (epoch, iter)) 260 | iter += 1 261 | 262 | predict_Y = sess.run(predict_op, 263 | feed_dict={X_left: dense_test_X_left, 264 | X_right: dense_test_X_right, 265 | dropout: 1.0, 266 | phase: 0}) # no dropout 267 | print( 268 | epoch, np.mean(np.argmax(test_Y[:test_size], axis=1) == predict_Y)) 269 | saver.save(sess=sess, 270 | save_path='models/model4_' + str(epoch) + '.ckpt') 271 | 272 | saver.save(sess, "models/model4.ckpt") 273 | print "model saved." 274 | t_end = time.clock() 275 | print('Time cost: %.2f' % (t_end - t_beg)) 276 | 277 | 278 | def train_10_fold_balanced(): 279 | global reg_term 280 | with tf.name_scope('input'): 281 | X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 282 | X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 283 | Y = tf.placeholder(tf.float32, [None, 2]) 284 | dropout = tf.placeholder(tf.float32) 285 | phase = tf.placeholder(tf.bool, name='phase') 286 | sample_weights = tf.placeholder(tf.float32, [batch_size]) 287 | 288 | py_x = classification(X_left, X_right, dropout, phase) 289 | cost = tf.reduce_mean( 290 | tf.losses.softmax_cross_entropy(logits=py_x, onehot_labels=Y, 291 | weights=sample_weights)) 292 | tf.summary.scalar('cost', cost) 293 | cost = tf.reduce_mean(cost + beta * reg_term) 294 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 295 | with tf.control_dependencies(update_ops): 296 | train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize( 297 | cost) 298 | predict_op = tf.argmax(py_x, 1) 299 | 300 | skf = StratifiedKFold(n_splits=10) 301 | file_path = "../dataset/g4_128.npy" 302 | dataset = np.load(open(file_path, 'r')) 303 | X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int) 304 | # shuffle 305 | indices = np.random.permutation(X.shape[0]) 306 | X = X[indices] 307 | y = y[indices] 308 | fold_index = 0 309 | avg_accuracy = 0. 310 | avg_recall = 0. 311 | avg_precision = 0. 312 | avg_f1_score = 0. 
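    # The 10-fold loop below appends per-fold metrics to
    # result/10_fold_balanced.txt and saves each fold's checkpoint under
    # 10_fold_balanced/<fold_index>/; note that the open() call below assumes
    # the result/ directory already exists.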
313 | fout = open('result/10_fold_balanced.txt', 'w') 314 | if os.path.exists('result') is not True: 315 | os.mkdir("result") 316 | if os.path.exists("10_fold_balanced") is not True: 317 | os.mkdir("10_fold_balanced") 318 | for train_idx, test_idx in skf.split(X, y): 319 | print ('*' * 40 + str(fold_index) + '*' * 40) 320 | fold_path = os.path.join("10_fold_balanced", str(fold_index)) 321 | if os.path.exists(fold_path) is not True: 322 | os.mkdir(fold_path) 323 | X_train, X_test = X[train_idx], X[test_idx] 324 | y_train, y_test = y[train_idx], y[test_idx] 325 | train_X_left, train_X_right, train_Y = \ 326 | graph_mat_data.make_pairs_10_fold(X_train, y_train, neg_ratio=10.0, 327 | pos_ratio=1.0, add_all_neg=True) 328 | test_X_left, test_X_right, test_Y = \ 329 | graph_mat_data.make_pairs_10_fold(X_test, y_test, neg_ratio=1.0, 330 | pos_ratio=1.0, add_all_neg=True) 331 | 332 | # compute the class weights 333 | classes_numbers = np.bincount(np.argmax(train_Y, axis=1)) 334 | classes_weights = np.array([classes_numbers[1] * 2.0 / 335 | (classes_numbers[0] + classes_numbers[1]), 336 | classes_numbers[0] * 1.0 / 337 | (classes_numbers[0] + classes_numbers[1])], 338 | dtype=np.float32) 339 | classes_weights = np.reshape(classes_weights, newshape=[2,1]) 340 | 341 | t_beg = time.clock() 342 | # tf.reset_default_graph() # reset the model 343 | with tf.Session() as sess: 344 | sess.run(tf.global_variables_initializer()) 345 | sess.run(tf.local_variables_initializer()) 346 | merged = tf.summary.merge_all() 347 | train_writer = tf.summary.FileWriter( 348 | logdir, sess.graph) 349 | saver = tf.train.Saver(max_to_keep=3) 350 | step = 0 351 | for epoch in xrange(4): 352 | # re-shuffle for each epoch 353 | indices = np.random.permutation(train_X_left.shape[0]) 354 | train_X_left = train_X_left[indices] 355 | train_X_right = train_X_right[indices] 356 | train_Y = train_Y[indices] 357 | # for small test 358 | dense_test_X_left = from_sparse_arrs(test_X_left[0:test_size]) 359 | dense_test_X_right = from_sparse_arrs(test_X_right[0:test_size]) 360 | 361 | for start, end in zip( 362 | range(0, np.shape(train_X_left)[0], batch_size), 363 | range(batch_size, np.shape(train_X_left)[0] + 1, 364 | batch_size)): 365 | dense_train_X_left = from_sparse_arrs( 366 | train_X_left[start:end]) 367 | dense_train_X_right = from_sparse_arrs( 368 | train_X_right[start:end]) 369 | batch_samples_weights = np.matmul(train_Y[start:end], 370 | classes_weights) 371 | batch_samples_weights = np.reshape(batch_samples_weights, 372 | newshape=[batch_size]) 373 | _ = sess.run([train_op], 374 | feed_dict={X_left: dense_train_X_left, 375 | X_right: dense_train_X_right, 376 | Y: train_Y[start:end], 377 | sample_weights: 378 | batch_samples_weights, 379 | dropout: keep_prob, 380 | phase: 1}) 381 | print('epoch %d, iteration %d\n' % (epoch, step)) 382 | step += 1 383 | if step % 100 == 0 and step != 0: 384 | batch_samples_weights = np.matmul(test_Y[:test_size], 385 | classes_weights) 386 | batch_samples_weights = np.reshape( 387 | batch_samples_weights, 388 | newshape=[test_size]) 389 | predict_Y, summary = sess.run([predict_op, merged], 390 | feed_dict={ 391 | X_left: dense_test_X_left, 392 | X_right: dense_test_X_right, 393 | Y: test_Y[:test_size], 394 | sample_weights:batch_samples_weights, 395 | dropout: 1.0, 396 | phase: 0}) # no dropout 397 | train_writer.add_summary(summary, step) 398 | print(epoch, np.mean( 399 | np.argmax(test_Y[:test_size], axis=1) == predict_Y)) 400 | saver.save(sess, os.path.join(fold_path, 'mode.ckpt')) 401 | print 
"model saved." 402 | t_end = time.clock() 403 | print('Time cost: %.2f' % (t_end - t_beg)) 404 | 405 | # validation 406 | overall_accuracy = 0. 407 | overall_predict_Y = [] 408 | iter = 0 409 | for start, end in zip( 410 | range(0, np.shape(test_X_left)[0], batch_size), 411 | range(batch_size, np.shape(test_X_left)[0] + 1, 412 | batch_size)): 413 | dense_test_X_left = from_sparse_arrs(test_X_left[start:end]) 414 | dense_test_X_right = from_sparse_arrs(test_X_right[start:end]) 415 | predict_Y = sess.run(predict_op, 416 | feed_dict={X_left: dense_test_X_left, 417 | X_right: dense_test_X_right, 418 | dropout: 1.0, 419 | phase: 0}) # no dropout 420 | overall_predict_Y.extend(predict_Y.tolist()) 421 | accuracy = np.mean( 422 | np.argmax(test_Y[start:end], axis=1) == predict_Y) 423 | iter += 1 424 | overall_accuracy += accuracy 425 | 426 | print('Overall accuracy: %.5f' % (overall_accuracy / iter)) 427 | t_end = time.clock() 428 | print('Time cost: %.2f' % (t_end - t_beg)) 429 | fout.write('*' * 80 + '\n') 430 | fout.write('Fold %d:\n' % (fold_index)) 431 | fout.write('Overall accuracy: %.5f\n' % (overall_accuracy / iter)) 432 | fout.write('Time cost: %.2f\n' % (t_end - t_beg)) 433 | recall, precision, f1_score = stat( 434 | np.argmax(test_Y[:len(overall_predict_Y)], axis=1), 435 | np.array(overall_predict_Y, dtype=np.int32), fout=fout) 436 | fout.flush() 437 | avg_accuracy += overall_accuracy / iter 438 | avg_recall += recall 439 | avg_precision += precision 440 | avg_f1_score += f1_score 441 | print('*' * 80) 442 | fold_index += 1 443 | avg_accuracy /= 10.0 444 | avg_precision /= 10.0 445 | avg_recall /= 10.0 446 | avg_f1_score /= 10.0 447 | print('Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 448 | 'score: %.4f' % ( 449 | avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 450 | fout.write('*' * 80 + '\n') 451 | fout.write( 452 | 'Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 453 | 'score: %.4f' % (avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 454 | fout.close() 455 | 456 | 457 | def stat(Y, predicted_Y, fout=None): 458 | real_positive_count = 0 459 | predict_positive_count = 0 460 | recall = 0 461 | precision = 0 462 | for i in xrange(Y.shape[0]): 463 | if Y[i] == 1: 464 | real_positive_count += 1 465 | if predicted_Y[i] == 1: 466 | recall += 1 467 | if predicted_Y[i] == 1: 468 | predict_positive_count += 1 469 | if Y[i] == 1: 470 | precision += 1 471 | retrieved_positive_count = recall 472 | recall /= real_positive_count * 1.0 473 | precision /= max(predict_positive_count * 1.0, 1.0) 474 | f1_score = 2 * recall * precision / max( 475 | recall + precision, 0.00001) 476 | print "Clone pairs: %d, non-clone pairs: %d " % ( 477 | real_positive_count, Y.shape[0] - real_positive_count) 478 | print "Recall: %f, precision: %f, f1 score: %f" % ( 479 | recall, precision, f1_score) 480 | print "Predicted_positive_count: %d, recall truly positive: %d, false positive: %d, missed true positive: %d" \ 481 | % (predict_positive_count, retrieved_positive_count, 482 | predict_positive_count - retrieved_positive_count, 483 | real_positive_count - retrieved_positive_count) 484 | if fout is not None: 485 | fout.write("Clone pairs: %d, non-clone pairs: %d\n" % ( 486 | real_positive_count, Y.shape[0] - real_positive_count)) 487 | fout.write("Recall: %.4f, precision: %.4f, f1 score: %.4f\n" % ( 488 | recall, precision, f1_score)) 489 | fout.write("Predicted_positive_count: %d, recall truly positive: %d, " 490 | "false positive: %d, missed true positive: 
%d\n" \ 491 | % (predict_positive_count, retrieved_positive_count, 492 | predict_positive_count - retrieved_positive_count, 493 | real_positive_count - retrieved_positive_count)) 494 | return recall, precision, f1_score 495 | 496 | 497 | def predict_on_full_dataset(): 498 | with tf.name_scope('input'): 499 | X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 500 | X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 501 | Y = tf.placeholder(tf.float32, [None, 2]) 502 | dropout = tf.placeholder(tf.float32) 503 | phase = tf.placeholder(tf.bool, name='phase') 504 | 505 | with tf.variable_scope('encoding'): 506 | h_op = model(X_left, dropout, phase) 507 | 508 | h_left = tf.placeholder(tf.float32, [None, 64]) 509 | h_right = tf.placeholder(tf.float32, [None, 64]) 510 | py_x = classification_predict(h_left, h_right, dropout, phase) 511 | predict_op = tf.argmax(py_x, 1) 512 | 513 | file_path = "../dataset/g4_128.npy" 514 | dataset = np.load(open(file_path, 'r')) 515 | X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int) 516 | 517 | t_beg = time.clock() 518 | saver = tf.train.Saver() 519 | sess = tf.InteractiveSession() 520 | saver.restore(sess, '10_fold_balanced/2/mode.ckpt') 521 | 522 | iter = 0 523 | X_reps = [] 524 | for start, end in zip(range(0, np.shape(X)[0], batch_size), \ 525 | range(batch_size, np.shape(X)[0] + 1, batch_size)): 526 | dense_X = from_sparse_arrs(X[start:end]) 527 | h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0, 528 | phase:0}) 529 | X_reps.extend(h_val.tolist()) 530 | dense_X = from_sparse_arrs(X[end:]) 531 | h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0, phase:0}) 532 | X_reps.extend(h_val.tolist()) 533 | test_X_left = [] 534 | test_X_right = [] 535 | test_Y = [] 536 | for i in xrange(y.shape[0]): 537 | for j in xrange(i+1, y.shape[0]): 538 | if y[i] == y[j]: 539 | test_X_left.append(X_reps[i]) 540 | test_X_right.append(X_reps[j]) 541 | test_Y.append([0, 1]) 542 | else: 543 | test_X_left.append(X_reps[i]) 544 | test_X_right.append(X_reps[j]) 545 | test_Y.append([1, 0]) 546 | test_X_left = np.array(test_X_left) 547 | test_X_right = np.array(test_X_right) 548 | test_Y = np.array(test_Y, dtype=np.float32) 549 | 550 | 551 | overall_predict_Y = [] 552 | for start, end in zip(range(0, np.shape(test_X_left)[0], batch_size), 553 | range(batch_size, np.shape(test_X_left)[0] + 1, 554 | batch_size)): 555 | predict_Y = sess.run(predict_op, 556 | feed_dict={h_left: test_X_left[start:end], 557 | h_right: test_X_right[start:end], 558 | dropout: 1.0, phase: 0}) # no dropout 559 | overall_predict_Y.extend(predict_Y.tolist()) 560 | iter += 1 561 | 562 | stat(np.argmax(test_Y[:end], axis=1), 563 | np.array(overall_predict_Y, dtype=np.int32)) 564 | 565 | 566 | if __name__ == '__main__': 567 | train_10_fold_balanced() 568 | st = time.time() 569 | predict_on_full_dataset() 570 | print "Predict time on the full dataset: ", time.time() - st -------------------------------------------------------------------------------- /dcsim/classification_bigbench_keras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function, division 4 | 5 | import tensorflow as tf 6 | from keras.models import Model 7 | from keras.layers import Dense, Flatten, Dropout, Activation, Input, Lambda 8 | from keras.layers.normalization import BatchNormalization 9 | from keras.layers.pooling import GlobalAveragePooling1D 10 | from keras.utils import 
np_utils, Sequence 11 | from keras.utils.vis_utils import plot_model 12 | import keras as K 13 | import numpy as np 14 | import pandas as pd 15 | import os 16 | import time 17 | 18 | import matplotlib 19 | from matplotlib.ticker import NullFormatter 20 | from sklearn.decomposition import PCA 21 | from sklearn.metrics import precision_recall_fscore_support, accuracy_score 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.utils import class_weight, shuffle 24 | import matplotlib.pyplot as plt 25 | from mpl_toolkits.mplot3d import Axes3D 26 | 27 | # matplotlib.rcParams['font.family']='' 28 | matplotlib.rcParams['font.weight'] = 'bold' 29 | 30 | import graph_mat_data 31 | import preprocessing_bigbench 32 | 33 | bin_vec_dim = 88 34 | embedding_dim = 6 35 | dim = 128 36 | keep_prob = 0.6 37 | 38 | batch_size = 256 39 | test_size = 256 40 | 41 | 42 | # disable tensorflow debugging information 43 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 44 | 45 | logdir = '/tmp/logs' 46 | 47 | kernel_init = K.initializers.VarianceScaling(scale=1.0, mode='fan_avg', 48 | distribution='uniform') 49 | bias_init = K.initializers.Constant(value=0.01) 50 | 51 | 52 | def stat_by_type(y_true, y_pred, ts, fout=None): 53 | print('*' * 40 + " Performance by Type " + '*' * 40) 54 | # T1 55 | indices = np.where(ts==0) 56 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 57 | precision, recall, fscore, _ = \ 58 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 59 | average='binary') 60 | print("T1: accuracy: %.4f, recall: %.4f, " 61 | "precision: %.4f, f1 score: %.4f\n" % ( 62 | accuracy, recall, precision, fscore)) 63 | if fout is not None: 64 | fout.write("T1: accuracy: %.4f, recall: %.4f, " 65 | "precision: %.4f, f1 score: %.4f\n" % ( 66 | accuracy, recall, precision, fscore)) 67 | 68 | #T2 69 | indices = np.where(ts == 1) 70 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 71 | precision, recall, fscore, _ = \ 72 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 73 | average='binary') 74 | print("T2: accuracy: %.4f, recall: %.4f, " 75 | "precision: %.4f, f1 score: %.4f\n" % ( 76 | accuracy, recall, precision, fscore)) 77 | if fout is not None: 78 | fout.write("T2: accuracy: %.4f, recall: %.4f, " 79 | "precision: %.4f, f1 score: %.4f\n" % ( 80 | accuracy, recall, precision, fscore)) 81 | 82 | # ST3 83 | indices = np.where(ts == 2) 84 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 85 | precision, recall, fscore, _ = \ 86 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 87 | average='binary') 88 | print("ST3: accuracy: %.4f, recall: %.4f, " 89 | "precision: %.4f, f1 score: %.4f\n" % ( 90 | accuracy, recall, precision, fscore)) 91 | if fout is not None: 92 | fout.write("ST3: accuracy: %.4f, recall: %.4f, " 93 | "precision: %.4f, f1 score: %.4f\n" % ( 94 | accuracy, recall, precision, fscore)) 95 | 96 | #MT3 97 | indices = np.where(ts == 3) 98 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 99 | precision, recall, fscore, _ = \ 100 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 101 | average='binary') 102 | print("MT3: accuracy: %.4f, recall: %.4f, " 103 | "precision: %.4f, f1 score: %.4f\n" % ( 104 | accuracy, recall, precision, fscore)) 105 | if fout is not None: 106 | fout.write("MT3: accuracy: %.4f, recall: %.4f, " 107 | "precision: %.4f, f1 score: %.4f\n" % ( 108 | accuracy, recall, precision, fscore)) 109 | 110 | indices = np.where(ts == 4) 111 | accuracy = 
accuracy_score(y_true[indices], y_pred[indices]) 112 | precision, recall, fscore, _ = \ 113 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 114 | average='binary') 115 | print("WT3/T4: accuracy: %.4f, recall: %.4f, " 116 | "precision: %.4f, f1 score: %.4f\n" % ( 117 | accuracy, recall, precision, fscore)) 118 | if fout is not None: 119 | fout.write("WT3/T4: accuracy: %.4f, recall: %.4f, " 120 | "precision: %.4f, f1 score: %.4f\n" % ( 121 | accuracy, recall, precision, fscore)) 122 | 123 | def from_sparse_arr(sparse_arr): 124 | mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32) 125 | for (i, j, k) in sparse_arr: 126 | mat[i, j, k] = 1 127 | return mat 128 | 129 | def from_sparse_arrs(sparse_arrs): 130 | mats = [] 131 | for sparse_arr in sparse_arrs: 132 | mats.append(from_sparse_arr(sparse_arr)) 133 | mats = np.array(mats, dtype=np.float32) 134 | return mats 135 | 136 | def fit_generator(Xl, Xr, Y): 137 | ''' 138 | Best set worker=1, use_multiprocessing=False 139 | :param Xl: 140 | :param Xr: 141 | :param Y: 142 | :return: 143 | ''' 144 | while True: 145 | Xl, Xr, Y = shuffle(Xl, Xr, Y) 146 | batch_Xl = [] 147 | batch_Xr = [] 148 | batch_y = [] 149 | count = 0 150 | for (xl, xr, y) in zip(Xl, Xr, Y): 151 | batch_Xl.append(from_sparse_arr(xl)) 152 | batch_Xr.append(from_sparse_arr(xr)) 153 | batch_y.append(y) 154 | count += 1 155 | if len(batch_y) == batch_size or count == np.shape(Y)[0]: 156 | yield ([np.array(batch_Xl), np.array(batch_Xr)], 157 | np.expand_dims(np.array(batch_y, dtype=np.float32), 158 | axis=1)) 159 | batch_Xl = [] 160 | batch_Xr = [] 161 | batch_y = [] 162 | 163 | class SequenceSamples(Sequence): 164 | def __init__(self, Xl, Xr, Y, batch_size): 165 | self.Xl, self.Xr, self.Y = Xl, Xr, Y 166 | self.batch_size = batch_size 167 | 168 | def __len__(self): 169 | return np.ceil(np.shape(self.Y)[0] / batch_size) 170 | 171 | def __getitem__(self, item): 172 | batch_Xl = from_sparse_arrs(self.Xl[item * self.batch_size:(item + 1) * self.batch_size]) 173 | batch_Xr = from_sparse_arrs(self.Xr[item * self.batch_size:(item + 1) * self.batch_size]) 174 | # Y shouldn't be (256,), it should has the same shape as the model's 175 | # output 176 | batch_Y = self.Y[item * self.batch_size:(item+1)*self.batch_size]\ 177 | .reshape(batch_size, 1) 178 | print("Batch size: ", batch_Xl.shape[0], batch_Xr.shape[0], 179 | batch_Y.shape[0]) 180 | return ([batch_Xl, batch_Xr], batch_Y) 181 | 182 | 183 | def feed_forward(x): 184 | x = Lambda(lambda input: K.backend.reshape(input, (-1, bin_vec_dim)), 185 | batch_input_shape=K.backend.get_variable_shape(x))(x) 186 | x = Dense(embedding_dim, 187 | kernel_initializer=kernel_init, 188 | bias_initializer=bias_init)(x) 189 | x = BatchNormalization()(x) 190 | x = Activation(activation='relu')(x) 191 | x = Lambda( 192 | lambda input: K.backend.reshape(input, (-1, dim * embedding_dim)))(x) 193 | x = Dense(256, kernel_initializer=kernel_init, 194 | bias_initializer=bias_init)(x) 195 | x = BatchNormalization()(x) 196 | x = Activation(activation='relu')(x) 197 | x = Dropout(keep_prob)(x) 198 | 199 | x = Dense(64, 200 | kernel_initializer=kernel_init, 201 | bias_initializer=bias_init)(x) 202 | x = BatchNormalization()(x) 203 | x = Activation(activation='relu')(x) 204 | x = Dropout(keep_prob)(x) 205 | x = Lambda(lambda input: K.backend.reshape(input, (-1, dim, 64)))(x) 206 | x = GlobalAveragePooling1D()(x) # (batch_size, 64) 207 | return x 208 | 209 | def classification(x1, x2): 210 | input = Input(shape=(dim, dim, bin_vec_dim)) 211 | # 
share layers 212 | feed_forward_model = Model(inputs=input, outputs=feed_forward(input)) 213 | x1 = feed_forward_model(x1) 214 | x2 = feed_forward_model(x2) 215 | concat_input = Input(shape=(128,)) 216 | # share layers 217 | merge_model = Model(inputs=concat_input, 218 | outputs=Activation(activation='relu')( 219 | BatchNormalization()( 220 | Dense(32, kernel_initializer=kernel_init, 221 | bias_initializer=bias_init, 222 | input_shape=(128,))( 223 | concat_input)))) 224 | 225 | xc1 = K.layers.concatenate([x1, x2]) 226 | xc1 = merge_model(xc1) 227 | 228 | xc2 = K.layers.concatenate([x2, x1]) 229 | xc2 = merge_model(xc2) 230 | 231 | xc = K.layers.average([xc1, xc2]) 232 | 233 | x = Dense(1, use_bias=False, activation='sigmoid', 234 | kernel_initializer=kernel_init, 235 | batch_input_shape=K.backend.get_variable_shape(xc))(xc) 236 | 237 | return x 238 | 239 | def model_summary(): 240 | X_left = Input((dim, dim, bin_vec_dim)) 241 | X_right = Input((dim, dim, bin_vec_dim)) 242 | predictions = classification(X_left, X_right) 243 | model = Model(inputs=[X_left, X_right], outputs=predictions) 244 | model.compile(optimizer=K.optimizers.adam(lr=0.0005), 245 | loss=K.losses.binary_crossentropy, 246 | metrics=['accuracy']) 247 | 248 | # plot_model(model, to_file='./result/plot/whole_model.png', show_shapes=True) 249 | 250 | def train_10_fold_balanced(): 251 | 252 | skf = StratifiedKFold(n_splits=10) 253 | 254 | Xl, Xr, y, ts = preprocessing_bigbench.load_dataset() 255 | 256 | fold_index = 0 257 | avg_accuracy = 0. 258 | avg_recall = 0. 259 | avg_precision = 0. 260 | avg_f1_score = 0. 261 | fout = open('result/10_fold_balanced.txt', 'w') 262 | if os.path.exists('result') is not True: 263 | os.mkdir("result") 264 | if os.path.exists("10_fold_balanced") is not True: 265 | os.mkdir("10_fold_balanced") 266 | for train_idx, test_idx in skf.split(Xl, y): 267 | t_beg = time.clock() 268 | 269 | print ('*' * 40 + str(fold_index) + '*' * 40) 270 | fold_path = os.path.join("10_fold_balanced", str(fold_index)) 271 | if os.path.exists(fold_path) is not True: 272 | os.mkdir(fold_path) 273 | 274 | train_X_left = Xl[train_idx] 275 | train_X_right = Xr[train_idx] 276 | train_Y = y[train_idx] 277 | 278 | train_Yt = train_Y[train_Y == 0] 279 | train_Xlt = train_X_left[train_Y == 0] 280 | train_Xrt = train_X_right[train_Y == 0] 281 | train_Xl = train_X_left[train_Y == 1][:5 * train_Yt.shape[0]] 282 | train_Xr = train_X_right[train_Y == 1][:5 * train_Yt.shape[0]] 283 | train_y = train_Y[train_Y == 1][:5 * train_Yt.shape[0]] 284 | train_X_left = np.concatenate((train_Xlt, train_Xl), axis=0) 285 | train_X_right = np.concatenate((train_Xrt, train_Xr), axis=0) 286 | train_Y = np.concatenate((train_Yt, train_y), axis=0) 287 | train_X_left, train_X_right, train_Y = shuffle(train_X_left, 288 | train_X_right, train_Y) 289 | 290 | test_X_left = Xl[test_idx] 291 | test_X_right = Xr[test_idx] 292 | test_Y = y[test_idx] 293 | test_ts = ts[test_idx] 294 | 295 | validate_X_left = from_sparse_arrs(test_X_left[:256]) 296 | validate_X_right = from_sparse_arrs(test_X_right[:256]) 297 | validate_Y = test_Y[:256] 298 | 299 | X_left = Input(shape=(dim, dim, bin_vec_dim)) 300 | X_right = Input(shape=(dim, dim, bin_vec_dim)) 301 | 302 | predictions = classification(X_left, X_right) 303 | 304 | model = Model(inputs=[X_left, X_right], outputs=predictions) 305 | 306 | model.compile(optimizer=K.optimizers.adam(lr=0.001), 307 | loss=K.losses.binary_crossentropy, 308 | metrics=['accuracy']) 309 | samples_generator = 
SequenceSamples(train_X_left,train_X_right, 310 | train_Y, batch_size) 311 | model.fit_generator(fit_generator(train_X_left, train_X_right, train_Y), 312 | steps_per_epoch=np.ceil(train_Y.shape[0]/batch_size), 313 | epochs=1, verbose=1, 314 | workers=1, use_multiprocessing=False, 315 | validation_data=([validate_X_left, validate_X_right], validate_Y)) 316 | 317 | t_end = time.clock() 318 | print('Time cost: %.2f' % (t_end - t_beg)) 319 | 320 | model.save(filepath=os.path.join(fold_path, 'model.ckpt')) 321 | 322 | print("Evaluation:") 323 | 324 | test_samples_generator = SequenceSamples(test_X_left, test_X_right, 325 | test_Y, batch_size), 326 | y_pred = model.predict_generator(fit_generator(test_X_left, 327 | test_X_right, test_Y), 328 | steps=np.ceil(test_Y.shape[0] / batch_size), 329 | workers=1, use_multiprocessing=False) 330 | y_pred = np.round(y_pred) 331 | accuracy = accuracy_score(test_Y, y_pred) 332 | precision, recall, fscore, _ = precision_recall_fscore_support(test_Y, 333 | y_pred, average='binary') 334 | print("Fold index: %d, accuracy: %.4f, recall: %.4f, " 335 | "precision: %.4f, f1 score: %.4f\n" % ( 336 | fold_index, accuracy, recall, precision, fscore)) 337 | fout.write('*' * 80 + '\n') 338 | fout.write('Fold %d:\n' % (fold_index)) 339 | fout.write('Time cost: %.2f\n' % (t_end - t_beg)) 340 | fout.write("Fold index: %d, accuracy: %.4f, recall: %.4f, " 341 | "precision: %.4f, f1 score: %.4f\n" % ( 342 | fold_index, accuracy, recall, precision, fscore)) 343 | stat_by_type(test_Y, y_pred, test_ts, fout) 344 | fout.flush() 345 | avg_accuracy += accuracy 346 | avg_precision += precision 347 | avg_recall += recall 348 | avg_f1_score += fscore 349 | 350 | avg_accuracy /= 10.0 351 | avg_precision /= 10.0 352 | avg_recall /= 10.0 353 | avg_f1_score /= 10.0 354 | print('Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 355 | 'score: %.4f' % ( 356 | avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 357 | fout.write('*' * 80 + '\n') 358 | fout.write( 359 | 'Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 360 | 'score: %.4f' % (avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 361 | fout.close() 362 | 363 | def train_on_selected_id(): 364 | t_beg = time.clock() 365 | 366 | Xl_selected, Xr_selected, y_selected, ts_selected, Xl, Xr, y, ts = preprocessing_bigbench.load_train_test(id=4) 367 | 368 | train_X_left = Xl_selected 369 | train_X_right = Xr_selected 370 | train_Y = y_selected 371 | 372 | train_Yt = train_Y[train_Y == 0] 373 | train_Xlt = train_X_left[train_Y == 0] 374 | train_Xrt = train_X_right[train_Y == 0] 375 | train_Xl = train_X_left[train_Y == 1][:5 * train_Yt.shape[0]] 376 | train_Xr = train_X_right[train_Y == 1][:5 * train_Yt.shape[0]] 377 | train_y = train_Y[train_Y == 1][:5 * train_Yt.shape[0]] 378 | train_X_left = np.concatenate((train_Xlt, train_Xl), axis=0) 379 | train_X_right = np.concatenate((train_Xrt, train_Xr), axis=0) 380 | train_Y = np.concatenate((train_Yt, train_y), axis=0) 381 | train_X_left, train_X_right, train_Y = shuffle(train_X_left, 382 | train_X_right, train_Y) 383 | print("Training data size: ", train_Y.shape[0]) 384 | 385 | test_X_left = Xl 386 | test_X_right = Xr 387 | test_Y = y 388 | test_ts = ts 389 | 390 | validate_X_left = from_sparse_arrs(test_X_left[:256]) 391 | validate_X_right = from_sparse_arrs(test_X_right[:256]) 392 | validate_Y = test_Y[:256] 393 | 394 | X_left = Input(shape=(dim, dim, bin_vec_dim)) 395 | X_right = Input(shape=(dim, dim, bin_vec_dim)) 396 | 397 | predictions = 
classification(X_left, X_right)

    model = Model(inputs=[X_left, X_right], outputs=predictions)

    model.compile(optimizer=K.optimizers.adam(lr=0.001),
                  loss=K.losses.binary_crossentropy,
                  metrics=['accuracy'])
    model.fit_generator(fit_generator(train_X_left, train_X_right, train_Y),
                        steps_per_epoch=np.ceil(train_Y.shape[0] / batch_size),
                        epochs=1, verbose=1,
                        workers=1, use_multiprocessing=False,
                        validation_data=(
                            [validate_X_left, validate_X_right], validate_Y))

    t_end = time.clock()
    print('Time cost: %.2f' % (t_end - t_beg))

    model.save(filepath=os.path.join('./model', 'model_id4.ckpt'))

    print("Evaluation:")

    y_pred = model.predict_generator(fit_generator(test_X_left,
                                                   test_X_right, test_Y),
                                     steps=np.ceil(test_Y.shape[0] /
                                                   batch_size),
                                     workers=1, use_multiprocessing=False)
    y_pred = np.round(y_pred)
    accuracy = accuracy_score(test_Y, y_pred)
    precision, recall, fscore, _ = precision_recall_fscore_support(test_Y,
                                                                   y_pred,
                                                                   average='binary')
    print("accuracy: %.4f, recall: %.4f, "
          "precision: %.4f, f1 score: %.4f\n" % (
              accuracy, recall, precision, fscore))

    stat_by_type(test_Y, y_pred, test_ts)


if __name__ == '__main__':
    # model_summary()
    beg = time.time()
    train_10_fold_balanced()
    st = time.time()
    print("Total time: ", st - beg)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/artifacts/SourceCodeSimilarity_jar.xml:
--------------------------------------------------------------------------------
$PROJECT_DIR$/out/artifacts/SourceCodeSimilarity_jar
(remaining XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/workspace.xml:
--------------------------------------------------------------------------------
data_benchmark_fix
exclusionsFileName
(remaining XML markup not preserved in this dump)