├── .gitignore
├── LICENSE
├── README.md
├── dataset
│   ├── README.md
│   ├── g4_128.npy
│   └── googlejam4.tar.gz
└── dcsim
    ├── .gitignore
    ├── .idea
    │   ├── codeStyleSettings.xml
    │   ├── codeStyles
    │   │   └── codeStyleConfig.xml
    │   ├── inspectionProfiles
    │   │   └── Project_Default.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── other.xml
    │   ├── preferred-vcs.xml
    │   ├── samples.iml
    │   ├── vcs.xml
    │   ├── workspace (zg的MacBook Pro's conflicted copy 2017-08-28).xml
    │   └── workspace.xml
    ├── README.md
    ├── classification.py
    ├── classification_bigbench_keras.py
    ├── encoding
    │   ├── .idea
    │   │   ├── artifacts
    │   │   │   └── SourceCodeSimilarity_jar.xml
    │   │   ├── misc.xml
    │   │   ├── modules.xml
    │   │   └── workspace.xml
    │   ├── SourceCodeSimilarity.iml
    │   ├── bin
    │   │   └── META-INF
    │   │       └── MANIFEST.MF
    │   ├── encoding.jar
    │   └── src
    │       ├── DefaultExclusions.txt
    │       ├── EclipseDefaultInclusions.txt
    │       ├── EmptyExclusion.txt
    │       └── Encoder.java
    ├── graph_mat_data.py
    ├── preprocessing.py
    ├── preprocessing_bigbench.py
    ├── sda_base.py
    └── sda_unsup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

*.ckpt

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 zhaogang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DeepSim

This project is a prototype implementation of DeepSim, a deep learning-based approach to measuring code functional similarity. If you find the tool useful in your work, please cite our FSE 2018 paper:

**"DeepSim: Deep Learning Code Functional Similarity"**. In Proceedings of the 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE), 2018.

## Setup

*We haven't created a setup script yet. For now, please install the dependency packages manually.*

We tested DeepSim on Ubuntu 16.04.3 64-bit. Our hardware environment includes:
- Intel i7 6700K, 4.0GHz
- NVIDIA GTX 1080, 8GB
- DDR4 3000MHz, 48GB

The dependency packages are listed below:
- Python 2.7
- TensorFlow 1.3 (higher versions should be fine)
- Keras 2.x
- All the other packages required by the above packages

These can be easily installed through [Conda](https://anaconda.org/), following the instructions on [TensorFlow](https://www.tensorflow.org). For the encoding part, we have already included the WALA jar package.

## How to Run It

To run the tool, you first need to generate encoded matrices from Java bytecode files. We provide the executable jar file `encoding.jar` in the `encoding` folder. Once you have compiled your Java source code files into a jar package, run the command below to generate the matrices:

```bash
./encoding.jar your-bytecode-jar-path.jar
```

The generated matrices will be stored in the folder *data* under your current working directory. (We have already tested this tool on a set of Java projects and it works well. If you find any crashes/errors, please post an issue here.)

*NOTE: the default matrix size is 128. If you want to change this, just change the value of the variable `fixedSize` in the `Encoder.java` source code file.*

If you just want to give the tool a quick try, we also provide the matrices we generated for the GCJ dataset used in our paper. They are in the `dataset` folder. In particular, the matrices are stored using numpy's dump function, so you can read them directly with the code below:

```Python
import numpy as np

file_path = "path-to-the-datafile/g4_128.npy"
dataset = np.load(open(file_path, 'rb'))
X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int)
```

Each sample here is stored in a sparse format: for each 88-dimensional feature vector, we only store the indices at which the value is 1. If you want to visualize a sample (like the one in our paper), just convert it back to a dense matrix; a minimal sketch of this conversion is shown below.

After getting the matrices, you can run `classification.py` to train the model. By default it runs a 10-fold cross-validation experiment.
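As mentioned above, here is a minimal sketch of that sparse-to-dense conversion (assuming `numpy`; it mirrors the `from_sparse_arr` helper in `dcsim/classification.py`, and `to_dense` is a hypothetical name used only for illustration):

```Python
import numpy as np

dim = 128         # matrix size (rows and columns)
bin_vec_dim = 88  # length of each binary feature vector

def to_dense(sparse_sample):
    # Each sparse sample is a list of (row, column, feature-index) triples
    # marking the positions whose value is 1.
    mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32)
    for (i, j, k) in sparse_sample:
        mat[i, j, k] = 1
    return mat
```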
*You may need to change some paths to your desired folders, since we haven't cleaned the code yet.* Feel free to tweak the hyperparameters (batch size, learning rate, layer size, class weights, etc.).

In our environment, each run of the 10-fold cross-validation takes nearly 3.75 hours. If you run it on a weaker GPU, expect a longer time to finish. If you use a larger batch size, make sure that you have enough memory, since each dense sample contains 128*128*88 elements. If the results you get differ from those reported in the paper, change the hyperparameters to the values presented in the paper (if you are running on the GCJ dataset), or write a simple script to find the best parameter setting for your dataset.

Running the other two baseline models is similar.

## NOTE

We are working on a set of improved models, some of which try to address
the limitations of this work. We hope to finish and release them soon.

In addition, we will probably include a simple web project in this repo for collecting larger
and more comprehensive training samples (though we will not host it on our server).

--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
# dcsim_dataset
Dataset (projects collected from the Google Code Jam competition) for the DCSim project.

--------------------------------------------------------------------------------
/dataset/g4_128.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parasol-aser/deepsim/6134ac9593806121e7541d1c6c52f5533c38f728/dataset/g4_128.npy

--------------------------------------------------------------------------------
/dataset/googlejam4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parasol-aser/deepsim/6134ac9593806121e7541d1c6c52f5533c38f728/dataset/googlejam4.tar.gz

--------------------------------------------------------------------------------
/dcsim/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# output folder
out/
idea/
encoding/idea/

--------------------------------------------------------------------------------
/dcsim/.idea/codeStyleSettings.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/other.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/preferred-vcs.xml:
--------------------------------------------------------------------------------
ApexVCS
(remaining XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/samples.iml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/.idea/vcs.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/README.md:
--------------------------------------------------------------------------------
# dcsim
A deep-learning-powered model for measuring code similarity.
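A minimal usage sketch (illustrative only; it mirrors the `__main__` block of `classification.py` and assumes the GCJ matrices are available at `../dataset/g4_128.npy`, as the training code expects):

```Python
# Illustrative quick start: these are the same two steps that
# classification.py runs in its __main__ block.
import classification

# 10-fold cross-validation training; per-fold metrics are written to
# result/10_fold_balanced.txt and checkpoints to 10_fold_balanced/<fold>/.
classification.train_10_fold_balanced()

# Restore a saved fold's checkpoint and evaluate on all sample pairs.
classification.predict_on_full_dataset()
```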
--------------------------------------------------------------------------------
/dcsim/classification.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np
import pandas as pd
import os
import time

import matplotlib
from matplotlib.ticker import NullFormatter
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

sns.set(style='white')
# matplotlib.rcParams['font.family']=''
matplotlib.rcParams['font.weight'] = 'bold'

import graph_mat_data

bin_vec_dim = 88
embedding_dim = 6
dim = 128
keep_prob = 0.75

batch_size = 256
test_size = 256

beta = 0.00003
# beta = 0.00001  # for model with batch normalization
reg_term = None

# disable tensorflow debugging information
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

logdir = '/tmp/tf_logs'


def _to_tensor(x, dtype):
    """Convert the input `x` to a tensor of type `dtype`.
    # Arguments
        x: An object to be converted (numpy array, list, tensors).
        dtype: The destination type.
    # Returns
        A tensor.
    """
    x = tf.convert_to_tensor(x)
    if x.dtype != dtype:
        x = tf.cast(x, dtype)
    return x


def relu(x, alpha=0., max_value=None):
    """Rectified linear unit.
    With default values, it returns element-wise `max(x, 0)`.
    # Arguments
        x: A tensor or variable.
        alpha: A scalar, slope of negative section (default=`0.`).
        max_value: Saturation threshold.
    # Returns
        A tensor.
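    # Example
        An illustrative call (not part of the original docstring):
        `relu(x, alpha=0.1, max_value=6.)` acts as a leaky ReLU with slope
        0.1 on the negative side, saturated at 6 on the positive side.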
64 | """ 65 | if alpha != 0.: 66 | negative_part = tf.nn.relu(-x) 67 | x = tf.nn.relu(x) 68 | if max_value is not None: 69 | max_value = _to_tensor(max_value, x.dtype.base_dtype) 70 | zero = _to_tensor(0., x.dtype.base_dtype) 71 | x = tf.clip_by_value(x, zero, max_value) 72 | if alpha != 0.: 73 | alpha = _to_tensor(alpha, x.dtype.base_dtype) 74 | x -= alpha * negative_part 75 | return x 76 | 77 | 78 | def batch_act(h, act, phase, scope): 79 | with tf.variable_scope(scope): 80 | return act(h) 81 | 82 | 83 | def variable_summaries(var): 84 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 85 | with tf.name_scope('summaries'): 86 | mean = tf.reduce_mean(var) 87 | tf.summary.scalar('mean', mean) 88 | with tf.name_scope('stddev'): 89 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 90 | tf.summary.scalar('stddev', stddev) 91 | tf.summary.scalar('max', tf.reduce_max(var)) 92 | tf.summary.scalar('min', tf.reduce_min(var)) 93 | tf.summary.histogram('histogram', var) 94 | 95 | 96 | def init_weights(shape, name): 97 | return tf.get_variable(name=name, shape=shape, dtype=tf.float32, 98 | initializer=tf.contrib.layers.variance_scaling_initializer( 99 | factor=1.0, mode='FAN_AVG', uniform=True)) 100 | 101 | def init_bias(shape, name): 102 | if len(shape) > 1: 103 | raise Exception('Bias should be a vector.') 104 | return tf.get_variable(name=name, shape=shape, dtype=tf.float32, 105 | initializer=tf.constant_initializer( 106 | 0.01)) 107 | 108 | def model(X, dropout, phase): 109 | global reg_term 110 | num = tf.shape(X)[0] 111 | with tf.name_scope('emb_layer'): 112 | wf = init_weights([bin_vec_dim, embedding_dim], 'wf') 113 | reg_term = tf.nn.l2_loss(wf) 114 | variable_summaries(wf) 115 | bf = init_bias([embedding_dim], 'bf') 116 | variable_summaries(bf) 117 | X = tf.reshape(X, [num * dim * dim, bin_vec_dim]) 118 | h0 = tf.nn.bias_add(tf.matmul(X, wf), bf) 119 | h0 = batch_act(h0, phase=phase, act=tf.nn.elu, scope='emb_layer_bn') 120 | h0 = tf.reshape(h0, [num * dim, dim * embedding_dim]) 121 | h0 = tf.nn.dropout(h0, dropout) 122 | with tf.name_scope('row_fc_layer1'): 123 | wr1 = init_weights([embedding_dim * dim, 256], 'wr1') # 128 124 | reg_term += tf.nn.l2_loss(wr1) 125 | br1 = init_bias([256], 'br1') 126 | h1 = tf.nn.bias_add(tf.matmul(h0, wr1), br1) 127 | h1 = batch_act(h1, phase=phase, act=tf.nn.elu, scope='row_fc_layer1_bn') 128 | h1 = tf.nn.dropout(h1, dropout) 129 | with tf.name_scope('row_fc_layer2'): 130 | wr2 = init_weights([256, 64], 'wr2') # 32 131 | reg_term += tf.nn.l2_loss(wr2) 132 | br2 = init_bias([64], 'br2') 133 | h2 = tf.nn.bias_add(tf.matmul(h1, wr2), br2) 134 | h2 = batch_act(h2, phase=phase, act=tf.nn.elu, scope='row_fc_layer2_bn') 135 | h2 = tf.reshape(h2, [num, dim, 64]) # 32 136 | with tf.name_scope('avg_pooling'): 137 | h3 = tf.reduce_mean(h2, 1) 138 | return h3 139 | 140 | 141 | def classification(X1, X2, dropout, phase): 142 | global reg_term 143 | with tf.variable_scope('encoding') as scope: 144 | h31 = model(X1, dropout, phase) 145 | scope.reuse_variables() 146 | h32 = model(X2, dropout, phase) 147 | h41 = tf.concat(values=[h31, h32], axis=1) 148 | with tf.name_scope('fc_layer1_1'): 149 | w5 = init_weights([128, 32], 'w5') # 64 16 150 | reg_term += tf.nn.l2_loss(w5) 151 | b5 = init_bias([32], 'b5') 152 | h5_1 = tf.nn.bias_add(tf.matmul(h41, w5), b5) 153 | h5_1 = batch_act(h5_1, phase=phase, act=tf.nn.elu, 154 | scope='fc_layer1_1_bn') 155 | h42 = tf.concat(values=[h32, h31], axis=1) 156 | with tf.name_scope('fc_layer1_2'): 157 | h5_2 
= tf.nn.bias_add(tf.matmul(h42, w5), b5) 158 | h5_2 = batch_act(h5_2, phase=phase, act=tf.nn.elu, 159 | scope='fc_layer1_2_bn') 160 | h5 = (h5_1 + h5_2) / 2. 161 | with tf.name_scope('sm_layer'): 162 | w7 = init_weights([32, 2], 'w7') 163 | reg_term += tf.nn.l2_loss(w7) 164 | variable_summaries(w7) 165 | o = tf.matmul(h5, w7) 166 | return o 167 | 168 | 169 | def classification_predict(hl, hr, dropout, phase): 170 | h41 = tf.concat(values=[hl, hr], axis=1) 171 | with tf.name_scope('fc_layer1_1'): 172 | w5 = init_weights([128, 32], 'w5') # 64 16 173 | b5 = init_bias([32], 'b5') 174 | h5_1 = tf.nn.bias_add(tf.matmul(h41, w5), b5) 175 | h5_1 = batch_act(h5_1, phase=phase, act=tf.nn.elu, 176 | scope='fc_layer1_1_bn') 177 | h42 = tf.concat(values=[hr, hl], axis=1) 178 | with tf.name_scope('fc_layer1_2'): 179 | h5_2 = tf.nn.bias_add(tf.matmul(h42, w5), b5) 180 | h5_2 = batch_act(h5_2, phase=phase, act=tf.nn.elu, 181 | scope='fc_layer1_2_bn') 182 | h5 = (h5_1 + h5_2) / 2. 183 | with tf.name_scope('sm_layer'): 184 | w7 = init_weights([32, 2], 'w7') 185 | variable_summaries(w7) 186 | o = tf.matmul(h5, w7) 187 | return o 188 | 189 | 190 | def emb_transform(X): 191 | with tf.variable_scope('encoding'): 192 | wf = init_weights([bin_vec_dim, embedding_dim], 'wf') 193 | bf = init_bias([embedding_dim], 'bf') 194 | emb = tf.nn.bias_add(tf.matmul(X, wf), bf) 195 | emb = tf.nn.elu(emb) 196 | return emb 197 | 198 | 199 | def from_sparse_arr(sparse_arr): 200 | mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32) 201 | for (i, j, k) in sparse_arr: 202 | mat[i, j, k] = 1 203 | return mat 204 | 205 | 206 | def from_sparse_arrs(sparse_arrs): 207 | mats = [] 208 | for sparse_arr in sparse_arrs: 209 | mats.append(from_sparse_arr(sparse_arr)) 210 | mats = np.array(mats, dtype=np.float32) 211 | return mats 212 | 213 | 214 | def train(): 215 | global reg_term 216 | with tf.name_scope('input'): 217 | X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 218 | X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 219 | Y = tf.placeholder(tf.float32, [None, 2]) 220 | dropout = tf.placeholder(tf.float32) 221 | phase = tf.placeholder(tf.bool, name='phase') 222 | 223 | py_x = classification(X_left, X_right, dropout, phase) 224 | cost = tf.reduce_mean( 225 | tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y)) 226 | tf.summary.scalar('cost', cost) 227 | cost = tf.reduce_mean(cost + beta * reg_term) 228 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 229 | with tf.control_dependencies(update_ops): 230 | train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost) 231 | predict_op = tf.argmax(py_x, 1) 232 | 233 | train_X_left, train_X_right, train_Y, test_X_left, test_X_right, test_Y = graph_mat_data.load_googlejam_data_newencoding( 234 | neg_ratio=1.3, pos_ratio=1.0) 235 | t_beg = time.clock() 236 | with tf.Session() as sess: 237 | merged = tf.summary.merge_all() 238 | train_writer = tf.summary.FileWriter(logdir, 239 | sess.graph) 240 | tf.global_variables_initializer().run() 241 | saver = tf.train.Saver() 242 | 243 | for epoch in xrange(4): 244 | dense_test_X_left = from_sparse_arrs(test_X_left[0:test_size]) 245 | dense_test_X_right = from_sparse_arrs(test_X_right[0:test_size]) 246 | iter = 0 247 | for start, end in zip( 248 | range(0, np.shape(train_X_left)[0], batch_size), 249 | range(batch_size, np.shape(train_X_left)[0] + 1, 250 | batch_size)): 251 | dense_train_X_left = from_sparse_arrs(train_X_left[start:end]) 252 | dense_train_X_right = 
from_sparse_arrs(train_X_right[start:end]) 253 | summary, _ = sess.run([merged, train_op], 254 | feed_dict={X_left: dense_train_X_left, 255 | X_right: dense_train_X_right, 256 | Y: train_Y[start:end], 257 | dropout: keep_prob, phase: 1}) 258 | train_writer.add_summary(summary, iter) 259 | print('epoch %d, iteration %d\n' % (epoch, iter)) 260 | iter += 1 261 | 262 | predict_Y = sess.run(predict_op, 263 | feed_dict={X_left: dense_test_X_left, 264 | X_right: dense_test_X_right, 265 | dropout: 1.0, 266 | phase: 0}) # no dropout 267 | print( 268 | epoch, np.mean(np.argmax(test_Y[:test_size], axis=1) == predict_Y)) 269 | saver.save(sess=sess, 270 | save_path='models/model4_' + str(epoch) + '.ckpt') 271 | 272 | saver.save(sess, "models/model4.ckpt") 273 | print "model saved." 274 | t_end = time.clock() 275 | print('Time cost: %.2f' % (t_end - t_beg)) 276 | 277 | 278 | def train_10_fold_balanced(): 279 | global reg_term 280 | with tf.name_scope('input'): 281 | X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 282 | X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 283 | Y = tf.placeholder(tf.float32, [None, 2]) 284 | dropout = tf.placeholder(tf.float32) 285 | phase = tf.placeholder(tf.bool, name='phase') 286 | sample_weights = tf.placeholder(tf.float32, [batch_size]) 287 | 288 | py_x = classification(X_left, X_right, dropout, phase) 289 | cost = tf.reduce_mean( 290 | tf.losses.softmax_cross_entropy(logits=py_x, onehot_labels=Y, 291 | weights=sample_weights)) 292 | tf.summary.scalar('cost', cost) 293 | cost = tf.reduce_mean(cost + beta * reg_term) 294 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 295 | with tf.control_dependencies(update_ops): 296 | train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize( 297 | cost) 298 | predict_op = tf.argmax(py_x, 1) 299 | 300 | skf = StratifiedKFold(n_splits=10) 301 | file_path = "../dataset/g4_128.npy" 302 | dataset = np.load(open(file_path, 'r')) 303 | X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int) 304 | # shuffle 305 | indices = np.random.permutation(X.shape[0]) 306 | X = X[indices] 307 | y = y[indices] 308 | fold_index = 0 309 | avg_accuracy = 0. 310 | avg_recall = 0. 311 | avg_precision = 0. 312 | avg_f1_score = 0. 
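    # The 10-fold loop below appends per-fold metrics to
    # result/10_fold_balanced.txt and saves each fold's checkpoint under
    # 10_fold_balanced/<fold_index>/; note that the open() call below assumes
    # the result/ directory already exists.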
313 | fout = open('result/10_fold_balanced.txt', 'w') 314 | if os.path.exists('result') is not True: 315 | os.mkdir("result") 316 | if os.path.exists("10_fold_balanced") is not True: 317 | os.mkdir("10_fold_balanced") 318 | for train_idx, test_idx in skf.split(X, y): 319 | print ('*' * 40 + str(fold_index) + '*' * 40) 320 | fold_path = os.path.join("10_fold_balanced", str(fold_index)) 321 | if os.path.exists(fold_path) is not True: 322 | os.mkdir(fold_path) 323 | X_train, X_test = X[train_idx], X[test_idx] 324 | y_train, y_test = y[train_idx], y[test_idx] 325 | train_X_left, train_X_right, train_Y = \ 326 | graph_mat_data.make_pairs_10_fold(X_train, y_train, neg_ratio=10.0, 327 | pos_ratio=1.0, add_all_neg=True) 328 | test_X_left, test_X_right, test_Y = \ 329 | graph_mat_data.make_pairs_10_fold(X_test, y_test, neg_ratio=1.0, 330 | pos_ratio=1.0, add_all_neg=True) 331 | 332 | # compute the class weights 333 | classes_numbers = np.bincount(np.argmax(train_Y, axis=1)) 334 | classes_weights = np.array([classes_numbers[1] * 2.0 / 335 | (classes_numbers[0] + classes_numbers[1]), 336 | classes_numbers[0] * 1.0 / 337 | (classes_numbers[0] + classes_numbers[1])], 338 | dtype=np.float32) 339 | classes_weights = np.reshape(classes_weights, newshape=[2,1]) 340 | 341 | t_beg = time.clock() 342 | # tf.reset_default_graph() # reset the model 343 | with tf.Session() as sess: 344 | sess.run(tf.global_variables_initializer()) 345 | sess.run(tf.local_variables_initializer()) 346 | merged = tf.summary.merge_all() 347 | train_writer = tf.summary.FileWriter( 348 | logdir, sess.graph) 349 | saver = tf.train.Saver(max_to_keep=3) 350 | step = 0 351 | for epoch in xrange(4): 352 | # re-shuffle for each epoch 353 | indices = np.random.permutation(train_X_left.shape[0]) 354 | train_X_left = train_X_left[indices] 355 | train_X_right = train_X_right[indices] 356 | train_Y = train_Y[indices] 357 | # for small test 358 | dense_test_X_left = from_sparse_arrs(test_X_left[0:test_size]) 359 | dense_test_X_right = from_sparse_arrs(test_X_right[0:test_size]) 360 | 361 | for start, end in zip( 362 | range(0, np.shape(train_X_left)[0], batch_size), 363 | range(batch_size, np.shape(train_X_left)[0] + 1, 364 | batch_size)): 365 | dense_train_X_left = from_sparse_arrs( 366 | train_X_left[start:end]) 367 | dense_train_X_right = from_sparse_arrs( 368 | train_X_right[start:end]) 369 | batch_samples_weights = np.matmul(train_Y[start:end], 370 | classes_weights) 371 | batch_samples_weights = np.reshape(batch_samples_weights, 372 | newshape=[batch_size]) 373 | _ = sess.run([train_op], 374 | feed_dict={X_left: dense_train_X_left, 375 | X_right: dense_train_X_right, 376 | Y: train_Y[start:end], 377 | sample_weights: 378 | batch_samples_weights, 379 | dropout: keep_prob, 380 | phase: 1}) 381 | print('epoch %d, iteration %d\n' % (epoch, step)) 382 | step += 1 383 | if step % 100 == 0 and step != 0: 384 | batch_samples_weights = np.matmul(test_Y[:test_size], 385 | classes_weights) 386 | batch_samples_weights = np.reshape( 387 | batch_samples_weights, 388 | newshape=[test_size]) 389 | predict_Y, summary = sess.run([predict_op, merged], 390 | feed_dict={ 391 | X_left: dense_test_X_left, 392 | X_right: dense_test_X_right, 393 | Y: test_Y[:test_size], 394 | sample_weights:batch_samples_weights, 395 | dropout: 1.0, 396 | phase: 0}) # no dropout 397 | train_writer.add_summary(summary, step) 398 | print(epoch, np.mean( 399 | np.argmax(test_Y[:test_size], axis=1) == predict_Y)) 400 | saver.save(sess, os.path.join(fold_path, 'mode.ckpt')) 401 | print 
"model saved." 402 | t_end = time.clock() 403 | print('Time cost: %.2f' % (t_end - t_beg)) 404 | 405 | # validation 406 | overall_accuracy = 0. 407 | overall_predict_Y = [] 408 | iter = 0 409 | for start, end in zip( 410 | range(0, np.shape(test_X_left)[0], batch_size), 411 | range(batch_size, np.shape(test_X_left)[0] + 1, 412 | batch_size)): 413 | dense_test_X_left = from_sparse_arrs(test_X_left[start:end]) 414 | dense_test_X_right = from_sparse_arrs(test_X_right[start:end]) 415 | predict_Y = sess.run(predict_op, 416 | feed_dict={X_left: dense_test_X_left, 417 | X_right: dense_test_X_right, 418 | dropout: 1.0, 419 | phase: 0}) # no dropout 420 | overall_predict_Y.extend(predict_Y.tolist()) 421 | accuracy = np.mean( 422 | np.argmax(test_Y[start:end], axis=1) == predict_Y) 423 | iter += 1 424 | overall_accuracy += accuracy 425 | 426 | print('Overall accuracy: %.5f' % (overall_accuracy / iter)) 427 | t_end = time.clock() 428 | print('Time cost: %.2f' % (t_end - t_beg)) 429 | fout.write('*' * 80 + '\n') 430 | fout.write('Fold %d:\n' % (fold_index)) 431 | fout.write('Overall accuracy: %.5f\n' % (overall_accuracy / iter)) 432 | fout.write('Time cost: %.2f\n' % (t_end - t_beg)) 433 | recall, precision, f1_score = stat( 434 | np.argmax(test_Y[:len(overall_predict_Y)], axis=1), 435 | np.array(overall_predict_Y, dtype=np.int32), fout=fout) 436 | fout.flush() 437 | avg_accuracy += overall_accuracy / iter 438 | avg_recall += recall 439 | avg_precision += precision 440 | avg_f1_score += f1_score 441 | print('*' * 80) 442 | fold_index += 1 443 | avg_accuracy /= 10.0 444 | avg_precision /= 10.0 445 | avg_recall /= 10.0 446 | avg_f1_score /= 10.0 447 | print('Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 448 | 'score: %.4f' % ( 449 | avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 450 | fout.write('*' * 80 + '\n') 451 | fout.write( 452 | 'Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 453 | 'score: %.4f' % (avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 454 | fout.close() 455 | 456 | 457 | def stat(Y, predicted_Y, fout=None): 458 | real_positive_count = 0 459 | predict_positive_count = 0 460 | recall = 0 461 | precision = 0 462 | for i in xrange(Y.shape[0]): 463 | if Y[i] == 1: 464 | real_positive_count += 1 465 | if predicted_Y[i] == 1: 466 | recall += 1 467 | if predicted_Y[i] == 1: 468 | predict_positive_count += 1 469 | if Y[i] == 1: 470 | precision += 1 471 | retrieved_positive_count = recall 472 | recall /= real_positive_count * 1.0 473 | precision /= max(predict_positive_count * 1.0, 1.0) 474 | f1_score = 2 * recall * precision / max( 475 | recall + precision, 0.00001) 476 | print "Clone pairs: %d, non-clone pairs: %d " % ( 477 | real_positive_count, Y.shape[0] - real_positive_count) 478 | print "Recall: %f, precision: %f, f1 score: %f" % ( 479 | recall, precision, f1_score) 480 | print "Predicted_positive_count: %d, recall truly positive: %d, false positive: %d, missed true positive: %d" \ 481 | % (predict_positive_count, retrieved_positive_count, 482 | predict_positive_count - retrieved_positive_count, 483 | real_positive_count - retrieved_positive_count) 484 | if fout is not None: 485 | fout.write("Clone pairs: %d, non-clone pairs: %d\n" % ( 486 | real_positive_count, Y.shape[0] - real_positive_count)) 487 | fout.write("Recall: %.4f, precision: %.4f, f1 score: %.4f\n" % ( 488 | recall, precision, f1_score)) 489 | fout.write("Predicted_positive_count: %d, recall truly positive: %d, " 490 | "false positive: %d, missed true positive: 
%d\n" \ 491 | % (predict_positive_count, retrieved_positive_count, 492 | predict_positive_count - retrieved_positive_count, 493 | real_positive_count - retrieved_positive_count)) 494 | return recall, precision, f1_score 495 | 496 | 497 | def predict_on_full_dataset(): 498 | with tf.name_scope('input'): 499 | X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 500 | X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim]) 501 | Y = tf.placeholder(tf.float32, [None, 2]) 502 | dropout = tf.placeholder(tf.float32) 503 | phase = tf.placeholder(tf.bool, name='phase') 504 | 505 | with tf.variable_scope('encoding'): 506 | h_op = model(X_left, dropout, phase) 507 | 508 | h_left = tf.placeholder(tf.float32, [None, 64]) 509 | h_right = tf.placeholder(tf.float32, [None, 64]) 510 | py_x = classification_predict(h_left, h_right, dropout, phase) 511 | predict_op = tf.argmax(py_x, 1) 512 | 513 | file_path = "../dataset/g4_128.npy" 514 | dataset = np.load(open(file_path, 'r')) 515 | X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int) 516 | 517 | t_beg = time.clock() 518 | saver = tf.train.Saver() 519 | sess = tf.InteractiveSession() 520 | saver.restore(sess, '10_fold_balanced/2/mode.ckpt') 521 | 522 | iter = 0 523 | X_reps = [] 524 | for start, end in zip(range(0, np.shape(X)[0], batch_size), \ 525 | range(batch_size, np.shape(X)[0] + 1, batch_size)): 526 | dense_X = from_sparse_arrs(X[start:end]) 527 | h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0, 528 | phase:0}) 529 | X_reps.extend(h_val.tolist()) 530 | dense_X = from_sparse_arrs(X[end:]) 531 | h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0, phase:0}) 532 | X_reps.extend(h_val.tolist()) 533 | test_X_left = [] 534 | test_X_right = [] 535 | test_Y = [] 536 | for i in xrange(y.shape[0]): 537 | for j in xrange(i+1, y.shape[0]): 538 | if y[i] == y[j]: 539 | test_X_left.append(X_reps[i]) 540 | test_X_right.append(X_reps[j]) 541 | test_Y.append([0, 1]) 542 | else: 543 | test_X_left.append(X_reps[i]) 544 | test_X_right.append(X_reps[j]) 545 | test_Y.append([1, 0]) 546 | test_X_left = np.array(test_X_left) 547 | test_X_right = np.array(test_X_right) 548 | test_Y = np.array(test_Y, dtype=np.float32) 549 | 550 | 551 | overall_predict_Y = [] 552 | for start, end in zip(range(0, np.shape(test_X_left)[0], batch_size), 553 | range(batch_size, np.shape(test_X_left)[0] + 1, 554 | batch_size)): 555 | predict_Y = sess.run(predict_op, 556 | feed_dict={h_left: test_X_left[start:end], 557 | h_right: test_X_right[start:end], 558 | dropout: 1.0, phase: 0}) # no dropout 559 | overall_predict_Y.extend(predict_Y.tolist()) 560 | iter += 1 561 | 562 | stat(np.argmax(test_Y[:end], axis=1), 563 | np.array(overall_predict_Y, dtype=np.int32)) 564 | 565 | 566 | if __name__ == '__main__': 567 | train_10_fold_balanced() 568 | st = time.time() 569 | predict_on_full_dataset() 570 | print "Predict time on the full dataset: ", time.time() - st -------------------------------------------------------------------------------- /dcsim/classification_bigbench_keras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function, division 4 | 5 | import tensorflow as tf 6 | from keras.models import Model 7 | from keras.layers import Dense, Flatten, Dropout, Activation, Input, Lambda 8 | from keras.layers.normalization import BatchNormalization 9 | from keras.layers.pooling import GlobalAveragePooling1D 10 | from keras.utils import 
np_utils, Sequence 11 | from keras.utils.vis_utils import plot_model 12 | import keras as K 13 | import numpy as np 14 | import pandas as pd 15 | import os 16 | import time 17 | 18 | import matplotlib 19 | from matplotlib.ticker import NullFormatter 20 | from sklearn.decomposition import PCA 21 | from sklearn.metrics import precision_recall_fscore_support, accuracy_score 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.utils import class_weight, shuffle 24 | import matplotlib.pyplot as plt 25 | from mpl_toolkits.mplot3d import Axes3D 26 | 27 | # matplotlib.rcParams['font.family']='' 28 | matplotlib.rcParams['font.weight'] = 'bold' 29 | 30 | import graph_mat_data 31 | import preprocessing_bigbench 32 | 33 | bin_vec_dim = 88 34 | embedding_dim = 6 35 | dim = 128 36 | keep_prob = 0.6 37 | 38 | batch_size = 256 39 | test_size = 256 40 | 41 | 42 | # disable tensorflow debugging information 43 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 44 | 45 | logdir = '/tmp/logs' 46 | 47 | kernel_init = K.initializers.VarianceScaling(scale=1.0, mode='fan_avg', 48 | distribution='uniform') 49 | bias_init = K.initializers.Constant(value=0.01) 50 | 51 | 52 | def stat_by_type(y_true, y_pred, ts, fout=None): 53 | print('*' * 40 + " Performance by Type " + '*' * 40) 54 | # T1 55 | indices = np.where(ts==0) 56 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 57 | precision, recall, fscore, _ = \ 58 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 59 | average='binary') 60 | print("T1: accuracy: %.4f, recall: %.4f, " 61 | "precision: %.4f, f1 score: %.4f\n" % ( 62 | accuracy, recall, precision, fscore)) 63 | if fout is not None: 64 | fout.write("T1: accuracy: %.4f, recall: %.4f, " 65 | "precision: %.4f, f1 score: %.4f\n" % ( 66 | accuracy, recall, precision, fscore)) 67 | 68 | #T2 69 | indices = np.where(ts == 1) 70 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 71 | precision, recall, fscore, _ = \ 72 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 73 | average='binary') 74 | print("T2: accuracy: %.4f, recall: %.4f, " 75 | "precision: %.4f, f1 score: %.4f\n" % ( 76 | accuracy, recall, precision, fscore)) 77 | if fout is not None: 78 | fout.write("T2: accuracy: %.4f, recall: %.4f, " 79 | "precision: %.4f, f1 score: %.4f\n" % ( 80 | accuracy, recall, precision, fscore)) 81 | 82 | # ST3 83 | indices = np.where(ts == 2) 84 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 85 | precision, recall, fscore, _ = \ 86 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 87 | average='binary') 88 | print("ST3: accuracy: %.4f, recall: %.4f, " 89 | "precision: %.4f, f1 score: %.4f\n" % ( 90 | accuracy, recall, precision, fscore)) 91 | if fout is not None: 92 | fout.write("ST3: accuracy: %.4f, recall: %.4f, " 93 | "precision: %.4f, f1 score: %.4f\n" % ( 94 | accuracy, recall, precision, fscore)) 95 | 96 | #MT3 97 | indices = np.where(ts == 3) 98 | accuracy = accuracy_score(y_true[indices], y_pred[indices]) 99 | precision, recall, fscore, _ = \ 100 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 101 | average='binary') 102 | print("MT3: accuracy: %.4f, recall: %.4f, " 103 | "precision: %.4f, f1 score: %.4f\n" % ( 104 | accuracy, recall, precision, fscore)) 105 | if fout is not None: 106 | fout.write("MT3: accuracy: %.4f, recall: %.4f, " 107 | "precision: %.4f, f1 score: %.4f\n" % ( 108 | accuracy, recall, precision, fscore)) 109 | 110 | indices = np.where(ts == 4) 111 | accuracy = 
accuracy_score(y_true[indices], y_pred[indices]) 112 | precision, recall, fscore, _ = \ 113 | precision_recall_fscore_support(y_true[indices], y_pred[indices], 114 | average='binary') 115 | print("WT3/T4: accuracy: %.4f, recall: %.4f, " 116 | "precision: %.4f, f1 score: %.4f\n" % ( 117 | accuracy, recall, precision, fscore)) 118 | if fout is not None: 119 | fout.write("WT3/T4: accuracy: %.4f, recall: %.4f, " 120 | "precision: %.4f, f1 score: %.4f\n" % ( 121 | accuracy, recall, precision, fscore)) 122 | 123 | def from_sparse_arr(sparse_arr): 124 | mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32) 125 | for (i, j, k) in sparse_arr: 126 | mat[i, j, k] = 1 127 | return mat 128 | 129 | def from_sparse_arrs(sparse_arrs): 130 | mats = [] 131 | for sparse_arr in sparse_arrs: 132 | mats.append(from_sparse_arr(sparse_arr)) 133 | mats = np.array(mats, dtype=np.float32) 134 | return mats 135 | 136 | def fit_generator(Xl, Xr, Y): 137 | ''' 138 | Best set worker=1, use_multiprocessing=False 139 | :param Xl: 140 | :param Xr: 141 | :param Y: 142 | :return: 143 | ''' 144 | while True: 145 | Xl, Xr, Y = shuffle(Xl, Xr, Y) 146 | batch_Xl = [] 147 | batch_Xr = [] 148 | batch_y = [] 149 | count = 0 150 | for (xl, xr, y) in zip(Xl, Xr, Y): 151 | batch_Xl.append(from_sparse_arr(xl)) 152 | batch_Xr.append(from_sparse_arr(xr)) 153 | batch_y.append(y) 154 | count += 1 155 | if len(batch_y) == batch_size or count == np.shape(Y)[0]: 156 | yield ([np.array(batch_Xl), np.array(batch_Xr)], 157 | np.expand_dims(np.array(batch_y, dtype=np.float32), 158 | axis=1)) 159 | batch_Xl = [] 160 | batch_Xr = [] 161 | batch_y = [] 162 | 163 | class SequenceSamples(Sequence): 164 | def __init__(self, Xl, Xr, Y, batch_size): 165 | self.Xl, self.Xr, self.Y = Xl, Xr, Y 166 | self.batch_size = batch_size 167 | 168 | def __len__(self): 169 | return np.ceil(np.shape(self.Y)[0] / batch_size) 170 | 171 | def __getitem__(self, item): 172 | batch_Xl = from_sparse_arrs(self.Xl[item * self.batch_size:(item + 1) * self.batch_size]) 173 | batch_Xr = from_sparse_arrs(self.Xr[item * self.batch_size:(item + 1) * self.batch_size]) 174 | # Y shouldn't be (256,), it should has the same shape as the model's 175 | # output 176 | batch_Y = self.Y[item * self.batch_size:(item+1)*self.batch_size]\ 177 | .reshape(batch_size, 1) 178 | print("Batch size: ", batch_Xl.shape[0], batch_Xr.shape[0], 179 | batch_Y.shape[0]) 180 | return ([batch_Xl, batch_Xr], batch_Y) 181 | 182 | 183 | def feed_forward(x): 184 | x = Lambda(lambda input: K.backend.reshape(input, (-1, bin_vec_dim)), 185 | batch_input_shape=K.backend.get_variable_shape(x))(x) 186 | x = Dense(embedding_dim, 187 | kernel_initializer=kernel_init, 188 | bias_initializer=bias_init)(x) 189 | x = BatchNormalization()(x) 190 | x = Activation(activation='relu')(x) 191 | x = Lambda( 192 | lambda input: K.backend.reshape(input, (-1, dim * embedding_dim)))(x) 193 | x = Dense(256, kernel_initializer=kernel_init, 194 | bias_initializer=bias_init)(x) 195 | x = BatchNormalization()(x) 196 | x = Activation(activation='relu')(x) 197 | x = Dropout(keep_prob)(x) 198 | 199 | x = Dense(64, 200 | kernel_initializer=kernel_init, 201 | bias_initializer=bias_init)(x) 202 | x = BatchNormalization()(x) 203 | x = Activation(activation='relu')(x) 204 | x = Dropout(keep_prob)(x) 205 | x = Lambda(lambda input: K.backend.reshape(input, (-1, dim, 64)))(x) 206 | x = GlobalAveragePooling1D()(x) # (batch_size, 64) 207 | return x 208 | 209 | def classification(x1, x2): 210 | input = Input(shape=(dim, dim, bin_vec_dim)) 211 | # 
share layers 212 | feed_forward_model = Model(inputs=input, outputs=feed_forward(input)) 213 | x1 = feed_forward_model(x1) 214 | x2 = feed_forward_model(x2) 215 | concat_input = Input(shape=(128,)) 216 | # share layers 217 | merge_model = Model(inputs=concat_input, 218 | outputs=Activation(activation='relu')( 219 | BatchNormalization()( 220 | Dense(32, kernel_initializer=kernel_init, 221 | bias_initializer=bias_init, 222 | input_shape=(128,))( 223 | concat_input)))) 224 | 225 | xc1 = K.layers.concatenate([x1, x2]) 226 | xc1 = merge_model(xc1) 227 | 228 | xc2 = K.layers.concatenate([x2, x1]) 229 | xc2 = merge_model(xc2) 230 | 231 | xc = K.layers.average([xc1, xc2]) 232 | 233 | x = Dense(1, use_bias=False, activation='sigmoid', 234 | kernel_initializer=kernel_init, 235 | batch_input_shape=K.backend.get_variable_shape(xc))(xc) 236 | 237 | return x 238 | 239 | def model_summary(): 240 | X_left = Input((dim, dim, bin_vec_dim)) 241 | X_right = Input((dim, dim, bin_vec_dim)) 242 | predictions = classification(X_left, X_right) 243 | model = Model(inputs=[X_left, X_right], outputs=predictions) 244 | model.compile(optimizer=K.optimizers.adam(lr=0.0005), 245 | loss=K.losses.binary_crossentropy, 246 | metrics=['accuracy']) 247 | 248 | # plot_model(model, to_file='./result/plot/whole_model.png', show_shapes=True) 249 | 250 | def train_10_fold_balanced(): 251 | 252 | skf = StratifiedKFold(n_splits=10) 253 | 254 | Xl, Xr, y, ts = preprocessing_bigbench.load_dataset() 255 | 256 | fold_index = 0 257 | avg_accuracy = 0. 258 | avg_recall = 0. 259 | avg_precision = 0. 260 | avg_f1_score = 0. 261 | fout = open('result/10_fold_balanced.txt', 'w') 262 | if os.path.exists('result') is not True: 263 | os.mkdir("result") 264 | if os.path.exists("10_fold_balanced") is not True: 265 | os.mkdir("10_fold_balanced") 266 | for train_idx, test_idx in skf.split(Xl, y): 267 | t_beg = time.clock() 268 | 269 | print ('*' * 40 + str(fold_index) + '*' * 40) 270 | fold_path = os.path.join("10_fold_balanced", str(fold_index)) 271 | if os.path.exists(fold_path) is not True: 272 | os.mkdir(fold_path) 273 | 274 | train_X_left = Xl[train_idx] 275 | train_X_right = Xr[train_idx] 276 | train_Y = y[train_idx] 277 | 278 | train_Yt = train_Y[train_Y == 0] 279 | train_Xlt = train_X_left[train_Y == 0] 280 | train_Xrt = train_X_right[train_Y == 0] 281 | train_Xl = train_X_left[train_Y == 1][:5 * train_Yt.shape[0]] 282 | train_Xr = train_X_right[train_Y == 1][:5 * train_Yt.shape[0]] 283 | train_y = train_Y[train_Y == 1][:5 * train_Yt.shape[0]] 284 | train_X_left = np.concatenate((train_Xlt, train_Xl), axis=0) 285 | train_X_right = np.concatenate((train_Xrt, train_Xr), axis=0) 286 | train_Y = np.concatenate((train_Yt, train_y), axis=0) 287 | train_X_left, train_X_right, train_Y = shuffle(train_X_left, 288 | train_X_right, train_Y) 289 | 290 | test_X_left = Xl[test_idx] 291 | test_X_right = Xr[test_idx] 292 | test_Y = y[test_idx] 293 | test_ts = ts[test_idx] 294 | 295 | validate_X_left = from_sparse_arrs(test_X_left[:256]) 296 | validate_X_right = from_sparse_arrs(test_X_right[:256]) 297 | validate_Y = test_Y[:256] 298 | 299 | X_left = Input(shape=(dim, dim, bin_vec_dim)) 300 | X_right = Input(shape=(dim, dim, bin_vec_dim)) 301 | 302 | predictions = classification(X_left, X_right) 303 | 304 | model = Model(inputs=[X_left, X_right], outputs=predictions) 305 | 306 | model.compile(optimizer=K.optimizers.adam(lr=0.001), 307 | loss=K.losses.binary_crossentropy, 308 | metrics=['accuracy']) 309 | samples_generator = 
SequenceSamples(train_X_left,train_X_right, 310 | train_Y, batch_size) 311 | model.fit_generator(fit_generator(train_X_left, train_X_right, train_Y), 312 | steps_per_epoch=np.ceil(train_Y.shape[0]/batch_size), 313 | epochs=1, verbose=1, 314 | workers=1, use_multiprocessing=False, 315 | validation_data=([validate_X_left, validate_X_right], validate_Y)) 316 | 317 | t_end = time.clock() 318 | print('Time cost: %.2f' % (t_end - t_beg)) 319 | 320 | model.save(filepath=os.path.join(fold_path, 'model.ckpt')) 321 | 322 | print("Evaluation:") 323 | 324 | test_samples_generator = SequenceSamples(test_X_left, test_X_right, 325 | test_Y, batch_size), 326 | y_pred = model.predict_generator(fit_generator(test_X_left, 327 | test_X_right, test_Y), 328 | steps=np.ceil(test_Y.shape[0] / batch_size), 329 | workers=1, use_multiprocessing=False) 330 | y_pred = np.round(y_pred) 331 | accuracy = accuracy_score(test_Y, y_pred) 332 | precision, recall, fscore, _ = precision_recall_fscore_support(test_Y, 333 | y_pred, average='binary') 334 | print("Fold index: %d, accuracy: %.4f, recall: %.4f, " 335 | "precision: %.4f, f1 score: %.4f\n" % ( 336 | fold_index, accuracy, recall, precision, fscore)) 337 | fout.write('*' * 80 + '\n') 338 | fout.write('Fold %d:\n' % (fold_index)) 339 | fout.write('Time cost: %.2f\n' % (t_end - t_beg)) 340 | fout.write("Fold index: %d, accuracy: %.4f, recall: %.4f, " 341 | "precision: %.4f, f1 score: %.4f\n" % ( 342 | fold_index, accuracy, recall, precision, fscore)) 343 | stat_by_type(test_Y, y_pred, test_ts, fout) 344 | fout.flush() 345 | avg_accuracy += accuracy 346 | avg_precision += precision 347 | avg_recall += recall 348 | avg_f1_score += fscore 349 | 350 | avg_accuracy /= 10.0 351 | avg_precision /= 10.0 352 | avg_recall /= 10.0 353 | avg_f1_score /= 10.0 354 | print('Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 355 | 'score: %.4f' % ( 356 | avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 357 | fout.write('*' * 80 + '\n') 358 | fout.write( 359 | 'Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 ' 360 | 'score: %.4f' % (avg_accuracy, avg_recall, avg_precision, avg_f1_score)) 361 | fout.close() 362 | 363 | def train_on_selected_id(): 364 | t_beg = time.clock() 365 | 366 | Xl_selected, Xr_selected, y_selected, ts_selected, Xl, Xr, y, ts = preprocessing_bigbench.load_train_test(id=4) 367 | 368 | train_X_left = Xl_selected 369 | train_X_right = Xr_selected 370 | train_Y = y_selected 371 | 372 | train_Yt = train_Y[train_Y == 0] 373 | train_Xlt = train_X_left[train_Y == 0] 374 | train_Xrt = train_X_right[train_Y == 0] 375 | train_Xl = train_X_left[train_Y == 1][:5 * train_Yt.shape[0]] 376 | train_Xr = train_X_right[train_Y == 1][:5 * train_Yt.shape[0]] 377 | train_y = train_Y[train_Y == 1][:5 * train_Yt.shape[0]] 378 | train_X_left = np.concatenate((train_Xlt, train_Xl), axis=0) 379 | train_X_right = np.concatenate((train_Xrt, train_Xr), axis=0) 380 | train_Y = np.concatenate((train_Yt, train_y), axis=0) 381 | train_X_left, train_X_right, train_Y = shuffle(train_X_left, 382 | train_X_right, train_Y) 383 | print("Training data size: ", train_Y.shape[0]) 384 | 385 | test_X_left = Xl 386 | test_X_right = Xr 387 | test_Y = y 388 | test_ts = ts 389 | 390 | validate_X_left = from_sparse_arrs(test_X_left[:256]) 391 | validate_X_right = from_sparse_arrs(test_X_right[:256]) 392 | validate_Y = test_Y[:256] 393 | 394 | X_left = Input(shape=(dim, dim, bin_vec_dim)) 395 | X_right = Input(shape=(dim, dim, bin_vec_dim)) 396 | 397 | predictions = 
classification(X_left, X_right)

    model = Model(inputs=[X_left, X_right], outputs=predictions)

    model.compile(optimizer=K.optimizers.adam(lr=0.001),
                  loss=K.losses.binary_crossentropy,
                  metrics=['accuracy'])
    model.fit_generator(fit_generator(train_X_left, train_X_right, train_Y),
                        steps_per_epoch=np.ceil(train_Y.shape[0] / batch_size),
                        epochs=1, verbose=1,
                        workers=1, use_multiprocessing=False,
                        validation_data=(
                            [validate_X_left, validate_X_right], validate_Y))

    t_end = time.clock()
    print('Time cost: %.2f' % (t_end - t_beg))

    model.save(filepath=os.path.join('./model', 'model_id4.ckpt'))

    print("Evaluation:")

    y_pred = model.predict_generator(fit_generator(test_X_left,
                                                   test_X_right, test_Y),
                                     steps=np.ceil(test_Y.shape[0] /
                                                   batch_size),
                                     workers=1, use_multiprocessing=False)
    y_pred = np.round(y_pred)
    accuracy = accuracy_score(test_Y, y_pred)
    precision, recall, fscore, _ = precision_recall_fscore_support(test_Y,
                                                                   y_pred,
                                                                   average='binary')
    print("accuracy: %.4f, recall: %.4f, "
          "precision: %.4f, f1 score: %.4f\n" % (
              accuracy, recall, precision, fscore))

    stat_by_type(test_Y, y_pred, test_ts)


if __name__ == '__main__':
    # model_summary()
    beg = time.time()
    train_10_fold_balanced()
    st = time.time()
    print("Total time: ", st - beg)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/artifacts/SourceCodeSimilarity_jar.xml:
--------------------------------------------------------------------------------
$PROJECT_DIR$/out/artifacts/SourceCodeSimilarity_jar
(remaining XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE configuration; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/dcsim/encoding/.idea/workspace.xml:
--------------------------------------------------------------------------------
data_benchmark_fix
exclusionsFileName
(remaining XML markup not preserved in this dump)