├── Code
    ├── CodeMetrics.py
    ├── ExtractLearnedFeaturesAndClassification.py
    └── TransferableRepresentationLearning_LSTM_DNN.py
├── Data
    ├── CodeMetrics
    │   ├── FFmpeg.zip
    │   ├── LibPNG.zip
    │   ├── LibTIFF.zip
    │   └── VLC.zip
    ├── TrainedTokenizer
    │   └── tokenizer.zip
    ├── TrainedWord2vecModel
    │   └── 6_projects_w2v_model_CBOW.zip
    └── VulnerabilityData
    │   ├── 6_projects_functions.zip
    │   ├── Sample data Info.txt
    │   ├── except_ffmpeg_list.pkl
    │   ├── except_ffmpeg_list_id.pkl
    │   ├── ffmpeg_list.pkl
    │   └── ffmpeg_list_id.pkl
├── README.md
└── Vulnerabilities_info.xlsx


/Code/CodeMetrics.py:
--------------------------------------------------------------------------------
  1 | #-*- coding: utf-8 -*-
  2 | """
  3 | Created on Tue Oct 24 11:33:37 2017
  4 | 
  5 | Implement an RF classifier using the scikit-learn package
  6 | 
  7 | The RF classifier is trained using code metrics as features. This serves as the baseline for comparison with the method that uses transfer-learned representations as features.
  8 | 
  9 | """
 10 | 
 11 | import numpy as np
 12 | import pandas as pd
 13 | import time
 14 | import csv
 15 | 
 16 | from sklearn.metrics import classification_report
 17 | from sklearn.metrics import confusion_matrix
 18 | from sklearn.grid_search import GridSearchCV
 19 | from sklearn.model_selection import cross_val_predict
 20 | from sklearn.ensemble import RandomForestClassifier
 21 | 
# Wall-clock start time (seconds since the epoch); the script subtracts it
# from the current time at the very end to report the total run time.
script_start_time = time.time()

# Directory that holds the train/test CSV files. File names are appended by
# plain string concatenation, so the value must end with a path separator.
working_dir = '/home/your/user/name/CodeMetrics/vlc/'
print ("Script starts at: " + str(script_start_time))
 26 | 
 27 | 
 28 | # 1. Import processed data from CSV files.
 29 | #-------------------------------------------
 30 | def getData(filePath):
 31 |     df = pd.read_csv(filePath, header=None, sep=",")
 32 |     
 33 |     df_list = df.values.tolist()
 34 |     
 35 |     return np.asarray(df_list)
 36 | 
 37 | def GenerateLabels(input_arr):
 38 |     temp_arr = []
 39 |     for func_id in input_arr:
 40 |         temp_sub_arr = []
 41 |         if "cve" in func_id or "CVE" in func_id:
 42 |             temp_sub_arr.append(1)
 43 |         else:
 44 |             temp_sub_arr.append(0)
 45 |         temp_arr.append(temp_sub_arr)
 46 |     return np.asarray(temp_arr)
 47 | 
 48 | def storeOuput(arr, path):
 49 |     with open(path, 'w') as myfile:
 50 |         wr = csv.writer(myfile)
 51 |         wr.writerow(arr)
 52 | 
# Feature matrices (code metrics) and the matching function identifiers for
# the training and the testing split, each read from its own CSV file.
train_set_x = getData(working_dir + 'train_vlc_cm.csv')
train_set_id = getData(working_dir + 'train_vlc_id.csv')

test_set_x = getData(working_dir + 'test_vlc_cm.csv')
test_set_id = getData(working_dir + 'test_vlc_id.csv')

# getData returns a 2-D array; flatten so that iterating yields one
# identifier per sample.
train_set_id = np.ndarray.flatten(np.asarray(train_set_id))
test_set_id = np.ndarray.flatten(np.asarray(test_set_id))

# Labels are derived from the identifiers: 1 when the identifier mentions
# "cve"/"CVE", 0 otherwise (see GenerateLabels).
train_set_y = GenerateLabels(train_set_id)
test_set_y = GenerateLabels(test_set_id)

# GenerateLabels wraps every label in its own list; flatten to 1-D so the
# labels can be passed to the classifier and compared with its predictions.
train_set_y = np.ndarray.flatten(np.asarray(train_set_y))
test_set_y = np.ndarray.flatten(np.asarray(test_set_y))

print ("Training set: ")
print (train_set_x)

print ("Testing set: ")
print (test_set_x)

print ("The length of training and testing sets: ")

print (len(train_set_x), len(test_set_x), len(train_set_y), len(test_set_y))

print ("-------------------------")

# NOTE(review): the shape print below is commented out, so this heading is
# followed only by the positive-label counts.
print ("The shape of the datasets: " + "\r\n")

#print (train_set_x.shape, train_set_y.shape, test_set_x.shape, test_set_y.shape)

# Number of samples labelled 1 in the training and in the testing labels.
print (np.count_nonzero(train_set_y), np.count_nonzero(test_set_y))
 85 | 
# 2. Training RF parameters
# -------------------------------------------------------
# Hyper-parameter grid for the random forest. Every combination is tried:
# 7 * 6 * 7 * 2 * 2 * 7 = 8232 candidate settings.
param_grid = {'max_depth': [3,4,5,6,10,15,20],
              'min_samples_split': [2,3,4,5,6,10],
              'min_samples_leaf': [1,2,3,4,5,6,10],
              'bootstrap': [True,False],
              'criterion': ['gini','entropy'],
              'n_estimators': [10,20,30,40,50,60,70]}

# 3. Start training the RF model
#--------------------------------------------
# No `cv` argument is passed, so the library's default cross-validation
# scheme is used; n_jobs=-1 asks for all available cores.
clf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, n_jobs=-1)
clf = clf.fit(train_set_x, train_set_y)

print("best estimator found by grid search:")
print(clf.best_estimator_)

# Feature importances of the best forest. The value is only read by the
# commented-out savetxt call further down, so it is currently unused.
feature_importances = clf.best_estimator_.feature_importances_

print ("\r\n")

# Evaluate the model on the test set.
print("predicting on the test set")
#t0 = time()
y_predict = clf.predict(test_set_x)

y_predict_proba = clf.predict_proba(test_set_x)

# Persist predictions, class probabilities and the true test labels. The
# paths are relative, so the files land in the current working directory.
np.savetxt("y_predict_vlc_cm.csv", y_predict, delimiter=",")
np.savetxt("y_predict_proba_vlc_cm.csv", y_predict_proba, delimiter=",")
storeOuput(test_set_y, "test_label_vlc.csv")
#np.savetxt("./test_label_3.csv", test_set_y, delimiter=",")
#np.savetxt("./feature_importances_short.csv", feature_importances, delimiter=",")

#y_predict_proba = cross_val_predict(clf, )

#print (y_predict_proba)

# Accuracy in percent: share of test samples whose prediction equals the
# label. Both arrays are 1-D here, so the comparison is element-wise.
accuracy = np.mean(test_set_y==y_predict)*100
print ("accuracy = " +  str(accuracy))

target_names = ["Non-vulnerable","Vulnerable"] #non-vulnerable->0, vulnerable->1
# labels=[0,1] fixes the row/column order of the confusion matrix.
print (confusion_matrix(test_set_y, y_predict, labels=[0,1]))   
print ("\r\n")
print ("\r\n")
print (classification_report(test_set_y, y_predict, target_names=target_names))
134 | 
print ("\r\n")
# Report the total run time. The template used to be concatenated with the
# number, which printed a literal "%s" and put the value after the closing
# dashes; %-formatting substitutes it where the placeholder is.
print ("--- %s seconds ---" % (time.time() - script_start_time))


--------------------------------------------------------------------------------
/Code/ExtractLearnedFeaturesAndClassification.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Thu Nov 02 16:35:39 2017
  4 | 
  5 | This file has two functions:
  6 |     1. load the trained LSTM network for obtaining the function representations as features.
  7 |     2. train a random forest classifier using obtained features.
  8 | 
  9 | """
 10 | 
 11 | import time
 12 | import numpy as np
 13 | import pandas as pd
 14 | import os
 15 | import csv
 16 | import pickle
 17 | 
 18 | from sklearn.model_selection import train_test_split
 19 | 
 20 | from keras.preprocessing.text import Tokenizer
 21 | from keras.preprocessing.sequence import pad_sequences
 22 | from keras.models import load_model
 23 | from keras.preprocessing import sequence
 24 | from keras import backend as K
 25 | 
 26 | from sklearn.metrics import classification_report
 27 | from sklearn.metrics import confusion_matrix
 28 | from sklearn.grid_search import GridSearchCV
 29 | from sklearn.model_selection import cross_val_predict
 30 | from sklearn.ensemble import RandomForestClassifier
 31 | 
# Wall-clock start time (seconds since the epoch), used for the run-time
# report at the end of the script.
script_start_time = time.time()

print ("Script starts at: " + str(script_start_time))

# ------------------------------------------------------------ #
# Parameters used
MAX_LEN = 1000 # The Padding Length for each sample.
# NOTE(review): EMBEDDING_DIM is defined but not referenced anywhere else in
# this script.
EMBEDDING_DIM = 100 # The Embedding Dimension for each element within the sequence of a data sample. 

#--------------------------------------------------------#
# 1. Directories of all the needed files.
# All directory strings must end with a path separator: file names are
# appended to them by plain string concatenation.

# NOTE(review): project_name is not referenced anywhere else in this script.
project_name = "FFmpeg"

working_dir = '/home/your/user/name/TransferRepresentationLearning/ffmpeg/'

w2v_dir = '/home/your/user/name/TransferRepresentationLearning/word2vec/'

model_saved_path = '/home/your/user/name/TransferRepresentationLearning/models/'

#--------------------------------------------------------#
# 2. Load the saved model and compile it.

# The path where the trained models are saved.
model = load_model(model_saved_path + '1st_1000_100_32_90_test_on_ffmpeg.h5')

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# NOTE(review): presumably summary() prints the table itself and returns
# None, in which case this also prints "None" -- confirm.
print (model.summary())
 63 | 
 64 | #--------------------------------------------------------#
 65 | # 3. Load the data
 66 | 
 67 | def LoadSavedData(path):
 68 |     with open(path, 'rb') as f:
 69 |         loaded_data = pickle.load(f)
 70 |     return loaded_data
 71 | 
 72 | def GenerateLabels(input_arr):
 73 |     temp_arr = []
 74 |     for func_id in input_arr:
 75 |         temp_sub_arr = []
 76 |         if "cve" in func_id or "CVE" in func_id:
 77 |             temp_sub_arr.append(1)
 78 |         else:
 79 |             temp_sub_arr.append(0)
 80 |         temp_arr.append(temp_sub_arr)
 81 |     return np.asarray(temp_arr)
 82 | 
# Pickled inputs: the "except_ffmpeg" files supply the training functions and
# their identifiers, the "ffmpeg" files supply the testing ones. Labels are
# later derived from the identifiers, so each list and its *_id counterpart
# are expected to line up one-to-one (the prints below show both lengths).
training_list = LoadSavedData(working_dir + 'except_ffmpeg_list.pkl')
training_list_id = LoadSavedData(working_dir + 'except_ffmpeg_list_id.pkl')

testing_list = LoadSavedData(working_dir + 'ffmpeg_list.pkl')
testing_list_id = LoadSavedData(working_dir + 'ffmpeg_list_id.pkl')

print ("The number of training functions: " + str(len(training_list)) + "  ID: " + str(len(training_list_id)))
print ("The number of testing functions: " + str(len(testing_list)) + "  ID: " + str(len(testing_list_id)))
 91 | 
 92 | #------------------------------------#
 93 | # 2. Load pre-trained word2vec and tokens
 94 |     
 95 | def JoinSubLists(list_to_join):
 96 |     new_list = []
 97 |     
 98 |     for sub_list_token in list_to_join:
 99 |         new_line = ','.join(sub_list_token)
100 |         new_list.append(new_line)
101 |     return new_list
102 | 
# One comma-joined string per function, the form handed to the tokenizer.
new_training_list = JoinSubLists(training_list)
new_testing_list = JoinSubLists(testing_list)

# The tokenizer is loaded from a pickle rather than fitted here, so both
# splits are encoded with the same, previously built vocabulary.
tokenizer = LoadSavedData(w2v_dir + 'tokenizer.pickle')
train_sequences = tokenizer.texts_to_sequences(new_training_list)
test_sequences = tokenizer.texts_to_sequences(new_testing_list)
word_index = tokenizer.word_index
print ('Found %s unique tokens.' % len(word_index))

print ("The length of tokenized sequence: " + str(len(train_sequences)))
print ("The length of tokenized sequence: " + str(len(test_sequences)))

# Load the pre-trained embeddings.
# NOTE(review): in this script the handle is only printed -- it is never
# read and never closed.
w2v_model_path = w2v_dir + '6_projects_w2v_model_CBOW.txt'
w2v_model = open(w2v_model_path, encoding="latin1")

print ("----------------------------------------")
print ("The trained word2vec model: ")
print (w2v_model)

#------------------------------------#
# 3. Do the paddings.
print ("max_len ", MAX_LEN)
print('Pad sequences (samples x time)')

# Bring every sequence to length MAX_LEN; padding='post' requests that the
# padding be placed at the end of a sequence.
train_sequences_pad = pad_sequences(train_sequences, maxlen = MAX_LEN, padding ='post')
test_sequences_pad = pad_sequences(test_sequences, maxlen = MAX_LEN, padding ='post')

train_set_x = train_sequences_pad
test_set_x = test_sequences_pad

# Labels come from the identifiers; GenerateLabels yields one single-element
# row per sample.
train_set_y = GenerateLabels(training_list_id)
test_set_y = GenerateLabels(testing_list_id)

print (len(train_set_x), len(train_set_y), len(test_set_x), len(test_set_y))

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

print (train_set_x.shape, train_set_y.shape, test_set_x.shape, test_set_y.shape)

# Number of samples labelled 1 in each split.
print (np.count_nonzero(train_set_y), np.count_nonzero(test_set_y))
146 |         
147 | # ------------------------------------------------------------ #
148 | # 4. Get the activations (outputs of each layer)
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
    """Run `model_inputs` through `model` and collect per-layer outputs.

    Args:
        model: the Keras model whose layers are evaluated.
        model_inputs: the input batch; a list of batches when the model has
            several inputs.
        print_shape_only: when True print only the shape of every collected
            output, otherwise print the full arrays.
        layer_name: restrict the result to the layer with this name; None
            selects every layer.

    Returns:
        A list with one output array per selected layer, in the order of
        `model.layers`.
    """
    print('----- activations -----')
    inp = model.input

    # A single-input model exposes a bare tensor; wrap it so that the
    # learning-phase placeholder can be appended below.
    model_multi_inputs_cond = isinstance(inp, list)
    if not model_multi_inputs_cond:
        inp = [inp]

    print("Preparing outputs....")
    outputs = [layer.output for layer in model.layers
               if layer_name is None or layer.name == layer_name]

    print ("-----------------")
    print (len(outputs))
    print ("-----------------")
    # One backend function per selected layer: inputs + learning phase -> output.
    funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs]

    # Learning phase: 0 = test mode, 1 = training mode. Representations must
    # be extracted in test mode so that layers such as dropout or batch
    # normalisation behave deterministically. The previous value of 1.
    # selected training mode even though its comment claimed the opposite.
    if model_multi_inputs_cond:
        list_inputs = list(model_inputs) + [0.]
    else:
        list_inputs = [model_inputs, 0.]

    print ("--------Layer Ouputs---------")

    activations = [func(list_inputs)[0] for func in funcs]
    for layer_activations in activations:
        if print_shape_only:
            print(layer_activations.shape)
        else:
            print(layer_activations)
    return activations
189 | 
# 4.1 Get the activations (representations) of the testing samples. Only
# test_set_x is passed through the network here; no training-set
# representations are computed in this script.

print ("Saving the layer outputs...")

# One array per model layer; print_shape_only=True keeps the console output
# to the shapes instead of dumping the full arrays.
repre_testing = get_activations(model, test_set_x, print_shape_only=True)
195 | 
def storeOuput(arr, path):
    """Write `arr` as a single CSV row to `path`, replacing any existing file.

    newline='' is what the csv module asks for when opening the target
    file; it stops the platform from translating the writer's own line
    terminator again, which would otherwise leave a blank line after the
    row on some systems.
    """
    with open(path, 'w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(arr)
200 | 
# repre_testing holds one output array per layer of the loaded model. The
# live code below picks index 4, i.e. the fifth entry of that list.
# NOTE(review): an earlier comment here spoke of "five layers" and of using
# the third and fourth one, which does not match index 4 -- confirm against
# the model summary which layer is intended.

# train_X = train_layer_three = repre_training[2] 
# test_X = test_layer_three = repre_testing[2] 

#train_X = train_layer_three = repre_training
ffmpeg_repre = test_layer_three = repre_testing[4] 

#train_X = train_layer_three = repre_training
#test_X = test_layer_three = repre_testing 

#train_y = np.ndarray.flatten(np.asarray(train_set_y))
#libtiff_label = np.ndarray.flatten(np.asarray(test_set_y))

print ("-------------------------")

print ("The shape of the learned representations: " + "\r\n")

print (ffmpeg_repre.shape)

"""
The obtained representations for project Ffmpeg can be used as features for training a machine learning classifier (here we use random forest).  

Suppose that project FFmpeg has very limited labeled data. To simulate this situation, we divide the total FFmpeg functions into two sets: 
    
25% of total functions are used for training (simulated the labeled data), and 75% of total functions for testing (simulated the unlabeled data)

"""

# Split representations and identifiers together so they stay aligned;
# test_size=0.75 keeps a quarter for training, random_state fixes the split.
# This rebinds train_set_x / test_set_x, but test_set_y is NOT rebound and
# still holds the labels of all testing functions.
train_set_x, test_set_x, train_set_y_id, test_set_y_id = train_test_split(ffmpeg_repre, testing_list_id, test_size=0.75, random_state=42) 

train_X = train_set_x
train_y = GenerateLabels(train_set_y_id)
test_X = test_set_x
test_y = GenerateLabels(test_set_y_id)

# GenerateLabels returns one single-element row per sample; flatten to 1-D
# for the classifier and for the element-wise accuracy comparison.
train_y = np.ndarray.flatten(np.asarray(train_y))
test_y = np.ndarray.flatten(np.asarray(test_y))

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

print (len(train_X), len(train_y), len(test_X), len(test_y))

print (train_X.shape, train_y.shape, test_X.shape, test_y.shape)

# Number of samples labelled 1 in each split.
print (np.count_nonzero(train_y), np.count_nonzero(test_y))
249 | 
#-------------------------------------------------------------------------------
# Invoke Sklearn tools for classification -- using Random Forest

# Train a random forest on the learned representations of the labelled split.
print ("Fitting the classifier to the training set") 
param_grid = {'max_depth': [2,3,4,5,9,10,11,15,20],
              'min_samples_split': [2,3,4,5,6,10],
              'min_samples_leaf': [2,3,4,5,6,10],
              'bootstrap': [True,False],
              'criterion': ['gini','entropy'],
              'n_estimators': [10,20,30,40,50,55,60,65,70]}

# Exhaustive grid search. No `cv` argument is passed, so the library's
# default cross-validation scheme applies (this is not a 10-fold search).
clf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, n_jobs=-1)
clf = clf.fit(train_X, train_y)
print("best estimator found by grid search:")
print(clf.best_estimator_)

print ("\r\n")

# Evaluate the model on the held-out split.
print("predicting on the test set")
y_predict = clf.predict(test_X)

y_predict_proba = clf.predict_proba(test_X)

# Make sure the output directory exists, so that the results of the long
# grid search are not lost to a missing-directory error.
os.makedirs("./new_results", exist_ok=True)

np.savetxt("./new_results/learned_repr_ffmpeg1.csv", ffmpeg_repre, delimiter=",")
np.savetxt("./new_results/y_predict_proba1_w2v_gmp_ffmpeg1.csv", y_predict_proba, delimiter=",")
np.savetxt("./new_results/y_predict1_w2v_gmp_ffmpeg1.csv", y_predict, delimiter=",")
# Save the labels of the evaluated split (test_y). test_set_y still holds the
# labels of ALL testing functions from before the 25/75 split, so writing it
# produced a label file that did not line up with the prediction files.
np.savetxt("./new_results/test_label_ffmpeg1.csv", test_y, delimiter=",")
storeOuput(test_set_y_id, "./new_results/test_output_ids_ffmpeg1.csv")
storeOuput(testing_list_id, "./new_results/ffmpeg_ids1.csv")

# Accuracy in percent; both arrays are 1-D, so the comparison is element-wise.
accuracy = np.mean(test_y==y_predict)*100
print ("accuracy = " +  str(accuracy))

target_names = ["Non-vulnerable","Vulnerable"] #non-vulnerable->0, vulnerable->1
# labels=[0,1] fixes the row/column order of the confusion matrix.
print (confusion_matrix(test_y, y_predict, labels=[0,1]))   
print ("\r\n")
print ("\r\n")
print (classification_report(test_y, y_predict, target_names=target_names))
301 | 
# Release the backend session held by the loaded model.
K.clear_session()

print ("\r\n")
# Report the total run time. The template used to be concatenated with the
# number, which printed a literal "%s"; %-formatting substitutes the value
# where the placeholder is.
print ("--- %s seconds --- " % (time.time() - script_start_time))


--------------------------------------------------------------------------------
/Code/TransferableRepresentationLearning_LSTM_DNN.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Nov 11 14:51:39 2017
  4 | 
  5 | This file implements a LSTM network that is capable of leveraging the historical vulnerable function data for learning the representations.
  6 | 
  7 | """
  8 | 
  9 | import time
 10 | import pickle
 11 | import csv
 12 | import numpy as np
 13 | 
 14 | from sklearn.model_selection import train_test_split
 15 | 
 16 | from keras.preprocessing.text import Tokenizer
 17 | from keras.preprocessing.sequence import pad_sequences
 18 | 
 19 | from sklearn.metrics import classification_report
 20 | from sklearn.metrics import confusion_matrix
 21 | from sklearn.grid_search import GridSearchCV
 22 | from sklearn.model_selection import cross_val_predict
 23 | from sklearn.ensemble import RandomForestClassifier
 24 | 
 25 | from keras.models import Sequential
 26 | from keras.layers import Dense, Dropout
 27 | from keras.layers import Embedding, Bidirectional
 28 | from keras.layers import LSTM
 29 | from keras.layers import GlobalMaxPooling1D
 30 | from keras.callbacks import ModelCheckpoint, EarlyStopping
 31 | from keras.callbacks import TensorBoard, CSVLogger
 32 | from keras import backend as K
 33 | 
# Wall-clock start time (seconds since the epoch), used for the run-time
# report at the end of the script.
script_start_time = time.time()

print ("Script starts at: " + str(script_start_time))

# ------------------------------------------------------------ #
# Parameters used
MAX_LEN = 1000 # The Padding Length for each sample.
EMBEDDING_DIM = 100 # The Embedding Dimension for each element within the sequence of a data sample. 

# NOTE(review): the three NUM_* constants are not referenced anywhere else
# in this script; the actual split sizes come from train_test_split.
NUM_TRAIN_SAMPLE = 19326
NUM_VALIDATION_SAMPLE = 8283
NUM_TEST_SAMPLE = 4921
BATCH_SIZE = 32
EPOCHS = 150

# All directory strings must end with a path separator: file names are
# appended to them by plain string concatenation.
working_dir = '/home/your/user/name/TransferRepresentationLearning/ffmpeg/'

w2v_dir = '/home/your/user/name/TransferRepresentationLearning/word2vec/'

# Destination of the TensorBoard and CSV training logs.
log_path = '/home/your/user/name/TransferRepresentationLearning/Logs/'

# The path where the trained models are saved.
model_saved_path = '/home/your/user/name/TransferRepresentationLearning/models/'

# Common prefix of the checkpoint files and of the CSV log file.
saved_model_name = "1st_1000_100_32_90_test_on_ffmpeg"
 59 | 
 60 | def LoadSavedData(path):
 61 |     with open(path, 'rb') as f:
 62 |         loaded_data = pickle.load(f)
 63 |     return loaded_data
 64 | 
 65 | def GenerateLabels(input_arr):
 66 |     temp_arr = []
 67 |     for func_id in input_arr:
 68 |         temp_sub_arr = []
 69 |         if "cve" in func_id or "CVE" in func_id:
 70 |             temp_sub_arr.append(1)
 71 |         else:
 72 |             temp_sub_arr.append(0)
 73 |         temp_arr.append(temp_sub_arr)
 74 |     return np.asarray(temp_arr)
 75 | 
# Pickled inputs: the "except_ffmpeg" files supply the training functions and
# their identifiers, the "ffmpeg" files supply the testing ones. Labels are
# later derived from the identifiers, so each list and its *_id counterpart
# are expected to line up one-to-one (the prints below show both lengths).
training_list = LoadSavedData(working_dir + 'except_ffmpeg_list.pkl')
training_list_id = LoadSavedData(working_dir + 'except_ffmpeg_list_id.pkl')

testing_list = LoadSavedData(working_dir + 'ffmpeg_list.pkl')
testing_list_id = LoadSavedData(working_dir + 'ffmpeg_list_id.pkl')

print ("The number of training functions: " + str(len(training_list)) + "  ID: " + str(len(training_list_id)))
print ("The number of testing functions: " + str(len(testing_list)) + "  ID: " + str(len(testing_list_id)))
 84 | 
 85 | #------------------------------------#
 86 | # 2. Load pre-trained word2vec and tokens
 87 |     
 88 | def JoinSubLists(list_to_join):
 89 |     new_list = []
 90 |     
 91 |     for sub_list_token in list_to_join:
 92 |         new_line = ','.join(sub_list_token)
 93 |         new_list.append(new_line)
 94 |     return new_list
 95 | 
# One comma-joined string per function, the form handed to the tokenizer.
new_training_list = JoinSubLists(training_list)
new_testing_list = JoinSubLists(testing_list)

# The tokenizer is loaded from a pickle rather than fitted here, so both
# splits are encoded with the same, previously built vocabulary.
tokenizer = LoadSavedData(w2v_dir + 'tokenizer.pickle')
train_sequences = tokenizer.texts_to_sequences(new_training_list)
test_sequences = tokenizer.texts_to_sequences(new_testing_list)
word_index = tokenizer.word_index
print ('Found %s unique tokens.' % len(word_index))

print ("The length of tokenized sequence: " + str(len(train_sequences)))
print ("The length of tokenized sequence: " + str(len(test_sequences)))

# Load the pre-trained embeddings.
# The handle stays open until the embedding loop in section 4 has consumed
# it; it is closed right after that loop.
w2v_model_path = w2v_dir + '6_projects_w2v_model_CBOW.txt'
w2v_model = open(w2v_model_path, encoding="latin1")

print ("----------------------------------------")
print ("The trained word2vec model: ")
print (w2v_model)

#------------------------------------#
# 3. Do the paddings.
print ("max_len ", MAX_LEN)
print('Pad sequences (samples x time)')

# Bring every sequence to length MAX_LEN; padding='post' requests that the
# padding be placed at the end of a sequence.
train_sequences_pad = pad_sequences(train_sequences, maxlen = MAX_LEN, padding ='post')
test_sequences_pad = pad_sequences(test_sequences, maxlen = MAX_LEN, padding ='post')

print (train_sequences_pad.shape)
print (test_sequences_pad.shape)

# Hold out 30% of the (non-FFmpeg) training functions for validation.
# Sequences and identifiers are split together so they stay aligned;
# random_state makes the split reproducible.
train_set_x, validation_set_x, train_set_y_id, validation_set_id = train_test_split(train_sequences_pad, training_list_id, test_size=0.3, random_state=42) 

print ("Training set: ")

print (train_set_x)

#print test_validation_set_x

print ("The length of the training set: " + str(len(train_set_x)) + "\n" + "The length of the training labels: " +  str(len(train_set_y_id)))

print ("Validation set: ")

print (validation_set_x)

print ("Testing set: ")

# The testing set is the complete FFmpeg data; it is not split further here.
test_set_x = test_sequences_pad
test_set_id = testing_list_id

print (test_set_x)

print (len(validation_set_x), len(test_set_x), len(validation_set_id), len(test_set_id))

#print validation_set_x, test_set_x, validation_set_y, test_set_y

# The *_id lists hold the sample names; convert them to 0/1 labels.

# The ids of the training samples are kept (train_set_y_id), so that after
# training the learned features can still be matched to their samples.
train_set_y = GenerateLabels(train_set_y_id)
validation_set_y = GenerateLabels(validation_set_id)
test_set_y = GenerateLabels(test_set_id)

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

print (train_set_x.shape, train_set_y.shape, validation_set_x.shape, validation_set_y.shape, test_set_x.shape, test_set_y.shape)

# Number of samples labelled 1 in each split.
print (np.count_nonzero(train_set_y), np.count_nonzero(validation_set_y), np.count_nonzero(test_set_y))

# ----------------------------------------------------- #
# 4. Preparing the Embedding layer

embeddings_index = {} # maps a token (first field of a line) to its embedding vector.

# Parse the word2vec text file: every non-blank line is split on whitespace,
# the first field is the token, the remaining fields are the coefficients.
# NOTE(review): if the file starts with a "<vocab size> <dim>" header line,
# that line is stored like a regular entry -- confirm the file format.
for line in w2v_model:
    if not line.isspace():
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
w2v_model.close()

print('Found %s word vectors.' % len(embeddings_index))

# Row i holds the vector of the token whose tokenizer index is i. One extra
# row is allocated (len + 1) so that the largest index fits; rows that are
# never assigned stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
189 | 
190 | # Get the activations (outputs of each layer)
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
    """Run `model_inputs` through `model` and collect per-layer outputs.

    Args:
        model: the Keras model whose layers are evaluated.
        model_inputs: the input batch; a list of batches when the model has
            several inputs.
        print_shape_only: when True print only the shape of every collected
            output, otherwise print the full arrays.
        layer_name: restrict the result to the layer with this name; None
            selects every layer.

    Returns:
        A list with one output array per selected layer, in the order of
        `model.layers`.
    """
    print('----- activations -----')
    inp = model.input

    # A single-input model exposes a bare tensor; wrap it so that the
    # learning-phase placeholder can be appended below.
    model_multi_inputs_cond = isinstance(inp, list)
    if not model_multi_inputs_cond:
        inp = [inp]

    outputs = [layer.output for layer in model.layers
               if layer_name is None or layer.name == layer_name]

    # One backend function per selected layer: inputs + learning phase -> output.
    funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs]

    # Learning phase: 0 = test mode, 1 = training mode. Activations must be
    # taken in test mode so that layers such as dropout or batch
    # normalisation behave deterministically. The previous value of 1.
    # selected training mode even though its comment claimed the opposite.
    if model_multi_inputs_cond:
        list_inputs = list(model_inputs) + [0.]
    else:
        list_inputs = [model_inputs, 0.]

    activations = [func(list_inputs)[0] for func in funcs]
    for layer_activations in activations:
        if print_shape_only:
            print(layer_activations.shape)
        else:
            print(layer_activations)
    return activations
225 | 
def storeOuput(arr, path):
    """Write `arr` as one fully quoted CSV row to `path`, replacing the file.

    newline='' is what the csv module asks for when opening the target
    file; it stops the platform from translating the writer's own line
    terminator again, which would otherwise leave a blank line after the
    row on some systems.
    """
    with open(path, 'w', newline='') as myfile:
        wr = csv.writer(myfile, delimiter=',', quoting=csv.QUOTE_ALL)
        wr.writerow(arr)
230 |         
# ------------------------------------------------------------ #
# 5. Define network structure
model = Sequential()
# Layers are numbered by the order of the add() calls below (0-based).
model.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_LEN,
                            trainable=False)) # Layer 0: embedding initialised from embedding_matrix and frozen (trainable=False)
model.add(Bidirectional(LSTM(64, activation='tanh', return_sequences=True))) # Layer 1: bidirectional LSTM (tanh) that returns the whole sequence
model.add(GlobalMaxPooling1D()) # Layer 2: global max pooling
#model.add(Bidirectional(LSTM(64))) # disabled alternative: a second LSTM layer
model.add(Dense(64, activation='tanh')) # Layer 3: dense, 64 units, tanh
model.add(Dense(32)) # Layer 4: dense, 32 units, no activation argument given
model.add(Dense(1, activation='sigmoid')) # Layer 5: single sigmoid output unit

print ("-------------------------")

print ("strat compiling the model...")

# ------------------------------------------------------------ #
# 6. Configure the learning process
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Callbacks: checkpointing and early stopping both monitor val_loss; the
# checkpoint file name additionally embeds the epoch number and val_acc.
# TensorBoard and CSV logs are written below log_path.
callbacks_list = [
        ModelCheckpoint(filepath = model_saved_path + saved_model_name +'_{epoch:02d}_{val_acc:.3f}.h5', monitor='val_loss', verbose=2, save_best_only=True, period=1),
        EarlyStopping(monitor='val_loss', patience=60, verbose=2, mode="min"),
		 TensorBoard(log_dir=log_path, batch_size = BATCH_SIZE,  write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None),
        CSVLogger(log_path + saved_model_name + '.log')]


print ("start training the model...")

# ------------------------------------------------------------ #
# 7. Train the model. 
model.fit(train_set_x, train_set_y,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
		   shuffle = False, # The data was already shuffled earlier, so shuffling again is unnecessary; a fixed order also keeps ids and sample features aligned.
          #validation_split=0.5,
          validation_data = (validation_set_x, validation_set_y), # Validation data is not used for training (or development of the model)
          callbacks=callbacks_list, # Checkpoint the best weights and stop training early when val_loss stops improving.
          verbose=2)

print ("Model training completed! ")

print ("-----------------------------------------------")

print ("Start predicting....")

# Predict with the model as it is after the last training epoch; no
# checkpoint is reloaded in this script.
predicted_classes = model.predict_classes(test_set_x, batch_size=BATCH_SIZE, verbose=2)

#print (predicted_classes)

# NOTE(review): test_accuracy is computed but never printed or stored.
test_accuracy = np.mean(np.equal(test_set_y, predicted_classes))

print ("LSTM classification result: ")

target_names = ["Non-vulnerable","Vulnerable"] #non-vulnerable->0, vulnerable->1
# labels=[0,1] fixes the row/column order of the confusion matrix.
print (confusion_matrix(test_set_y, predicted_classes, labels=[0,1]))   
print ("\r\n")
print ("\r\n")
print (classification_report(test_set_y, predicted_classes, target_names=target_names))

print ("LSTM prediction completed.")

# Release the backend session held by the model.
K.clear_session()
301 | 
print ("\r\n")
# Report the total run time. The template used to be concatenated with the
# number, which printed a literal "%s" and put the value after the closing
# dashes; %-formatting substitutes it where the placeholder is.
print ("--- %s seconds ---" % (time.time() - script_start_time))


--------------------------------------------------------------------------------
/Data/CodeMetrics/FFmpeg.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/FFmpeg.zip


--------------------------------------------------------------------------------
/Data/CodeMetrics/LibPNG.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/LibPNG.zip


--------------------------------------------------------------------------------
/Data/CodeMetrics/LibTIFF.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/LibTIFF.zip


--------------------------------------------------------------------------------
/Data/CodeMetrics/VLC.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/VLC.zip


--------------------------------------------------------------------------------
/Data/TrainedTokenizer/tokenizer.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/TrainedTokenizer/tokenizer.zip


--------------------------------------------------------------------------------
/Data/TrainedWord2vecModel/6_projects_w2v_model_CBOW.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/TrainedWord2vecModel/6_projects_w2v_model_CBOW.zip


--------------------------------------------------------------------------------
/Data/VulnerabilityData/6_projects_functions.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/6_projects_functions.zip


--------------------------------------------------------------------------------
/Data/VulnerabilityData/Sample data Info.txt:
--------------------------------------------------------------------------------
 1 | The .pkl files are Python binaries created by pickle which is a Python module.
 2 | 
 3 | These .pkl files are for the experiments on FFmpeg project. 
 4 | 
 5 | The experiments are based on the scenario where there are some labeled historical projects available:
 6 | 
 7 | 1. The except_ffmpeg_list.pkl contains the vulnerable and non-vulnerable functions from the historical projects. The except_ffmpeg_list_id.pkl contains the function IDs, which can be used for generating labels (all the vulnerable functions are named with their CVE IDs).
 8 | 
 9 | 2. The ffmpeg_list.pkl contains the functions from the target project (FFmpeg).
10 | 


--------------------------------------------------------------------------------
/Data/VulnerabilityData/except_ffmpeg_list.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/except_ffmpeg_list.pkl


--------------------------------------------------------------------------------
/Data/VulnerabilityData/except_ffmpeg_list_id.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/except_ffmpeg_list_id.pkl


--------------------------------------------------------------------------------
/Data/VulnerabilityData/ffmpeg_list.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/ffmpeg_list.pkl


--------------------------------------------------------------------------------
/Data/VulnerabilityData/ffmpeg_list_id.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/ffmpeg_list_id.pkl


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Transferable Representation Learning
 2 | 
 3 | Hi there, welcome to this page!
 4 | 
 5 | This page contains the code and data used in the paper [Cross-Project Transfer Representation Learning for Vulnerable Function Discovery](https://ieeexplore.ieee.org/abstract/document/8329207/) by Guanjun Lin; Jun Zhang; Wei Luo; Lei Pan; Yang Xiang; Olivier De Vel and Paul Montague.
 6 | 
 7 | ### Instructions:
 8 | 
 9 | The Vulnerabilities_info.xlsx file contains information about the collected function-level vulnerabilities (it is just a record for reference). These vulnerabilities are from 6 open source projects: [FFmpeg](https://github.com/FFmpeg/FFmpeg), [LibTIFF](https://github.com/vadz/libtiff), [LibPNG](https://github.com/glennrp/libpng), [Pidgin](https://pidgin.im/), [Asterisk](https://www.asterisk.org/get-started) and [VLC Media Player](https://www.videolan.org/vlc/index.html). The vulnerability information was collected from the [National Vulnerability Database (NVD)](https://nvd.nist.gov/) up to the end of July 2017.
10 | 
11 | ### Requirements for code:
12 | 
13 |  * [Tensorflow](https://www.tensorflow.org/)
14 |  * [Keras](https://github.com/fchollet/keras/tree/master/keras)
15 |  * [Scikit-learn](http://scikit-learn.org/stable/)
16 |  * [Gensim](https://radimrehurek.com/gensim/)
17 |  * Python >= 2.7
18 | 
19 | The dependencies can be installed using [Anaconda](https://www.anaconda.com/download/). For example:
20 | 
21 | ```bash
22 | $ bash Anaconda3-5.0.1-Linux-x86_64.sh
23 | ```
24 | 
25 | The "Data" folder contains the following sub folders:
26 | 1) VulnerabilityData -- It contains a ZIP file which stores the vulnerable functions and part of the non-vulnerable functions from 6 open source projects. After unzipping the file, one will find 6 folders named after the projects. Each folder contains the source code of the non-vulnerable functions (named with their function names) and vulnerable functions (named with the CVE IDs):
27 |    * The vulnerable functions are all named with the CVE IDs (their names are starting with ‘cve-’ or ‘CVE-’). For example, “cve-2017-14005.c” is a vulnerable function. 
28 |    * The non-vulnerable functions are named with the format: “xxxx_file_name_function_name.c” to avoid duplicated file/function names. For example “1374_cmdutils.c_show_devices.c” is a non-vulnerable function.
29 | 
30 | In the pre-training phase, one can choose any 5 projects as the historical data for training an LSTM network. The labels can be generated based on the file names (vulnerable functions have the CVE IDs as their file names; please see the code for more details). Then, the remaining 1 project can be used as the input to the pre-trained network for generating representations. Finally, the generated representations can be used as features for training a classifier. 
31 | 
32 | 2) CodeMetrics -- It stores the code metrics extracted from the source code files of the open source projects. The code metrics are used as features to train a random forest classifier as the baseline to compare with the method which uses transfer-learned representations as features. We used [Understand](https://scitools.com/) which is a commercial code enhancement tool for extracting function-level code metrics. We included 23 code metrics extracted from the vulnerable functions of 6 projects.
33 |  
34 | 3) TrainedTokenizer -- It contains the trained tokenizer file which is used for converting the serialized AST lists to numeric tokens.
35 | 
36 | 4) TrainedWord2vecModel -- It includes the trained Word2vec model. The model was trained on the code base of 6 open source projects. The Word2vec model is used in the embedding layer of the LSTM network for converting input sequence to meaningful embeddings.
37 | 
38 | The "Code" folder contains the Python code samples. 
39 | 1) TransferableRepresentationLearning_LSTM_DNN.py file is for LSTM network training. It defines the structure of the Bi-LSTM network used in the paper. The input of the file is the historical vulnerable functions that have labels. The output of the file is a trained LSTM network capable of obtaining vulnerable function representations. 
40 | 
41 | 2) ExtractLearnedFeaturesAndClassification.py file is for obtaining the function representations from the pre-trained LSTM network. It also includes the code for training a random forest classifier based on the obtained function representations as features.
42 | 
43 | 3) CodeMetrics.py file is to train a random forest classifier based on the selected 23 code metrics.
44 | 
45 | If you are interested in our project, please contact junzhang@swin.edu.au for more information. If you use our code and data in your work, please kindly cite our paper in your work. 
46 | 
47 | The BibTeX format:
48 | 
49 | ```
50 | @article{lin2018cross,
51 |   title={Cross-Project Transfer Representation Learning for Vulnerable Function Discovery},
52 |   author={Lin, Guanjun and Zhang, Jun and Luo, Wei and Pan, Lei and Xiang, Yang and De Vel, Olivier and Montague, Paul},
53 |   journal={IEEE Transactions on Industrial Informatics},
54 |   year={2018},
55 |   publisher={IEEE}
56 | }
57 | ```
58 | 
59 | Thank you!
60 | 


--------------------------------------------------------------------------------
/Vulnerabilities_info.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Vulnerabilities_info.xlsx


--------------------------------------------------------------------------------