├── Code ├── CodeMetrics.py ├── ExtractLearnedFeaturesAndClassification.py └── TransferableRepresentationLearning_LSTM_DNN.py ├── Data ├── CodeMetrics │ ├── FFmpeg.zip │ ├── LibPNG.zip │ ├── LibTIFF.zip │ └── VLC.zip ├── TrainedTokenizer │ └── tokenizer.zip ├── TrainedWord2vecModel │ └── 6_projects_w2v_model_CBOW.zip └── VulnerabilityData │ ├── 6_projects_functions.zip │ ├── Sample data Info.txt │ ├── except_ffmpeg_list.pkl │ ├── except_ffmpeg_list_id.pkl │ ├── ffmpeg_list.pkl │ └── ffmpeg_list_id.pkl ├── README.md └── Vulnerabilities_info.xlsx /Code/CodeMetrics.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Oct 24 11:33:37 2017 4 | 5 | Implement RF classifier using sciki-learn package 6 | 7 | The RF classifer is trained using code metrics as features. Using code metrics as features is used as the baseline to compare with the method which uses transfer-learned representations as features. 8 | 9 | """ 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import time 14 | import csv 15 | 16 | from sklearn.metrics import classification_report 17 | from sklearn.metrics import confusion_matrix 18 | from sklearn.grid_search import GridSearchCV 19 | from sklearn.model_selection import cross_val_predict 20 | from sklearn.ensemble import RandomForestClassifier 21 | 22 | script_start_time = time.time() 23 | 24 | working_dir = '/home/your/user/name/CodeMetrics/vlc/' 25 | print ("Script starts at: " + str(script_start_time)) 26 | 27 | 28 | # 1. Import processed data from CSV files. 
#-------------------------------------------
def getData(filePath):
    """Load a header-less, comma-separated CSV file as a NumPy array."""
    frame = pd.read_csv(filePath, header=None, sep=",")
    # The values are turned into nested Python lists first and then back
    # into an array.
    return np.asarray(frame.values.tolist())


def GenerateLabels(input_arr):
    """Map function IDs to binary labels.

    An ID that contains "cve" or "CVE" gets the label 1, every other ID
    gets 0. Each label is wrapped in its own one-element list, so the
    returned array has one row per ID.
    """
    labels = [[1] if ("cve" in func_id or "CVE" in func_id) else [0]
              for func_id in input_arr]
    return np.asarray(labels)


def storeOuput(arr, path):
    """Write ``arr`` to the file at ``path`` as a single CSV row."""
    with open(path, 'w') as out_file:
        csv.writer(out_file).writerow(arr)


# Feature matrices and the matching function IDs of the train/test split.
train_set_x = getData(working_dir + 'train_vlc_cm.csv')
train_set_id = getData(working_dir + 'train_vlc_id.csv')

test_set_x = getData(working_dir + 'test_vlc_cm.csv')
test_set_id = getData(working_dir + 'test_vlc_id.csv')

# The ID files are flattened to one dimension before labelling.
train_set_id = np.asarray(train_set_id).flatten()
test_set_id = np.asarray(test_set_id).flatten()

# Labels are derived from the IDs and flattened to one dimension as well.
train_set_y = np.asarray(GenerateLabels(train_set_id)).flatten()
test_set_y = np.asarray(GenerateLabels(test_set_id)).flatten()

print ("Training set: ")
print (train_set_x)

print ("Testing set: ")
print (test_set_x)

print ("The length of training and testing sets: ")

print (len(train_set_x), len(test_set_x), len(train_set_y), len(test_set_y))

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

#print (train_set_x.shape, train_set_y.shape, test_set_x.shape, test_set_y.shape)

# Number of samples labelled 1 in each set.
print (np.count_nonzero(train_set_y), np.count_nonzero(test_set_y))

# 2.
# Training RF parameters
# -------------------------------------------------------
# Hyper-parameter grid that the grid search below explores exhaustively.
param_grid = {'max_depth': [3,4,5,6,10,15,20],
              'min_samples_split': [2,3,4,5,6,10],
              'min_samples_leaf': [1,2,3,4,5,6,10],
              'bootstrap': [True,False],
              'criterion': ['gini','entropy'],
              'n_estimators': [10,20,30,40,50,60,70]}

# 3. Start training the RF model
#--------------------------------------------
# NOTE(review): GridSearchCV is imported from sklearn.grid_search at the top of
# this script; that module was removed in scikit-learn 0.20, where the class
# lives in sklearn.model_selection -- confirm the installed version.
# No `cv` argument is passed, so the library's default fold count is used.
clf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, n_jobs=-1)
clf = clf.fit(train_set_x, train_set_y)

print("best estimator found by grid search:")
print(clf.best_estimator_)

# Keep the feature importances of the best forest. They are only stored here;
# the line that would write them to disk is commented out below.
feature_importances = clf.best_estimator_.feature_importances_

print ("\r\n")

#evaluate the model on the test set
print("predicting on the test set")
y_predict = clf.predict(test_set_x)

y_predict_proba = clf.predict_proba(test_set_x)

# Persist predictions, class probabilities and the true labels
# (written to the current working directory).
np.savetxt("y_predict_vlc_cm.csv", y_predict, delimiter=",")
np.savetxt("y_predict_proba_vlc_cm.csv", y_predict_proba, delimiter=",")
storeOuput(test_set_y, "test_label_vlc.csv")
#np.savetxt("./test_label_3.csv", test_set_y, delimiter=",")
#np.savetxt("./feature_importances_short.csv", feature_importances, delimiter=",")

# Accuracy (percentage of test samples predicted correctly)
accuracy = np.mean(test_set_y==y_predict)*100
print ("accuracy = " + str(accuracy))

target_names = ["Non-vulnerable","Vulnerable"] #non-vulnerable->0, vulnerable->1
print (confusion_matrix(test_set_y, y_predict, labels=[0,1]))
print ("\r\n")
print ("\r\n")
print (classification_report(test_set_y, y_predict, target_names=target_names))

print ("\r\n")
# The elapsed time is substituted into the template; concatenating it would
# leave a literal "%s" in the message.
print ("--- %s seconds ---" % (time.time() - script_start_time))
-------------------------------------------------------------------------------- /Code/ExtractLearnedFeaturesAndClassification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 02 16:35:39 2017 4 | 5 | This file has two functions: 6 | 1. load the trained LSTM network for obtaining the function representations as features. 7 | 2. train a random forest classifier using obtained features. 8 | 9 | """ 10 | 11 | import time 12 | import numpy as np 13 | import pandas as pd 14 | import os 15 | import csv 16 | import pickle 17 | 18 | from sklearn.model_selection import train_test_split 19 | 20 | from keras.preprocessing.text import Tokenizer 21 | from keras.preprocessing.sequence import pad_sequences 22 | from keras.models import load_model 23 | from keras.preprocessing import sequence 24 | from keras import backend as K 25 | 26 | from sklearn.metrics import classification_report 27 | from sklearn.metrics import confusion_matrix 28 | from sklearn.grid_search import GridSearchCV 29 | from sklearn.model_selection import cross_val_predict 30 | from sklearn.ensemble import RandomForestClassifier 31 | 32 | script_start_time = time.time() 33 | 34 | print ("Script starts at: " + str(script_start_time)) 35 | 36 | # ------------------------------------------------------------ # 37 | # Parameters used 38 | MAX_LEN = 1000 # The Padding Length for each sample. 39 | EMBEDDING_DIM = 100 # The Embedding Dimension for each element within the sequence of a data sample. 40 | 41 | #--------------------------------------------------------# 42 | # 1. Directories of all the needed files. 
project_name = "FFmpeg"

working_dir = '/home/your/user/name/TransferRepresentationLearning/ffmpeg/'

w2v_dir = '/home/your/user/name/TransferRepresentationLearning/word2vec/'

model_saved_path = '/home/your/user/name/TransferRepresentationLearning/models/'

#--------------------------------------------------------#
# 2. Load the saved model and compile it.

# The path where the trained models are saved.
model = load_model(model_saved_path + '1st_1000_100_32_90_test_on_ffmpeg.h5')

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# summary() prints the table itself; wrapping it in print() would also emit
# its return value.
model.summary()

#--------------------------------------------------------#
# 3. Load the data

def LoadSavedData(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE(review): pickle.load can execute arbitrary code while loading;
    only use this on trusted files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def GenerateLabels(input_arr):
    """Map function IDs to binary labels.

    An ID that contains "cve" or "CVE" gets the label 1, every other ID
    gets 0. Each label is wrapped in its own one-element list, so the
    returned array has one row per ID.
    """
    return np.asarray([[1] if ("cve" in func_id or "CVE" in func_id) else [0]
                       for func_id in input_arr])


# Functions and IDs unpickled from the except_ffmpeg_* files (named "training"
# here) and from the ffmpeg_* files (named "testing").
training_list = LoadSavedData(working_dir + 'except_ffmpeg_list.pkl')
training_list_id = LoadSavedData(working_dir + 'except_ffmpeg_list_id.pkl')

testing_list = LoadSavedData(working_dir + 'ffmpeg_list.pkl')
testing_list_id = LoadSavedData(working_dir + 'ffmpeg_list_id.pkl')

print ("The number of training functions: " + str(len(training_list)) + " ID: " + str(len(training_list_id)))
print ("The number of testing functions: " + str(len(testing_list)) + " ID: " + str(len(testing_list_id)))

#------------------------------------#
# 2.
# Load pre-trained word2vec and tokens

def JoinSubLists(list_to_join):
    """Join every token list into one comma-separated string.

    Returns a new list with one string per entry of ``list_to_join``.
    """
    return [','.join(sub_list_token) for sub_list_token in list_to_join]


new_training_list = JoinSubLists(training_list)
new_testing_list = JoinSubLists(testing_list)

# The tokenizer is unpickled from disk and used to turn each joined string
# into a sequence of integer token indices.
tokenizer = LoadSavedData(w2v_dir + 'tokenizer.pickle')
train_sequences = tokenizer.texts_to_sequences(new_training_list)
test_sequences = tokenizer.texts_to_sequences(new_testing_list)
word_index = tokenizer.word_index
print ('Found %s unique tokens.' % len(word_index))

print ("The length of tokenized sequence: " + str(len(train_sequences)))
print ("The length of tokenized sequence: " + str(len(test_sequences)))

# Load the pre-trained embeddings.
# This script only prints the file object and never reads the vectors, so the
# handle is closed right after printing instead of being left open.
w2v_model_path = w2v_dir + '6_projects_w2v_model_CBOW.txt'
with open(w2v_model_path, encoding="latin1") as w2v_model:
    print ("----------------------------------------")
    print ("The trained word2vec model: ")
    print (w2v_model)

#------------------------------------#
# 3. Do the paddings.
print ("max_len ", MAX_LEN)
print('Pad sequences (samples x time)')

# Pad (at the end) every index sequence to exactly MAX_LEN entries.
train_sequences_pad = pad_sequences(train_sequences, maxlen = MAX_LEN, padding ='post')
test_sequences_pad = pad_sequences(test_sequences, maxlen = MAX_LEN, padding ='post')

train_set_x = train_sequences_pad
test_set_x = test_sequences_pad

train_set_y = GenerateLabels(training_list_id)
test_set_y = GenerateLabels(testing_list_id)

print (len(train_set_x), len(train_set_y), len(test_set_x), len(test_set_y))

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

print (train_set_x.shape, train_set_y.shape, test_set_x.shape, test_set_y.shape)

# Number of samples labelled 1 in each set.
print (np.count_nonzero(train_set_y), np.count_nonzero(test_set_y))

# ------------------------------------------------------------ #
# 4. Get the activations (outputs of each layer)
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
    """Evaluate the model's layers on ``model_inputs`` and return their outputs.

    Args:
        model: Keras model whose layers are evaluated.
        model_inputs: input array, or a list of arrays when the model has
            several inputs.
        print_shape_only: when True print only the shape of each layer
            output, otherwise print the values themselves.
        layer_name: evaluate only layers with this name; ``None`` evaluates
            every layer.

    Returns:
        A list with one activation array per selected layer, in the order of
        ``model.layers``.
    """
    print('----- activations -----')
    inp = model.input

    model_multi_inputs_cond = isinstance(inp, list)
    if not model_multi_inputs_cond:
        # only one input! let's wrap it in a list.
        inp = [inp]

    print("Preparing outputs....")
    outputs = [layer.output for layer in model.layers if
               layer.name == layer_name or layer_name is None]  # all layer outputs

    print ("-----------------")
    print (len(outputs))
    print ("-----------------")
    funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs]  # evaluation functions

    # Keras learning-phase flag: 0 = test, 1 = train. The representations are
    # extracted in test mode, so that dropout / batch normalisation layers (if
    # the model has any) run in inference mode.
    learning_phase = 0.
    if model_multi_inputs_cond:
        list_inputs = list(model_inputs) + [learning_phase]
    else:
        list_inputs = [model_inputs, learning_phase]

    print ("--------Layer Ouputs---------")

    layer_outputs = [func(list_inputs)[0] for func in funcs]
    activations = []
    for layer_activations in layer_outputs:
        activations.append(layer_activations)
        if print_shape_only:
            print(layer_activations.shape)
        else:
            print(layer_activations)
    return activations

# 4.1 Get the activations (representations) of the testing samples.

print ("Saving the layer outputs...")

repre_testing = get_activations(model, test_set_x, print_shape_only=True)

def storeOuput(arr, path):
    """Write ``arr`` to the file at ``path`` as a single CSV row."""
    with open(path, 'w') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(arr)

# get_activations returns one entry per layer, so index 4 is the output of
# model.layers[4].
# NOTE(review): the layer list of the loaded model is not visible in this
# script -- confirm that index 4 is the intended representation layer.
ffmpeg_repre = test_layer_three = repre_testing[4]

print ("-------------------------")

print ("The shape of the learned representations: " + "\r\n")

print (ffmpeg_repre.shape)

# The obtained representations for project FFmpeg can be used as features for
# training a machine learning classifier (here we use random forest).
#
# Suppose that project FFmpeg has very limited labeled data. To simulate this
# situation, we divide the total FFmpeg functions into two sets:
#
# 25% of total functions are used for training (simulating the labeled data),
# and 75% of total functions for testing (simulating the unlabeled data).

train_set_x, test_set_x, train_set_y_id, test_set_y_id = train_test_split(ffmpeg_repre, testing_list_id, test_size=0.75, random_state=42)

train_X = train_set_x
train_y = GenerateLabels(train_set_y_id)
test_X = test_set_x
test_y = GenerateLabels(test_set_y_id)

# Flatten the label columns to one dimension.
train_y = np.ndarray.flatten(np.asarray(train_y))
test_y = np.ndarray.flatten(np.asarray(test_y))

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

print (len(train_X), len(train_y), len(test_X), len(test_y))

print (train_X.shape, train_y.shape, test_X.shape, test_y.shape)

print (np.count_nonzero(train_y), np.count_nonzero(test_y))

#-------------------------------------------------------------------------------
# Invoke Sklearn tools for classification -- using Random Forest

#train a random forest model
print ("Fitting the classifier to the training set")
# Hyper-parameter grid that the grid search below explores exhaustively.
param_grid = {'max_depth': [2,3,4,5,9,10,11,15,20],
              'min_samples_split': [2,3,4,5,6,10],
              'min_samples_leaf': [2,3,4,5,6,10],
              'bootstrap': [True,False],
              'criterion': ['gini','entropy'],
              'n_estimators': [10,20,30,40,50,55,60,65,70]}

# Construct the grid search classifier. No `cv` argument is passed, so the
# library's default number of folds is used (pass cv=10 for 10-fold CV).
clf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, n_jobs=-1)
clf = clf.fit(train_X, train_y)
print("best estimator found by grid search:")
print(clf.best_estimator_)

#for params, mean_score, scores in clf.grid_scores_:
#print (mean_score,
# scores.std()*2, params)
print ("\r\n")

#evaluate the model on the test set
print("predicting on the test set")
y_predict = clf.predict(test_X)

y_predict_proba = clf.predict_proba(test_X)

# Make sure the output directory exists so that the results of the (long)
# grid search are not lost to a missing-directory error.
results_dir = "./new_results"
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

np.savetxt("./new_results/learned_repr_ffmpeg1.csv", ffmpeg_repre, delimiter=",")
np.savetxt("./new_results/y_predict_proba1_w2v_gmp_ffmpeg1.csv", y_predict_proba, delimiter=",")
np.savetxt("./new_results/y_predict1_w2v_gmp_ffmpeg1.csv", y_predict, delimiter=",")
# NOTE(review): test_set_y holds the labels of ALL FFmpeg functions (it pairs
# with learned_repr_ffmpeg1.csv and ffmpeg_ids1.csv), not only the 75% split
# that y_predict covers; the labels matching y_predict are in test_y -- confirm
# which set this file is meant to contain.
np.savetxt("./new_results/test_label_ffmpeg1.csv", test_set_y, delimiter=",")
storeOuput(test_set_y_id, "./new_results/test_output_ids_ffmpeg1.csv")
storeOuput(testing_list_id, "./new_results/ffmpeg_ids1.csv")

# Accuracy (percentage of test samples predicted correctly)
accuracy = np.mean(test_y==y_predict)*100
print ("accuracy = " + str(accuracy))

target_names = ["Non-vulnerable","Vulnerable"] #non-vulnerable->0, vulnerable->1
print (confusion_matrix(test_y, y_predict, labels=[0,1]))
print ("\r\n")
print ("\r\n")
print (classification_report(test_y, y_predict, target_names=target_names))

# Release the Keras backend session.
K.clear_session()

print ("\r\n")
# The elapsed time is substituted into the template; concatenating it would
# leave a literal "%s" in the message.
print ("--- %s seconds --- " % (time.time() - script_start_time))
# -------------------------------------------------------------------------------- /Code/TransferableRepresentationLearning_LSTM_DNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Nov 11 14:51:39 2017 4 | 5 | This file implements a LSTM network that is capable of leveraging the historical vulnerable function data for learning the representations.
6 | 7 | """ 8 | 9 | import time 10 | import pickle 11 | import csv 12 | import numpy as np 13 | 14 | from sklearn.model_selection import train_test_split 15 | 16 | from keras.preprocessing.text import Tokenizer 17 | from keras.preprocessing.sequence import pad_sequences 18 | 19 | from sklearn.metrics import classification_report 20 | from sklearn.metrics import confusion_matrix 21 | from sklearn.grid_search import GridSearchCV 22 | from sklearn.model_selection import cross_val_predict 23 | from sklearn.ensemble import RandomForestClassifier 24 | 25 | from keras.models import Sequential 26 | from keras.layers import Dense, Dropout 27 | from keras.layers import Embedding, Bidirectional 28 | from keras.layers import LSTM 29 | from keras.layers import GlobalMaxPooling1D 30 | from keras.callbacks import ModelCheckpoint, EarlyStopping 31 | from keras.callbacks import TensorBoard, CSVLogger 32 | from keras import backend as K 33 | 34 | script_start_time = time.time() 35 | 36 | print ("Script starts at: " + str(script_start_time)) 37 | 38 | # ------------------------------------------------------------ # 39 | # Parameters used 40 | MAX_LEN = 1000 # The Padding Length for each sample. 41 | EMBEDDING_DIM = 100 # The Embedding Dimension for each element within the sequence of a data sample. 42 | 43 | NUM_TRAIN_SAMPLE = 19326 44 | NUM_VALIDATION_SAMPLE = 8283 45 | NUM_TEST_SAMPLE = 4921 46 | BATCH_SIZE = 32 47 | EPOCHS = 150 48 | 49 | working_dir = '/home/your/user/name/TransferRepresentationLearning/ffmpeg/' 50 | 51 | w2v_dir = '/home/your/user/name/TransferRepresentationLearning/word2vec/' 52 | 53 | log_path = '/home/your/user/name/TransferRepresentationLearning/Logs/' 54 | 55 | # The path where the trained models are saved. 
model_saved_path = '/home/your/user/name/TransferRepresentationLearning/models/'

saved_model_name = "1st_1000_100_32_90_test_on_ffmpeg"


def LoadSavedData(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


def GenerateLabels(input_arr):
    """Map function IDs to binary labels.

    An ID that contains "cve" or "CVE" gets the label 1, every other ID
    gets 0. Each label is wrapped in its own one-element list, so the
    returned array has one row per ID.
    """
    labels = [[1] if ("cve" in func_id or "CVE" in func_id) else [0]
              for func_id in input_arr]
    return np.asarray(labels)


# Functions and IDs unpickled from the except_ffmpeg_* files (named "training"
# here) and from the ffmpeg_* files (named "testing").
training_list = LoadSavedData(working_dir + 'except_ffmpeg_list.pkl')
training_list_id = LoadSavedData(working_dir + 'except_ffmpeg_list_id.pkl')

testing_list = LoadSavedData(working_dir + 'ffmpeg_list.pkl')
testing_list_id = LoadSavedData(working_dir + 'ffmpeg_list_id.pkl')

print ("The number of training functions: " + str(len(training_list)) + " ID: " + str(len(training_list_id)))
print ("The number of testing functions: " + str(len(testing_list)) + " ID: " + str(len(testing_list_id)))

#------------------------------------#
# 2. Load pre-trained word2vec and tokens

def JoinSubLists(list_to_join):
    """Join every token list into one comma-separated string."""
    return [','.join(tokens) for tokens in list_to_join]


new_training_list = JoinSubLists(training_list)
new_testing_list = JoinSubLists(testing_list)

# The unpickled tokenizer turns each joined string into a sequence of
# integer token indices.
tokenizer = LoadSavedData(w2v_dir + 'tokenizer.pickle')
train_sequences = tokenizer.texts_to_sequences(new_training_list)
test_sequences = tokenizer.texts_to_sequences(new_testing_list)
word_index = tokenizer.word_index
print ('Found %s unique tokens.' % len(word_index))

print ("The length of tokenized sequence: " + str(len(train_sequences)))
print ("The length of tokenized sequence: " + str(len(test_sequences)))

# Load the pre-trained embeddings.
# The handle is left open on purpose: it is read line by line (and closed)
# further down, when the embedding matrix is built.
w2v_model_path = w2v_dir + '6_projects_w2v_model_CBOW.txt'
w2v_model = open(w2v_model_path, encoding="latin1")

print ("----------------------------------------")
print ("The trained word2vec model: ")
print (w2v_model)

#------------------------------------#
# 3. Do the paddings.
print ("max_len ", MAX_LEN)
print('Pad sequences (samples x time)')

# Pad (at the end) every index sequence to exactly MAX_LEN entries.
train_sequences_pad = pad_sequences(train_sequences, maxlen = MAX_LEN, padding ='post')
test_sequences_pad = pad_sequences(test_sequences, maxlen = MAX_LEN, padding ='post')

print (train_sequences_pad.shape)
print (test_sequences_pad.shape)

# Hold out 30% of the padded training sequences (and their IDs) for validation.
train_set_x, validation_set_x, train_set_y_id, validation_set_id = train_test_split(
    train_sequences_pad, training_list_id, test_size=0.3, random_state=42)

print ("Training set: ")

print (train_set_x)

#print test_validation_set_x

print ("The length of the training set: " + str(len(train_set_x)) + "\n" + "The length of the training labels: " + str(len(train_set_y_id)))

print ("Validation set: ")

print (validation_set_x)

print ("Testing set: ")

# The whole second dataset is used as the test set.
test_set_x = test_sequences_pad
test_set_id = testing_list_id

print (test_set_x)

print (len(validation_set_x), len(test_set_x), len(validation_set_id), len(test_set_id))

#print validation_set_x, test_set_x, validation_set_y, test_set_y

# Now we need to convert all the *_set_y to 0 and 1 labels. All the *_set_y lists contain the actual names of all the samples.
# The samples' ids of the train_set should be reserved, so after training we can still use the ids to identify which feature sets belong to which sample.
train_set_y = GenerateLabels(train_set_y_id)
validation_set_y = GenerateLabels(validation_set_id)
test_set_y = GenerateLabels(test_set_id)

print ("-------------------------")

print ("The shape of the datasets: " + "\r\n")

print (train_set_x.shape, train_set_y.shape, validation_set_x.shape, validation_set_y.shape, test_set_x.shape, test_set_y.shape)

# Number of samples labelled 1 in each set.
print (np.count_nonzero(train_set_y), np.count_nonzero(validation_set_y), np.count_nonzero(test_set_y))

# ----------------------------------------------------- #
# 4. Preparing the Embedding layer

embeddings_index = {}  # maps a token to its EMBEDDING_DIM-dimensional vector.

# Use the loaded model: each non-blank line is "<token> <v1> <v2> ...".
for line in w2v_model:
    if line.isspace():
        continue
    values = line.split()
    coefs = np.asarray(values[1:], dtype='float32')
    # Skip rows that do not carry exactly EMBEDDING_DIM values (for example a
    # "<vocab_size> <dim>" header line, if the file has one). A one-value row
    # would otherwise be silently broadcast over a whole embedding row below.
    if len(coefs) != EMBEDDING_DIM:
        continue
    embeddings_index[values[0]] = coefs
w2v_model.close()

print('Found %s word vectors.' % len(embeddings_index))

# Row i holds the vector of the token with tokenizer index i; index 0 and
# tokens without a pre-trained vector stay all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Get the activations (outputs of each layer)
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
    """Evaluate the model's layers on ``model_inputs`` and return their outputs.

    Args:
        model: Keras model whose layers are evaluated.
        model_inputs: input array, or a list of arrays when the model has
            several inputs.
        print_shape_only: when True print only the shape of each layer
            output, otherwise print the values themselves.
        layer_name: evaluate only layers with this name; ``None`` evaluates
            every layer.

    Returns:
        A list with one activation array per selected layer, in the order of
        ``model.layers``.
    """
    print('----- activations -----')
    inp = model.input

    model_multi_inputs_cond = isinstance(inp, list)
    if not model_multi_inputs_cond:
        # only one input! let's wrap it in a list.
        inp = [inp]

    outputs = [layer.output for layer in model.layers if
               layer.name == layer_name or layer_name is None]  # all layer outputs

    funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs]  # evaluation functions

    # Keras learning-phase flag: 0 = test, 1 = train. Activations are
    # evaluated in test mode, so that dropout / batch normalisation layers (if
    # the model has any) run in inference mode.
    learning_phase = 0.
    if model_multi_inputs_cond:
        list_inputs = list(model_inputs) + [learning_phase]
    else:
        list_inputs = [model_inputs, learning_phase]

    layer_outputs = [func(list_inputs)[0] for func in funcs]
    activations = []
    for layer_activations in layer_outputs:
        activations.append(layer_activations)
        if print_shape_only:
            print(layer_activations.shape)
        else:
            print(layer_activations)
    return activations

def storeOuput(arr, path):
    """Write ``arr`` to the file at ``path`` as a single, fully quoted CSV row."""
    with open(path, 'w') as myfile:
        wr = csv.writer(myfile, delimiter=',', quoting=csv.QUOTE_ALL)
        wr.writerow(arr)

# ------------------------------------------------------------ #
# 5. Define network structure
model = Sequential()

model.add(Embedding(len(word_index) + 1,
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_LEN,
                    trainable=False))  # Layer 0: frozen embedding layer initialised from embedding_matrix
model.add(Bidirectional(LSTM(64, activation='tanh', return_sequences=True)))  # Layer 1: bidirectional LSTM (tanh), returns the full sequence
model.add(GlobalMaxPooling1D())  # Layer 2: global max pooling
#model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='tanh'))  # Layer 3: dense layer (tanh)
model.add(Dense(32))  # Layer 4: dense layer, no activation argument given
model.add(Dense(1, activation='sigmoid'))  # Layer 5: sigmoid output

print ("-------------------------")

print ("strat compiling the model...")

# ------------------------------------------------------------ #
# 6.
# Configure the learning process
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Checkpoint the model whenever val_loss improves, stop after 60 epochs
# without a val_loss improvement, and log to TensorBoard and to a CSV file.
callbacks_list = [
    ModelCheckpoint(filepath = model_saved_path + saved_model_name +'_{epoch:02d}_{val_acc:.3f}.h5', monitor='val_loss', verbose=2, save_best_only=True, period=1),
    EarlyStopping(monitor='val_loss', patience=60, verbose=2, mode="min"),
    TensorBoard(log_dir=log_path, batch_size = BATCH_SIZE, write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None),
    CSVLogger(log_path + saved_model_name + '.log')]


print ("start training the model...")

# ------------------------------------------------------------ #
# 7. Train the model.
model.fit(train_set_x, train_set_y,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
          shuffle = False,  # The data has already been shuffled by the train/validation split, so it is unnecessary to shuffle it again. (And also, we need to correspond the ids to the features of the samples.)
          #validation_split=0.5,
          validation_data = (validation_set_x, validation_set_y),  # Validation data is not used for training (or development of the model)
          callbacks=callbacks_list,  # Save the best weights and stop early.
          verbose=2)

print ("Model training completed! ")

print ("-----------------------------------------------")

print ("Start predicting....")

predicted_classes = model.predict_classes(test_set_x, batch_size=BATCH_SIZE, verbose=2)

#print (predicted_classes)

# Fraction of test samples whose predicted class equals the label.
test_accuracy = np.mean(np.equal(test_set_y, predicted_classes))

print ("LSTM classification result: ")

# Report the accuracy that was computed above (as a percentage).
print ("accuracy = " + str(test_accuracy * 100))

target_names = ["Non-vulnerable","Vulnerable"] #non-vulnerable->0, vulnerable->1
print (confusion_matrix(test_set_y, predicted_classes, labels=[0,1]))
print ("\r\n")
print ("\r\n")
print (classification_report(test_set_y, predicted_classes, target_names=target_names))

print ("LSTM prediction completed.")

# Release the Keras backend session.
K.clear_session()

print ("\r\n")
# The elapsed time is substituted into the template; concatenating it would
# leave a literal "%s" in the message.
print ("--- %s seconds ---" % (time.time() - script_start_time))
# -------------------------------------------------------------------------------- /Data/CodeMetrics/FFmpeg.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/FFmpeg.zip -------------------------------------------------------------------------------- /Data/CodeMetrics/LibPNG.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/LibPNG.zip -------------------------------------------------------------------------------- /Data/CodeMetrics/LibTIFF.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/LibTIFF.zip -------------------------------------------------------------------------------- /Data/CodeMetrics/VLC.zip:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/CodeMetrics/VLC.zip -------------------------------------------------------------------------------- /Data/TrainedTokenizer/tokenizer.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/TrainedTokenizer/tokenizer.zip -------------------------------------------------------------------------------- /Data/TrainedWord2vecModel/6_projects_w2v_model_CBOW.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/TrainedWord2vecModel/6_projects_w2v_model_CBOW.zip -------------------------------------------------------------------------------- /Data/VulnerabilityData/6_projects_functions.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/6_projects_functions.zip -------------------------------------------------------------------------------- /Data/VulnerabilityData/Sample data Info.txt: -------------------------------------------------------------------------------- 1 | The .pkl files are Python binaries created by pickle which is a Python module. 2 | 3 | These .pkl files are for the experiments on FFmpeg project. 4 | 5 | The experiments are based on the scenario where there are some labeled historical projects available: 6 | 7 | 1. The except_ffmpeg_list.pkl contains the vulnerable and non-vulnerable functions from the historical projects. 
The except_ffmpeg_list_id.pkl contains the function IDs, which can be used for generating labels (all the vulnerable functions are named with their CVE IDs). 8 | 9 | 2. The ffmpeg_list.pkl contains the functions from the target project (FFmpeg). 10 | -------------------------------------------------------------------------------- /Data/VulnerabilityData/except_ffmpeg_list.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/except_ffmpeg_list.pkl -------------------------------------------------------------------------------- /Data/VulnerabilityData/except_ffmpeg_list_id.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/except_ffmpeg_list_id.pkl -------------------------------------------------------------------------------- /Data/VulnerabilityData/ffmpeg_list.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/ffmpeg_list.pkl -------------------------------------------------------------------------------- /Data/VulnerabilityData/ffmpeg_list_id.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Data/VulnerabilityData/ffmpeg_list_id.pkl -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Transferable Representation Learning 2 | 3 | Hi there, welcome to this page! 
4 | 5 | This page contains the code and data used in the paper [Cross-Project Transfer Representation Learning for Vulnerable Function Discovery](https://ieeexplore.ieee.org/abstract/document/8329207/) by Guanjun Lin; Jun Zhang; Wei Luo; Lei Pan; Yang Xiang; Olivier De Vel and Paul Montague. 6 | 7 | ### Instructions: 8 | 9 | The Vulnerabilities_info.xlsx file contains information about the collected function-level vulnerabilities (it is just a record for reference). These vulnerabilities are from 6 open source projects: [FFmpeg](https://github.com/FFmpeg/FFmpeg), [LibTIFF](https://github.com/vadz/libtiff), [LibPNG](https://github.com/glennrp/libpng), [Pidgin](https://pidgin.im/), [Asterisk](https://www.asterisk.org/get-started) and [VLC Media Player](https://www.videolan.org/vlc/index.html). The vulnerability information was collected from the [National Vulnerability Database (NVD)](https://nvd.nist.gov/) until the end of July 2017. 10 | 11 | ### Requirements for code: 12 | 13 | * [Tensorflow](https://www.tensorflow.org/) 14 | * [Keras](https://github.com/fchollet/keras/tree/master/keras) 15 | * [Scikit-learn](http://scikit-learn.org/stable/) 16 | * [Gensim](https://radimrehurek.com/gensim/) 17 | * Python >= 2.7 18 | 19 | The dependencies can be installed using [Anaconda](https://www.anaconda.com/download/). For example: 20 | 21 | ```bash 22 | $ bash Anaconda3-5.0.1-Linux-x86_64.sh 23 | ``` 24 | 25 | The "Data" folder contains the following sub folders: 26 | 1) VulnerabilityData -- It contains a ZIP file which stores the vulnerable and part of the non-vulnerable functions from 6 open source projects. After unzipping the file, one will find 6 folders named with the projects. Each folder contains the source code of the non-vulnerable functions (named with their function names) and vulnerable functions (named with the CVE IDs): 27 | * The vulnerable functions are all named with the CVE IDs (their names start with ‘cve-’ or ‘CVE-’). 
For example, “cve-2017-14005.c” is a vulnerable function. 28 | * The non-vulnerable functions are named with the format: “xxxx_file_name_function_name.c” to avoid duplicated file/function names. For example, “1374_cmdutils.c_show_devices.c” is a non-vulnerable function. 29 | 30 | In the pre-training phase, one can choose any 5 projects as the historical data for training an LSTM network (the labels can be generated based on the file names: vulnerable functions have the CVE IDs as their file names; please see the code for more details). Then, the remaining 1 project can be used as the input to the pre-trained network for generating representations. Finally, the generated representations can be used as features for training a classifier. 31 | 32 | 2) CodeMetrics -- It stores the code metrics extracted from the source code files of the open source projects. The code metrics are used as features to train a random forest classifier as the baseline to compare with the method which uses transfer-learned representations as features. We used [Understand](https://scitools.com/), which is a commercial code enhancement tool, for extracting function-level code metrics. We included 23 code metrics extracted from the vulnerable functions of 6 projects. 33 | 34 | 3) TrainedTokenizer -- It contains the trained tokenizer file which is used for converting the serialized AST lists to numeric tokens. 35 | 36 | 4) TrainedWord2vecModel -- It includes the trained Word2vec model. The model was trained on the code base of 6 open source projects. The Word2vec model is used in the embedding layer of the LSTM network for converting input sequences to meaningful embeddings. 37 | 38 | The "Code" folder contains the Python code samples. 39 | 1) TransferableRepresentationLearning_LSTM_DNN.py file is for LSTM network training. It defines the structure of the Bi-LSTM network used in the paper. The input of the file is the historical vulnerable functions that have labels. 
The output of the file is a trained LSTM network capable of obtaining vulnerable function representations. 40 | 41 | 2) ExtractLearnedFeaturesAndClassification.py file is for obtaining the function representations from the pre-trained LSTM network. It also includes the code for training a random forest classifier using the obtained function representations as features. 42 | 43 | 3) CodeMetrics.py file is to train a random forest classifier based on the selected 23 code metrics. 44 | 45 | If you are interested in our project, please contact junzhang@swin.edu.au for more information. If you use our code and data in your work, please kindly cite our paper. 46 | 47 | The BibTeX entry: 48 | 49 | ``` 50 | @article{lin2018cross, 51 | title={Cross-Project Transfer Representation Learning for Vulnerable Function Discovery}, 52 | author={Lin, Guanjun and Zhang, Jun and Luo, Wei and Pan, Lei and Xiang, Yang and De Vel, Olivier and Montague, Paul}, 53 | journal={IEEE Transactions on Industrial Informatics}, 54 | year={2018}, 55 | publisher={IEEE} 56 | } 57 | ``` 58 | 59 | Thank you! 60 | -------------------------------------------------------------------------------- /Vulnerabilities_info.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielLin1986/TransferRepresentationLearning/a439acb4f66afec7e5893358f6aeabaaa5b8ab49/Vulnerabilities_info.xlsx --------------------------------------------------------------------------------