├── .gitignore
├── Devign_Reveal_model
│   ├── GNN_explainer_result_analysis.ipynb
│   ├── backbone.py
│   ├── data_loader
│   │   ├── __init__.py
│   │   ├── batch_graph.py
│   │   └── dataset.py
│   ├── data_sampler.py
│   ├── exp.bash
│   ├── exp_latent.sh
│   ├── gnn_explainer.ipynb
│   ├── main.py
│   ├── modules
│   │   ├── __init__.py
│   │   └── model.py
│   ├── my_trainer.py
│   ├── readme.md
│   └── utils.py
├── IVDetect_model
│   ├── gen_graphs.py
│   ├── joern.zip
│   ├── latent_fine_tune.ipynb
│   ├── main.py
│   ├── preprocess.py
│   ├── readme.md
│   ├── shard_splitter.py
│   └── vul_model.py
├── LineVul_model
│   ├── data_splitter.py
│   ├── evaluator
│   │   └── my_evaluator.py
│   ├── exp_latent.sh
│   ├── latent_result_analyser.py
│   ├── lime_explainer-latent.ipynb
│   ├── lime_explainer.ipynb
│   ├── lime_result_analyze.ipynb
│   ├── model.py
│   ├── myrun.py
│   ├── readme.md
│   ├── result_analyser.ipynb
│   └── run.bash
└── readme.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | .DS_Store
3 | .idea

--------------------------------------------------------------------------------
/Devign_Reveal_model/GNN_explainer_result_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 21,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pickle\n",
10 | "with open('gnnexplainer_result/msr_4x_split_0_hop_1.pkl', 'rb') as fp:\n",
11 | "    gnn_explainer_dict = pickle.load(fp)"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import pandas as pd"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 3,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# the test set with vul line information\n",
30 | "df_test = pd.read_json('msr_test_set_with_line.json')"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 4,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "<div>\n",
[output truncated: HTML preview of df_test, 27727 rows × 9 columns: node_features, node_features_sym, graph, original_tokens, symbolic_tokens, targets, token, token_list, line]

--------------------------------------------------------------------------------
/LineVul_model/readme.md:
--------------------------------------------------------------------------------
89 | @inproceedings{fu2022linevul,
90 | title={LineVul: A Transformer-based Line-Level Vulnerability Prediction Approach},
91 | author={Fu, Michael and Tantithamthavorn, Chakkrit},
92 | booktitle={2022 IEEE/ACM 19th International Conference on Mining Software Repositories (MSR)},
93 | year={2022},
94 | organization={IEEE}
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/LineVul_model/result_analyser.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "pycharm": {
8 | "name": "#%%\n"
9 | }
10 | },
11 | "outputs": [],
12 | "source": [
13 | "import logging\n",
14 | "import sys\n",
15 | "import json\n",
16 | "import numpy as np\n",
17 | "from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_curve, auc, confusion_matrix, classification_report\n",
18 | "\n",
19 | "def read_answers(filename):\n",
20 | "    answers = {}  # ground-truth label per example index\n",
21 | "    with open(filename) as f:\n",
22 | "        for line in f:\n",
23 | "            line = line.strip()\n",
24 | "            js = json.loads(line)\n",
25 | "            answers[js['idx']] = js['target']\n",
26 | "    return answers\n",
27 | "\n",
28 | "def read_predictions(filename):\n",
29 | "    predictions = {}  # predicted label per example index\n",
30 | "    with open(filename) as f:\n",
31 | "        for line in f:\n",
32 | "            line = line.strip()\n",
33 | "            idx, label = line.split()\n",
34 | "            predictions[int(idx)] = int(label)\n",
35 | "    return predictions\n",
36 | "\n",
37 | "def read_predictions_prob(filename):\n",
38 | "    predictions_prob = {}  # predicted probability per example index\n",
39 | "    with open(filename) as f:\n",
40 | "        for line in f:\n",
41 | "            line = line.strip()\n",
42 | "            idx, label = line.split()\n",
43 | "            predictions_prob[int(idx)] = float(label)\n",
44 | "    return predictions_prob\n",
45 | "\n",
46 | "def calculate_scores(answers, predictions, predictions_prob):\n",
47 | "    # assumes the three dicts share the same insertion order of indices\n",
48 | "    Acc = []\n",
49 | "    Ans = []\n",
50 | "    Pred = []\n",
51 | "    Pred_prob = []\n",
52 | "    for key in answers:\n",
53 | "        Ans.append(answers[key])\n",
54 | "        if key not in predictions:\n",
55 | "            logging.error(\"Missing prediction for index {}.\".format(key))\n",
56 | "            sys.exit()\n",
57 | "        Acc.append(answers[key] == predictions[key])\n",
58 | "    for key in predictions:\n",
59 | "        Pred.append(predictions[key])\n",
60 | "    for key in predictions_prob:\n",
61 | "        Pred_prob.append(predictions_prob[key])\n",
62 | "    scores = {}\n",
63 | "    results = []  # [auc, accuracy, recall, precision, f1], each in %\n",
64 | "    # scores['acc'] = np.mean(Acc)\n",
65 | "    fpr, tpr, _ = roc_curve(Ans, Pred_prob)\n",
66 | "    results.append(auc(fpr, tpr) * 100)\n",
67 | "    results.append(accuracy_score(Ans, Pred) * 100)\n",
68 | "    results.append(recall_score(Ans, Pred) * 100)\n",
69 | "    results.append(precision_score(Ans, Pred, zero_division=0) * 100)\n",
70 | "    results.append(f1_score(Ans, Pred, zero_division=0) * 100)\n",
71 | "    # rank test examples by predicted probability, highest first\n",
72 | "    zipped_result = zip(Ans, Pred, Pred_prob)\n",
73 | "    sorted_zip = sorted(zipped_result, key=lambda x: x[2], reverse=True)\n",
74 | "    print(confusion_matrix(Ans, Pred))\n",
75 | "    # print('auc\\t', auc(fpr, tpr))\n",
76 | "    # print('acc\\t', accuracy_score(Ans, Pred))\n",
77 | "    # print('f1\\t', f1_score(Ans, Pred))\n",
78 | "    print(results)\n",
79 | "    return results, sorted_zip, Pred_prob\n"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 2,
85 | "metadata": {
86 | "pycharm": {
87 | "name": "#%%\n"
88 | }
89 | },
90 | "outputs": [],
91 | "source": [
92 | "def read_output(test_dir, result_dir):\n",
93 | "    answers = read_answers(test_dir + 'test.jsonl')\n",
94 | "    predictions = read_predictions(result_dir + 'predictions.txt')\n",
95 | "    predictions_prob = read_predictions_prob(result_dir + 'predictions_prob.txt')\n",
96 | "    scores, sorted_zip, Pred_prob = calculate_scores(answers, predictions, predictions_prob)\n",
97 | "    return scores, sorted_zip, Pred_prob"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 3,
103 | "metadata": {
104 | "pycharm": {
105 | "name": "#%%\n"
106 | }
107 | },
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "msr_output/ros_msr/saved_models_0/\n",
114 | "[[34518 1005]\n",
115 | " [ 1384 821]]\n",
116 | "[76.60132239704429, 93.66783290924512, 37.23356009070295, 44.961664841182916, 40.734309104440584]\n",
117 | "msr_output/ros_msr/saved_models_1/\n",
118 | "[[34550 952]\n",
119 | " [ 1402 824]]\n",
120 | "[80.29656200986969, 93.76060220525869, 37.01707097933513, 46.3963963963964, 41.17941029485258]\n",
121 | "msr_output/ros_msr/saved_models_2/\n",
122 | "[[34610 964]\n",
123 | " [ 1389 765]]\n",
124 | "[74.40449763551453, 93.76325275657337, 35.515320334261844, 44.24522845575477, 39.40252382178728]\n",
125 | "msr_output/ros_msr/saved_models_3/\n",
126 | "[[34615 941]\n",
127 | " [ 1361 811]]\n",
128 | "[80.82965498670217, 93.89843087362172, 37.338858195211785, 46.289954337899545, 41.335372069317025]\n",
129 | "msr_output/ros_msr/saved_models_4/\n",
130 | "[[34665 864]\n",
131 | " [ 1394 805]]\n",
132 | "[70.99250026920473, 94.01505513146734, 36.607548885857206, 48.23247453565009, 41.623578076525334]\n",
133 | "msr_output/ros_msr/saved_models_5/\n",
134 | "[[34676 888]\n",
135 | " [ 1400 764]]\n",
136 | "[74.2297515858006, 93.93553859202714, 35.304990757855826, 46.246973365617436, 40.04192872117401]\n",
137 | "msr_output/ros_msr/saved_models_6/\n",
138 | "[[34531 1004]\n",
139 | " [ 1343 850]]\n",
140 | "[75.72716276015676, 93.7791560644614, 38.759689922480625, 45.84681769147789, 42.006424511984186]\n",
141 | "msr_output/ros_msr/saved_models_7/\n",
142 | "[[34515 1041]\n",
143 | " [ 1382 790]]\n",
144 | "[70.13264954181166, 93.57771416454622, 36.37200736648251, 43.14582195521573, 39.470397202098425]\n",
145 | "msr_output/ros_msr/saved_models_8/\n",
146 | "[[34619 950]\n",
147 | " [ 1387 772]]\n",
148 | "[74.38341665790833, 93.80566157760815, 35.75729504400185, 44.83159117305458, 39.78356093790259]\n",
149 | "msr_output/ros_msr/saved_models_9/\n",
150 | "[[34570 961]\n",
151 | " [ 1394 803]]\n",
152 | "[74.86113436019835, 93.75795165394402, 36.54984069185252, 45.52154195011338, 40.54531683918202]\n",
153 | "msr_output/ros_msr/saved_models_10/\n",
154 | "[[34602 968]\n",
155 | " [ 1334 824]]\n",
156 | "[78.09118049152124, 93.89843087362172, 38.18350324374421, 45.982142857142854, 41.721518987341774]\n",
157 | "msr_output/ros_msr/saved_models_11/\n",
158 | "[[34695 932]\n",
159 | " [ 1346 755]]\n",
160 | "[60.05300877286018, 93.96204410517387, 35.93526891956211, 44.75400118553646, 39.86272439281943]\n",
161 | "msr_output/ros_msr/saved_models_12/\n",
162 | "[[34671 945]\n",
163 | " [ 1374 738]]\n",
164 | "[76.57451326884922, 93.85337150127226, 34.94318181818182, 43.85026737967914, 38.89328063241107]\n",
165 | "msr_output/ros_msr/saved_models_13/\n",
166 | "[[34653 935]\n",
167 | " [ 1353 787]]\n",
168 | "[74.31879865522244, 93.93553859202714, 36.77570093457944, 45.70267131242741, 40.75608493008804]\n",
169 | "msr_output/ros_msr/saved_models_14/\n",
170 | "[[34569 942]\n",
171 | " [ 1378 839]]\n",
172 | "[76.4742981098934, 93.85072094995759, 37.843933243121334, 47.10836608646828, 41.97098549274638]\n",
173 | "msr_output/ros_msr/saved_models_15/\n",
174 | "[[34560 960]\n",
175 | " [ 1384 824]]\n",
176 | "[79.01079408212561, 93.78710771840542, 37.31884057971014, 46.18834080717489, 41.28256513026052]\n",
177 | "msr_output/ros_msr/saved_models_16/\n",
178 | "[[34574 994]\n",
179 | " [ 1364 796]]\n",
180 | "[68.79657874952099, 93.75, 36.851851851851855, 44.46927374301676, 40.30379746835443]\n",
181 | "msr_output/ros_msr/saved_models_17/\n",
182 | "[[34658 964]\n",
183 | " [ 1287 819]]\n",
184 | "[77.0780650667612, 94.03360899067006, 38.88888888888889, 45.93381940549636, 42.11879660581126]\n",
185 | "msr_output/ros_msr/saved_models_18/\n",
186 | "[[34730 897]\n",
187 | " [ 1373 728]]\n",
188 | "[72.78113745214627, 93.98324851569126, 34.65016658733936, 44.800000000000004, 39.07675791733764]\n"
189 | ]
190 | }
191 | ],
192 | "source": [
193 | "result_list = []\n",
194 | "for i in range(0, 19):\n",
195 | "    try:\n",
196 | "        test_dir = f'../msr_dataset/ros_msr/data_split_{i}/'\n",
197 | "        result_dir = f'msr_output/ros_msr/saved_models_{i}/'\n",
198 | "        print(result_dir)\n",
199 | "        result, sorted_zip, pred_prob = read_output(test_dir, result_dir)\n",
200 | "        # import pickle\n",
201 | "        # pickle.dump(sorted_zip, open(f'../../metrics_exp/data/Codebert_model/reveal/oss/{i}_zip_ans_pred_prob.pkl', 'wb'))\n",
202 | "        result_list.append(result)  # collected for the summary DataFrame below\n",
203 | "    except Exception as e:\n",
204 | "        print('error', i, e)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 3,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stdout",
214 | "output_type": "stream",
215 | "text": [
216 | "[[34612 911]\n",
217 | " [ 1380 825]]\n",
218 | "[70.96231479295169, 93.92758693808312, 37.41496598639456, 47.52304147465438, 41.86754630804364]\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "i = 0\n",
224 | "test_dir = f'../msr_dataset/origin/data_split_{i}/'\n",
225 | "result_dir = 'msr_output/ros_4x/'\n",
226 | "result, sorted_zip, pred_prob_2 = read_output(test_dir, result_dir)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "import seaborn as sns\n",
236 | "# distribution of predicted probabilities (from the last processed split)\n",
237 | "sns.displot(pred_prob, kde=False, bins=100, color='blue')"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "import seaborn as sns\n",
247 | "# distribution of predicted probabilities for the ros_4x run\n",
248 | "sns.displot(pred_prob_2, kde=False, bins=100, color='blue')"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {
255 | "pycharm": {
256 | "name": "#%%\n"
257 | }
258 | },
259 | "outputs": [],
260 | "source": [
261 | "import pandas as pd\n",
262 | "df = pd.DataFrame(result_list, columns=['auc', 'acc', 'recall', 'precision', 'f1'])"
263 | ]
264 | },
265 | {
266 | "cell_type": "raw",
267 | "metadata": {
268 | "pycharm": {
269 | "name": "#%%\n"
270 | }
271 | },
272 | "source": [
273 | "# persist the ranked (answer, prediction, probability) triples;\n",
274 | "# sorted_zip is returned by the read_output call above\n",
275 | "import pickle\n",
276 | "pickle.dump(sorted_zip, open(f'{result_dir}zip_ans_pred_prob.pkl', 'wb'))"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": []
285 | }
286 | ],
287 | "metadata": {
288 | "kernelspec": {
289 | "display_name": "transformer_env",
290 | "language": "python",
291 | "name": "transformer_env"
292 | },
293 | "language_info": {
294 | "codemirror_mode": {
295 | "name": "ipython",
296 | "version": 3
297 | },
298 | "file_extension": ".py",
299 | "mimetype": "text/x-python",
300 | "name": "python",
301 | "nbconvert_exporter": "python",
302 | "pygments_lexer": "ipython3",
303 | "version": "3.8.8"
304 | }
305 | },
306 | "nbformat": 4,
307 | "nbformat_minor": 4
308 | }
309 |
--------------------------------------------------------------------------------
/LineVul_model/run.bash:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0,1 python myrun.py \
2 |     --output_dir=./msr_output/ros_2x \
3 |     --model_type=roberta \
4 |     --tokenizer_name=microsoft/codebert-base \
5 |     --model_name_or_path=microsoft/codebert-base \
6 |     --do_train \
7 |     --do_test \
8 |     --train_data_file=../msr_dataset/ros_2x/train.jsonl \
9 |     --test_data_file=../msr_dataset/origin/data_split_0/test.jsonl \
10 |     --epoch 4 \
11 |     --block_size 400 \
12 |     --train_batch_size 64 \
13 |     --eval_batch_size 32 \
14 |     --learning_rate 5e-5 \
15 |     --max_grad_norm 1.0
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # DataSampling4DLVD
2 | This is the official replication repository for our paper
3 | > *Does data sampling improve deep learning-based
4 | vulnerability detection? Yeas! and Nays!*
5 |
6 | ## 0 Datasets
7 | ### 0.1 Processed dataset dumps available on Zenodo
8 | Our processed datasets are archived on Zenodo:
9 |
10 | [](https://doi.org/10.5281/zenodo.7057996)
11 |
12 |
13 | ### 0.2 Original model repositories
14 | This repo contains three models, each developed from its officially released GitHub repo:
15 | 1. [IVDetect](https://github.com/vulnerabilitydetection/VulnerabilityDetectionResearch)
16 | 2. [Reveal](https://github.com/VulDetProject/ReVeal)
17 | 3. [LineVul/codebert](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection)
18 |
19 | We thank these researchers for their hard work.
20 | ## 1 Models
21 | The repository consists of three model replication folders. The required packages for each are listed below, together with short reference sketches of the sampling and explanation tooling.
22 | ### 1.1 IVDetect_model
23 | Packages:
24 | 1. PyTorch
25 | 2. PyTorch Geometric
26 | 3. imblearn
27 | 4. scikit-learn
28 | 5. gensim
29 | 6. nni
30 | ### 1.2 Devign_Reveal_model
31 | > The **Devign** model also lives in this folder: Devign's authors did not open-source their code, so our Devign implementation is based on the replication written by ReVeal's authors.
32 |
33 | Packages:
34 | 1. PyTorch
35 | 2. DGL (includes the GNNExplainer implementation used for XAI; see the sketch below)
36 | 3. imblearn
37 | 4. scikit-learn
38 |
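For reference, here is a minimal sketch of applying DGL's `GNNExplainer` to a graph classifier. The model, graph, and features below are toy stand-ins, not objects from this repo; note that DGL requires the explained model's `forward` to accept `(graph, feat)` plus an optional `eweight` argument:

```python
import dgl
import torch
import torch.nn as nn
from dgl.nn import GraphConv
from dgl.nn.pytorch.explain import GNNExplainer

# Toy 2-class graph classifier; GNNExplainer expects forward(graph, feat, eweight=None).
class ToyClassifier(nn.Module):
    def __init__(self, in_dim=16, hid=32, n_classes=2):
        super().__init__()
        self.conv = GraphConv(in_dim, hid)
        self.fc = nn.Linear(hid, n_classes)

    def forward(self, graph, feat, eweight=None):
        h = torch.relu(self.conv(graph, feat, edge_weight=eweight))
        with graph.local_scope():
            graph.ndata['h'] = h
            hg = dgl.mean_nodes(graph, 'h')  # graph-level readout
        return self.fc(hg)

# A random stand-in for one code graph from the test set.
g = dgl.add_self_loop(dgl.rand_graph(8, 20))
feat = torch.randn(8, 16)

explainer = GNNExplainer(ToyClassifier(), num_hops=1)  # 1-hop, as in the *_hop_1.pkl result files
feat_mask, edge_mask = explainer.explain_graph(g, feat)

# edge_mask scores every edge; the top-scoring edges are the graph
# connections that most influenced the (non-)vulnerable prediction.
print(torch.topk(edge_mask, k=5).indices)
```
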
39 | ### 1.3 LineVul_model
40 | Packages:
41 | 1. PyTorch
42 | 2. Transformers (by Hugging Face)
43 | 3. LIME (if you want to use the LIME XAI tool)
44 | 4. TensorFlow
45 | 5. imblearn (see the sampling sketch below)
46 | 6. scikit-learn
47 |
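All three models rely on `imblearn` for the data sampling step. As a minimal reference sketch (the `codes`/`labels` toy data below are hypothetical; the real sampling happens inside each model's data pipeline), random oversampling (ROS) can be applied to function-level data like this:

```python
import numpy as np
from imblearn.over_sampling import RandomOverSampler

# Toy stand-ins: one function per entry, label 1 = vulnerable, 0 = clean.
codes = ["int f(){...}", "void g(){...}", "char *h(){...}", "int k(){...}"]
labels = [1, 0, 0, 0]

# imblearn expects a 2-D X, so oversample row indices instead of raw strings.
idx = np.arange(len(codes)).reshape(-1, 1)
ros = RandomOverSampler(random_state=42)  # default: duplicate the minority class until balanced
idx_res, labels_res = ros.fit_resample(idx, labels)

codes_res = [codes[i] for i in idx_res.ravel()]
print(len(codes_res), int(np.sum(labels_res)))  # 6 3: minority examples duplicated
```

The `ros_2x`/`ros_4x` names used elsewhere in this repo suggest fixed oversampling ratios rather than full balancing; with imblearn that would be controlled through the `sampling_strategy` argument.
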
48 | ## 2 Datasets
49 | We use three datasets in our experiments. Only the links to the **raw** datasets are given here; the **processed** datasets (ready for use as model input) are provided on Zenodo (see Section 0.1).
50 | 1. [Devign](https://drive.google.com/file/d/1x6hoF7G-tSYxg8AFybggypLZgMGDNHfF/view) dataset
51 |     1. For more details about the Devign dataset, see [Devign's official webpage](https://sites.google.com/view/devign).
52 | 2. [Reveal](https://drive.google.com/drive/folders/1KuIYgFcvWUXheDhT--cBALsfy1I4utOy) dataset
53 |     1. For more details about the ReVeal dataset, see the [ReVeal GitHub repo](https://github.com/VulDetProject/ReVeal).
54 | 3. [BigVul](https://drive.google.com/file/d/1-0VhnHBp9IGh90s2wCNjeCMuy70HPl8X/view) dataset
55 |     1. We use a cleaned version of BigVul; the original dataset contains much more information to digest. We suggest researchers check the official [BigVul](https://github.com/ZeoVan/MSR_20_Code_vulnerability_CSV_Dataset) repo.
56 |
57 | ## 3 Interpretability Tools
58 | We provide the code/Jupyter notebooks used in our RQs and discussion for future study:
59 | 1. **LIME** is in the LineVul_model folder (see the sketch below)
60 | 2. **GNNExplainer** is in the Devign_Reveal_model folder
61 |
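As a reference for the LIME part, here is a minimal sketch of explaining one function with `lime`'s text explainer. The `predict_proba` wrapper below returns dummy probabilities; a real wrapper around the trained LineVul/CodeBERT classifier would tokenize each code string, run the model, and return an `(n, 2)` array of class probabilities:

```python
import numpy as np
from lime.lime_text import LimeTextExplainer

def predict_proba(code_batch):
    # Hypothetical stand-in for the trained classifier: returns
    # [p(clean), p(vulnerable)] for every code string in the batch.
    rng = np.random.default_rng(0)
    p = rng.uniform(size=(len(code_batch), 1))
    return np.hstack([1 - p, p])

explainer = LimeTextExplainer(class_names=["clean", "vulnerable"])
exp = explainer.explain_instance(
    "memcpy(dst, src, len);",  # the function (or line) under inspection
    predict_proba,
    num_features=5,  # report the 5 most influential tokens
)
print(exp.as_list())  # (token, weight) pairs; positive weights push toward "vulnerable"
```
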
62 |
63 | ## 4 The whole pipeline
64 | The readme in each model folder describes how to run the experiment end to end: **raw data -> processed data -> model training -> evaluation**.
--------------------------------------------------------------------------------