├── .gitignore ├── Devign_Reveal_model ├── GNN_explainer_result_analysis.ipynb ├── backbone.py ├── data_loader │ ├── __init__.py │ ├── batch_graph.py │ └── dataset.py ├── data_sampler.py ├── exp.bash ├── exp_latent.sh ├── gnn_explainer.ipynb ├── main.py ├── modules │ ├── __init__.py │ └── model.py ├── my_trainer.py ├── readme.md └── utils.py ├── IVDetect_model ├── gen_graphs.py ├── joern.zip ├── latent_fine_tune.ipynb ├── main.py ├── preprocess.py ├── readme.md ├── shard_splitter.py └── vul_model.py ├── LineVul_model ├── data_splitter.py ├── evaluator │ └── my_evaluator.py ├── exp_latent.sh ├── latent_result_analyser.py ├── lime_explainer-latent.ipynb ├── lime_explainer.ipynb ├── lime_result_analyze.ipynb ├── model.py ├── myrun.py ├── readme.md ├── result_analyser.ipynb └── run.bash └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | .idea -------------------------------------------------------------------------------- /Devign_Reveal_model/GNN_explainer_result_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 21, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pickle\n", 10 | "with open('gnnexplainer_result/msr_4x_split_0_hop_1.pkl', 'rb') as fp:\n", 11 | " gnn_explainer_dict = pickle.load(fp)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# the test set with vul line information\n", 30 | "df_test = pd.read_json('msr_test_set_with_line.json')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
node_featuresnode_features_symgraphoriginal_tokenssymbolic_tokenstargetstokentoken_listline
0[[-1.0918805779, -0.0600933482, -1.3017879377,...[[-0.0134278598, -0.44801456980000004, -0.0053...[[0, 2, 25], [25, 3, 0], [0, 2, 2], [2, 3, 0],...[[static, void, usb_net_handle_dataout, (, USB...[[static, void, FUNC1, (, VAR1, *, VAR2, ,, VA...[[0]][static, void, usb_net_handle_dataout, (, USBN...static,void,usb_net_handle_dataout,(,USBNetSta...None
1[[0.5474294862, 0.32906730910000004, -1.650767...[[0.1319378658, -0.397455169, -0.0868625781, -...[[0, 2, 2], [2, 3, 0], [0, 2, 4], [4, 3, 0], [...[[static, int, do_flock, (, struct, file, *, f...[[static, int, FUNC1, (, struct, VAR1, *, VAR1...[[0]][static, int, do_flock, (, struct, file, *, fi...static,int,do_flock,(,struct,file,*,file,,,int...None
2[[-0.7882826924, -0.54334445, -1.1545437828, 0...[[-0.3183312699, -0.24551808390000002, -0.0939...[[0, 2, 8], [8, 3, 0], [0, 2, 9], [9, 3, 0]][[void, AddSystemStrings, (, content, ::, WebU...[[void, FUNC1, (, VAR1, :, VAR2, *, VAR3, ), {...[[0]][void, AddSystemStrings, (, content, ::, WebUI...void,AddSystemStrings,(,content,::,WebUIDataSo...None
3[[-0.4252009115, -1.3483376971, -0.8432620365,...[[0.0869873634, -0.3488178274, -0.1156845348, ...[[0, 2, 3], [3, 3, 0], [0, 2, 9], [9, 3, 0], [...[[SMB2_sess_alloc_buffer, (, struct, SMB2_sess...[[FUNC1, (, struct, VAR1, *, VAR2, )], [{], [i...[[0]][SMB2_sess_alloc_buffer, (, struct, SMB2_sess_...SMB2_sess_alloc_buffer,(,struct,SMB2_sess_data...None
4[[1.1260667245, 0.1808472077, 0.03605219720000...[[0.2014318407, -0.2869973059, 0.1425949335, -...[][[AutofillPopupWarningView, (, AutofillPopupVi...[[FUNC1, (, VAR1, *, VAR2, ,], [int, VAR1, )],...[[0]][AutofillPopupWarningView, (, AutofillPopupVie...AutofillPopupWarningView,(,AutofillPopupViewNa...None
..............................
27722[[-0.8682729341000001, -1.2759647071, -1.54760...[[-0.32964309750000004, -0.0687663168, 0.01822...[[1, 2, 3], [3, 3, 1], [2, 2, 4], [4, 3, 2]][[int, GetLastYear, (), {], [Time, last_year_t...[[int, FUNC1, (, ), {], [VAR1, VAR2, =, VAR1, ...[[0]][int, GetLastYear, (), {, Time, last_year_time...int,GetLastYear,(),{,Time,last_year_time,=,Tim...None
27723[[1.5330900732, 0.7245404899, 0.2295006856, -0...[[0.2159726471, -0.29889355900000003, 0.093106...[[0, 2, 3], [3, 3, 0], [0, 2, 19], [19, 3, 0],...[[gpk_compute_crycks, (, sc_card_t, *, card, ,...[[FUNC1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4, ,...[[0]][gpk_compute_crycks, (, sc_card_t, *, card, ,,...gpk_compute_crycks,(,sc_card_t,*,card,,,sc_apd...None
27724[[1.3671128926, 0.8307209481000001, -1.3632551...[[0.10021117140000001, -0.4366644649, 0.025586...[[0, 2, 5], [5, 3, 0], [0, 2, 7], [7, 3, 0], [...[[static, int, asepcos_set_sec_attributes, (, ...[[static, int, FUNC1, (, VAR1, *, VAR2, ,, con...[[0]][static, int, asepcos_set_sec_attributes, (, s...static,int,asepcos_set_sec_attributes,(,sc_car...None
27725[[0.24357717040000001, 0.1453775644, 0.7804953...[[0.1413379952, -0.2888208061, 0.0209930599, -...[[0, 2, 14], [14, 3, 0], [0, 2, 36], [36, 3, 0...[[after_select, (, fd_set, *, readset, ,, fd_s...[[FUNC1, (, VAR1, *, VAR2, ,, VAR1, *, VAR3, )...[[0]][after_select, (, fd_set, *, readset, ,, fd_se...after_select,(,fd_set,*,readset,,,fd_set,*,wri...None
27726[[1.4474659761, -1.2718610279, -2.2667120596, ...[[-0.0782345608, -0.6810032055, -0.3169494768,...[[0, 2, 2], [2, 3, 0]][[ext4_xattr_create_cache, (, char, *, name, )...[[FUNC1, (, char, *, VAR1, )], [{], [return, F...[[1]][ext4_xattr_create_cache, (, char, *, name, ),...ext4_xattr_create_cache,(,char,*,name,),{,retu...\\treturn mb_cache_create(name, HASH_BUCKET_BIT...
\n", 205 | "

27727 rows × 9 columns

\n", 206 | "
" 207 | ], 208 | "text/plain": [ 209 | " node_features \\\n", 210 | "0 [[-1.0918805779, -0.0600933482, -1.3017879377,... \n", 211 | "1 [[0.5474294862, 0.32906730910000004, -1.650767... \n", 212 | "2 [[-0.7882826924, -0.54334445, -1.1545437828, 0... \n", 213 | "3 [[-0.4252009115, -1.3483376971, -0.8432620365,... \n", 214 | "4 [[1.1260667245, 0.1808472077, 0.03605219720000... \n", 215 | "... ... \n", 216 | "27722 [[-0.8682729341000001, -1.2759647071, -1.54760... \n", 217 | "27723 [[1.5330900732, 0.7245404899, 0.2295006856, -0... \n", 218 | "27724 [[1.3671128926, 0.8307209481000001, -1.3632551... \n", 219 | "27725 [[0.24357717040000001, 0.1453775644, 0.7804953... \n", 220 | "27726 [[1.4474659761, -1.2718610279, -2.2667120596, ... \n", 221 | "\n", 222 | " node_features_sym \\\n", 223 | "0 [[-0.0134278598, -0.44801456980000004, -0.0053... \n", 224 | "1 [[0.1319378658, -0.397455169, -0.0868625781, -... \n", 225 | "2 [[-0.3183312699, -0.24551808390000002, -0.0939... \n", 226 | "3 [[0.0869873634, -0.3488178274, -0.1156845348, ... \n", 227 | "4 [[0.2014318407, -0.2869973059, 0.1425949335, -... \n", 228 | "... ... \n", 229 | "27722 [[-0.32964309750000004, -0.0687663168, 0.01822... \n", 230 | "27723 [[0.2159726471, -0.29889355900000003, 0.093106... \n", 231 | "27724 [[0.10021117140000001, -0.4366644649, 0.025586... \n", 232 | "27725 [[0.1413379952, -0.2888208061, 0.0209930599, -... \n", 233 | "27726 [[-0.0782345608, -0.6810032055, -0.3169494768,... \n", 234 | "\n", 235 | " graph \\\n", 236 | "0 [[0, 2, 25], [25, 3, 0], [0, 2, 2], [2, 3, 0],... \n", 237 | "1 [[0, 2, 2], [2, 3, 0], [0, 2, 4], [4, 3, 0], [... \n", 238 | "2 [[0, 2, 8], [8, 3, 0], [0, 2, 9], [9, 3, 0]] \n", 239 | "3 [[0, 2, 3], [3, 3, 0], [0, 2, 9], [9, 3, 0], [... \n", 240 | "4 [] \n", 241 | "... ... \n", 242 | "27722 [[1, 2, 3], [3, 3, 1], [2, 2, 4], [4, 3, 2]] \n", 243 | "27723 [[0, 2, 3], [3, 3, 0], [0, 2, 19], [19, 3, 0],... \n", 244 | "27724 [[0, 2, 5], [5, 3, 0], [0, 2, 7], [7, 3, 0], [... 
\n", 245 | "27725 [[0, 2, 14], [14, 3, 0], [0, 2, 36], [36, 3, 0... \n", 246 | "27726 [[0, 2, 2], [2, 3, 0]] \n", 247 | "\n", 248 | " original_tokens \\\n", 249 | "0 [[static, void, usb_net_handle_dataout, (, USB... \n", 250 | "1 [[static, int, do_flock, (, struct, file, *, f... \n", 251 | "2 [[void, AddSystemStrings, (, content, ::, WebU... \n", 252 | "3 [[SMB2_sess_alloc_buffer, (, struct, SMB2_sess... \n", 253 | "4 [[AutofillPopupWarningView, (, AutofillPopupVi... \n", 254 | "... ... \n", 255 | "27722 [[int, GetLastYear, (), {], [Time, last_year_t... \n", 256 | "27723 [[gpk_compute_crycks, (, sc_card_t, *, card, ,... \n", 257 | "27724 [[static, int, asepcos_set_sec_attributes, (, ... \n", 258 | "27725 [[after_select, (, fd_set, *, readset, ,, fd_s... \n", 259 | "27726 [[ext4_xattr_create_cache, (, char, *, name, )... \n", 260 | "\n", 261 | " symbolic_tokens targets \\\n", 262 | "0 [[static, void, FUNC1, (, VAR1, *, VAR2, ,, VA... [[0]] \n", 263 | "1 [[static, int, FUNC1, (, struct, VAR1, *, VAR1... [[0]] \n", 264 | "2 [[void, FUNC1, (, VAR1, :, VAR2, *, VAR3, ), {... [[0]] \n", 265 | "3 [[FUNC1, (, struct, VAR1, *, VAR2, )], [{], [i... [[0]] \n", 266 | "4 [[FUNC1, (, VAR1, *, VAR2, ,], [int, VAR1, )],... [[0]] \n", 267 | "... ... ... \n", 268 | "27722 [[int, FUNC1, (, ), {], [VAR1, VAR2, =, VAR1, ... [[0]] \n", 269 | "27723 [[FUNC1, (, VAR1, *, VAR2, ,, VAR3, *, VAR4, ,... [[0]] \n", 270 | "27724 [[static, int, FUNC1, (, VAR1, *, VAR2, ,, con... [[0]] \n", 271 | "27725 [[FUNC1, (, VAR1, *, VAR2, ,, VAR1, *, VAR3, )... [[0]] \n", 272 | "27726 [[FUNC1, (, char, *, VAR1, )], [{], [return, F... [[1]] \n", 273 | "\n", 274 | " token \\\n", 275 | "0 [static, void, usb_net_handle_dataout, (, USBN... \n", 276 | "1 [static, int, do_flock, (, struct, file, *, fi... \n", 277 | "2 [void, AddSystemStrings, (, content, ::, WebUI... \n", 278 | "3 [SMB2_sess_alloc_buffer, (, struct, SMB2_sess_... \n", 279 | "4 [AutofillPopupWarningView, (, AutofillPopupVie... \n", 280 | "... ... 
\n", 281 | "27722 [int, GetLastYear, (), {, Time, last_year_time... \n", 282 | "27723 [gpk_compute_crycks, (, sc_card_t, *, card, ,,... \n", 283 | "27724 [static, int, asepcos_set_sec_attributes, (, s... \n", 284 | "27725 [after_select, (, fd_set, *, readset, ,, fd_se... \n", 285 | "27726 [ext4_xattr_create_cache, (, char, *, name, ),... \n", 286 | "\n", 287 | " token_list \\\n", 288 | "0 static,void,usb_net_handle_dataout,(,USBNetSta... \n", 289 | "1 static,int,do_flock,(,struct,file,*,file,,,int... \n", 290 | "2 void,AddSystemStrings,(,content,::,WebUIDataSo... \n", 291 | "3 SMB2_sess_alloc_buffer,(,struct,SMB2_sess_data... \n", 292 | "4 AutofillPopupWarningView,(,AutofillPopupViewNa... \n", 293 | "... ... \n", 294 | "27722 int,GetLastYear,(),{,Time,last_year_time,=,Tim... \n", 295 | "27723 gpk_compute_crycks,(,sc_card_t,*,card,,,sc_apd... \n", 296 | "27724 static,int,asepcos_set_sec_attributes,(,sc_car... \n", 297 | "27725 after_select,(,fd_set,*,readset,,,fd_set,*,wri... \n", 298 | "27726 ext4_xattr_create_cache,(,char,*,name,),{,retu... \n", 299 | "\n", 300 | " line \n", 301 | "0 None \n", 302 | "1 None \n", 303 | "2 None \n", 304 | "3 None \n", 305 | "4 None \n", 306 | "... ... \n", 307 | "27722 None \n", 308 | "27723 None \n", 309 | "27724 None \n", 310 | "27725 None \n", 311 | "27726 \\treturn mb_cache_create(name, HASH_BUCKET_BIT... 
\n", 312 | "\n", 313 | "[27727 rows x 9 columns]" 314 | ] 315 | }, 316 | "execution_count": 4, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "df_test" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 38, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "hit = 0\n", 332 | "counter = 0\n", 333 | "for index, exp_line_list in gnn_explainer_dict.items():\n", 334 | "# print(index,line_list)\n", 335 | " vul_line = df_test.loc[index].line\n", 336 | " # top k, change k to 1,3,5,10 etc\n", 337 | " exp_line_list = exp_line_list[:10]\n", 338 | "# print(exp_line_list)\n", 339 | " if vul_line != None:\n", 340 | "# print(vul_line)\n", 341 | " found = False\n", 342 | " counter += 1\n", 343 | " origin_token_list = df_test.loc[index].original_tokens\n", 344 | " for exp_line in exp_line_list:\n", 345 | " exp_tokens = origin_token_list[exp_line]\n", 346 | " for exp_token in exp_tokens:\n", 347 | " if len(exp_token) > 1:\n", 348 | " if exp_token in vul_line:\n", 349 | "# print(exp_tokens)\n", 350 | "# print(vul_line)\n", 351 | " hit += 1\n", 352 | " found = True\n", 353 | " break\n", 354 | " if found :\n", 355 | " break\n", 356 | "# break" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 39, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "248" 368 | ] 369 | }, 370 | "execution_count": 39, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "hit" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 40, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/plain": [ 387 | "264" 388 | ] 389 | }, 390 | "execution_count": 40, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "counter" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 41, 402 | 
"metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "0.9393939393939394" 408 | ] 409 | }, 410 | "execution_count": 41, 411 | "metadata": {}, 412 | "output_type": "execute_result" 413 | } 414 | ], 415 | "source": [ 416 | "hit/counter" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "gnn_explainer_dict" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [] 434 | } 435 | ], 436 | "metadata": { 437 | "kernelspec": { 438 | "display_name": "transformer_env", 439 | "language": "python", 440 | "name": "transformer_env" 441 | }, 442 | "language_info": { 443 | "codemirror_mode": { 444 | "name": "ipython", 445 | "version": 3 446 | }, 447 | "file_extension": ".py", 448 | "mimetype": "text/x-python", 449 | "name": "python", 450 | "nbconvert_exporter": "python", 451 | "pygments_lexer": "ipython3", 452 | "version": "3.8.8" 453 | } 454 | }, 455 | "nbformat": 4, 456 | "nbformat_minor": 4 457 | } -------------------------------------------------------------------------------- /Devign_Reveal_model/backbone.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from cmath import log 3 | import os 4 | import pickle 5 | import sys 6 | from data_loader.batch_graph import GGNNBatchGraph, BatchGraph 7 | from dgl.nn import GatedGraphConv 8 | from torch import nn 9 | import torch.nn.functional as f 10 | import copy 11 | import numpy as np 12 | import torch 13 | from torch.nn import BCELoss 14 | from torch.optim import Adam 15 | 16 | from data_loader.dataset import DataSet 17 | from modules.model import DevignModel, GGNNSum 18 | # from trainer import train 19 | from my_trainer import my_train 20 | from utils import tally_param, debug 21 | import logging 22 | from tqdm import tqdm 23 | from imblearn.under_sampling 
import RandomUnderSampler 24 | from imblearn.over_sampling import RandomOverSampler 25 | from imblearn.under_sampling import OneSidedSelection 26 | from imblearn.over_sampling import SMOTE 27 | from sklearn.neural_network import MLPClassifier 28 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_curve, auc 29 | 30 | 31 | class GGNNSum_MLP(nn.Module): 32 | def __init__(self, backbone): 33 | super(GGNNSum_MLP, self).__init__() 34 | self.inp_dim = backbone.inp_dim 35 | self.out_dim = backbone.out_dim 36 | self.max_edge_types = backbone.max_edge_types 37 | self.num_timesteps = backbone.num_timesteps 38 | self.ggnn = backbone.ggnn 39 | self.classifier = backbone.classifier 40 | 41 | # self.sigmoid = nn.Sigmoid() 42 | 43 | def forward(self, batch, device): 44 | graph, features, edge_types = batch.get_network_inputs(cuda=True, device=device) 45 | graph = graph.to(device) 46 | features = features.to(device) 47 | edge_types = edge_types.to(device) 48 | outputs = self.ggnn(graph, features, edge_types) 49 | h_i, _ = batch.de_batchify_graphs(outputs) 50 | return h_i.sum(dim=1) 51 | 52 | 53 | # ggnn_sum = self.classifier(h_i.sum(dim=1)) 54 | # result = self.sigmoid(ggnn_sum).squeeze(dim=-1) 55 | # return ggnn_sum 56 | 57 | class DevignModel_MLP(nn.Module): 58 | def __init__(self, backbone): 59 | super(DevignModel_MLP, self).__init__() 60 | self.inp_dim = backbone.inp_dim 61 | self.out_dim = backbone.out_dim 62 | self.max_edge_types = backbone.max_edge_types 63 | self.num_timesteps = backbone.num_timesteps 64 | self.ggnn = backbone.ggnn 65 | self.conv_l1 = backbone.conv_l1 66 | self.maxpool1 = backbone.maxpool1 67 | self.conv_l2 = backbone.conv_l2 68 | self.maxpool2 = backbone.maxpool2 69 | 70 | self.concat_dim = backbone.concat_dim 71 | self.conv_l1_for_concat = backbone.conv_l1_for_concat 72 | self.maxpool1_for_concat = backbone.maxpool1_for_concat 73 | self.conv_l2_for_concat = backbone.conv_l2_for_concat 74 | self.maxpool2_for_concat = 
backbone.maxpool2_for_concat 75 | 76 | self.mlp_z = backbone.mlp_z 77 | self.mlp_y = backbone.mlp_y 78 | 79 | def forward(self, batch, device): 80 | graph, features, edge_types = batch.get_network_inputs(cuda=True, device=device) 81 | # print("batch contain:",batch.num_of_subgraphs) 82 | graph = graph.to(device) 83 | features = features.to(device) 84 | edge_types = edge_types.to(device) 85 | outputs = self.ggnn(graph, features, edge_types) 86 | # print("features shape",features.shape) 87 | # print("outputs shape",outputs.shape) 88 | x_i, _ = batch.de_batchify_graphs(features) 89 | # print("x_i shape",x_i.shape) 90 | h_i, _ = batch.de_batchify_graphs(outputs) 91 | # print("h_i shape",h_i.shape) 92 | c_i = torch.cat((h_i, x_i), dim=-1) 93 | # print("c_i shape",c_i.shape) 94 | # print(h_i.transpose(1, 2).shape) 95 | batch_size, num_node, _ = c_i.size() 96 | Y_1 = self.maxpool1( 97 | f.relu( 98 | self.conv_l1(h_i.transpose(1, 2)) 99 | ) 100 | ) 101 | Y_2 = self.maxpool2( 102 | f.relu( 103 | self.conv_l2(Y_1) 104 | ) 105 | ).transpose(1, 2) 106 | Z_1 = self.maxpool1_for_concat( 107 | f.relu( 108 | self.conv_l1_for_concat(c_i.transpose(1, 2)) 109 | ) 110 | ) 111 | Z_2 = self.maxpool2_for_concat( 112 | f.relu( 113 | self.conv_l2_for_concat(Z_1) 114 | ) 115 | ).transpose(1, 2) 116 | # print("Y_2 shape",Y_2.shape) 117 | # print("Z_2 shape",Z_2.shape) 118 | Z_3 = self.mlp_z(Z_2) 119 | Y_3 = self.mlp_y(Y_2) 120 | ap_3 = torch.cat((Y_2, Z_2), 2) 121 | # print("ap_3 shape",ap_3.shape) 122 | # print("Y_3 shape",Y_3.shape) 123 | # print("Z_3 shape",Z_3.shape) 124 | # before_avg = torch.mul(Y_3, Z_3) 125 | # print("beforeavg shape",before_avg.shape) 126 | avg = ap_3.mean(dim=1) 127 | # print("avg shape",avg.shape) 128 | # result = self.sigmoid(avg).squeeze(dim=-1) 129 | return avg 130 | 131 | 132 | if __name__ == '__main__': 133 | torch.manual_seed(1000) 134 | np.random.seed(1000) 135 | parser = argparse.ArgumentParser() 136 | # parser.add_argument('--backbone_sampling_type', 
type=str, help='Type of the backbone sampling', 137 | # choices=['rus', 'ros','oss','smote'], default='smote') 138 | parser.add_argument('--model_type', type=str, help='Type of the model (devign/ggnn)', 139 | choices=['devign', 'ggnn'], default='devign') 140 | parser.add_argument('--model_state_dir', type=str, required=True, help='Dir of the model bin file') 141 | parser.add_argument('--dataset', type=str, required=True, help='Name of the dataset for experiment.') 142 | parser.add_argument('--input_dir', type=str, required=True, help='Input Directory of the parser') 143 | parser.add_argument('--node_tag', type=str, help='Name of the node feature.', default='node_features') 144 | parser.add_argument('--graph_tag', type=str, help='Name of the graph feature.', default='graph') 145 | parser.add_argument('--label_tag', type=str, help='Name of the label feature.', default='target') 146 | parser.add_argument('--data_split', type=str, default='1') 147 | parser.add_argument('--feature_size', type=int, help='Size of feature vector for each node', default=100) 148 | parser.add_argument('--graph_embed_size', type=int, help='Size of the Graph Embedding', default=200) 149 | parser.add_argument('--num_steps', type=int, help='Number of steps in GGNN', default=6) 150 | parser.add_argument('--batch_size', type=int, help='Batch Size for training', default=1) 151 | parser.add_argument('--device', type=str, default='cuda:0') 152 | args = parser.parse_args() 153 | 154 | if torch.cuda.device_count() > 1: 155 | if int(args.data_split) % 2 == 0: 156 | args.device = 'cuda:0' 157 | else: 158 | args.device = 'cuda:1' 159 | print(f'running split {args.data_split} on gpu {args.device}\n') 160 | LOG_FORMAT = "%(asctime)s - %(message)s" 161 | logging.basicConfig(filename=f'{args.dataset}_result/backbone_{args.model_type}_{args.data_split}.log', 162 | level=logging.INFO, format=LOG_FORMAT) 163 | if args.feature_size > args.graph_embed_size: 164 | print('Warning!!! 
Graph Embed dimension should be at least equal to the feature dimension.\n' 165 | 'Setting graph embedding size to feature size', file=sys.stderr) 166 | args.graph_embed_size = args.feature_size 167 | 168 | # model_dir = os.path.join('models', f'{args.model_type}_model', args.dataset, args.data_split) 169 | # if not os.path.exists(model_dir): 170 | # os.makedirs(model_dir) 171 | input_dir = args.input_dir 172 | processed_data_path = os.path.join(input_dir, 'processed.bin') 173 | print(processed_data_path) 174 | if os.path.exists(processed_data_path): 175 | debug('Reading already processed data from %s!' % processed_data_path) 176 | # logging.info('Reading already processed data from %s!' % processed_data_path) 177 | dataset = pickle.load(open(processed_data_path, 'rb')) 178 | debug(len(dataset.train_examples), len(dataset.valid_examples), len(dataset.test_examples)) 179 | # logging.info(f'{len(dataset.train_examples)}, {len(dataset.valid_examples)}, {len(dataset.test_examples)}') 180 | else: 181 | debug('ERROR require processed bin file!') 182 | exit() 183 | # dataset.batch_size = args.batch_size 184 | print('dataset batch size:', dataset.batch_size) 185 | # create model instance 186 | if args.model_type == 'ggnn': 187 | debug('model: GGNN') 188 | # logging.info('model: GGNN') 189 | model = GGNNSum(input_dim=dataset.feature_size, output_dim=args.graph_embed_size, 190 | num_steps=args.num_steps, max_edge_types=dataset.max_edge_type) 191 | model.load_state_dict(torch.load(args.model_state_dir, map_location='cuda:0')) 192 | my_model = GGNNSum_MLP(model) 193 | else: 194 | debug('model: Devign') 195 | # logging.info('model: Devign') 196 | model = DevignModel(input_dim=dataset.feature_size, output_dim=args.graph_embed_size, 197 | num_steps=args.num_steps, max_edge_types=dataset.max_edge_type) 198 | model.load_state_dict(torch.load(args.model_state_dir, map_location='cuda:0')) 199 | my_model = DevignModel_MLP(model) 200 | 201 | debug('Total Parameters : %d' % 
tally_param(my_model)) 202 | # logging.info('Total Parameters : %d' % tally_param(model)) 203 | # model.cuda() 204 | my_model.to(args.device) 205 | device = args.device 206 | 207 | # if args.model_type == 'devign': 208 | if True: 209 | with torch.no_grad(): 210 | my_model.eval() 211 | trainX_np_array = list() 212 | trainy_list = [] 213 | train_batch_len = dataset.initialize_train_batch() 214 | # train 215 | for i in tqdm(range(train_batch_len), desc=f'trainset'): 216 | graph, targets = dataset.get_next_train_batch() 217 | # print(targets) 218 | predictions = my_model(graph, device=device) 219 | trainX_np_array.append(predictions.cpu().detach().numpy()) 220 | # print(len(trainX_np_array)) 221 | trainy_list.extend(targets.cpu().detach().tolist()) 222 | trainy_np_list = np.array(trainy_list) 223 | trainX_np_array = np.vstack(trainX_np_array) 224 | print(trainX_np_array.shape) 225 | print(trainy_np_list.shape) 226 | train_X_dump_dir = f'{args.dataset}_result/backbone_{args.model_type}' 227 | if not os.path.exists(train_X_dump_dir): 228 | os.makedirs(train_X_dump_dir) 229 | torch.save(trainX_np_array, f'{train_X_dump_dir}/ros_{args.data_split}_trainX.pt') 230 | torch.save(trainy_np_list, f'{train_X_dump_dir}/ros_{args.data_split}_trainy.pt') 231 | # test 232 | 233 | testX_np_array = [] 234 | testy_list = [] 235 | test_batch_len = dataset.initialize_test_batch() 236 | for i in tqdm(range(test_batch_len), desc=f'testset'): 237 | graph, targets = dataset.get_next_test_batch() 238 | predictions = my_model(graph, device=device) 239 | testX_np_array.append(predictions.cpu().detach().numpy()) 240 | testy_list.extend(targets.cpu().detach().tolist()) 241 | testy_np_list = np.array(testy_list) 242 | testX_np_array = np.vstack(testX_np_array) 243 | print(testX_np_array.shape) 244 | print(testy_np_list.shape) 245 | 246 | print('latent training and testing phase') 247 | for type in ['rus', 'ros', 'oss', 'smote']: 248 | if type == 'rus': 249 | sampler = RandomUnderSampler() 250 | 
elif type == 'ros': 251 | sampler = RandomOverSampler() 252 | elif type == 'oss': 253 | sampler = OneSidedSelection() 254 | else: 255 | sampler = SMOTE() 256 | 257 | X_res, y_res = sampler.fit_resample(trainX_np_array, trainy_np_list) 258 | clf = MLPClassifier(max_iter=1000).fit(X_res, y_res) 259 | # print(clf.score(testX_np_array, testy_np_list)) 260 | all_predictions = clf.predict(testX_np_array) 261 | all_probabilities = clf.predict_proba(testX_np_array)[:, 1] 262 | fpr, tpr, _ = roc_curve(testy_np_list, all_probabilities) 263 | logging.info(f'{type}: acc: {accuracy_score(testy_np_list, all_predictions)} \ 264 | precision: {precision_score(testy_np_list, all_predictions)} \ 265 | recall: {recall_score(testy_np_list, all_predictions)}\ 266 | f1: {f1_score(testy_np_list, all_predictions)} \ 267 | auc: {auc(fpr, tpr)}') 268 | print(f'{type}: acc: {accuracy_score(testy_np_list, all_predictions)} \ 269 | precision: {precision_score(testy_np_list, all_predictions)} \ 270 | recall: {recall_score(testy_np_list, all_predictions)}\ 271 | f1: {f1_score(testy_np_list, all_predictions)} \ 272 | auc: {auc(fpr, tpr)}') 273 | zipped_result = zip(testy_np_list.tolist(), all_predictions.tolist(), all_probabilities.tolist()) 274 | sorted_zip = sorted(zipped_result, key=lambda x: x[2], reverse=True) 275 | dump_dir = f'{args.dataset}_result/backbone_{args.model_type}/{type}' 276 | if not os.path.exists(dump_dir): 277 | os.makedirs(dump_dir) 278 | pickle.dump(sorted_zip, open(f'{dump_dir}/zip_ans_pred_prob_{args.data_split}.pkl', "wb")) 279 | pickle.dump(clf, open(f'{dump_dir}/sk_model.pkl', 'wb')) 280 | 281 | 282 | 283 | 284 | 285 | 286 | -------------------------------------------------------------------------------- /Devign_Reveal_model/data_loader/__init__.py: -------------------------------------------------------------------------------- 1 | n_identifier = 'features' 2 | g_identifier = 'structure' 3 | l_identifier = 'label' 
-------------------------------------------------------------------------------- /Devign_Reveal_model/data_loader/batch_graph.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dgl import DGLGraph 3 | 4 | 5 | class BatchGraph: 6 | def __init__(self): 7 | self.graph = DGLGraph() 8 | self.number_of_nodes = 0 9 | self.graphid_to_nodeids = {} 10 | self.num_of_subgraphs = 0 11 | 12 | def add_subgraph(self, _g): 13 | assert isinstance(_g, DGLGraph) 14 | num_new_nodes = _g.number_of_nodes() 15 | self.graphid_to_nodeids[self.num_of_subgraphs] = torch.LongTensor( 16 | list(range(self.number_of_nodes, self.number_of_nodes + num_new_nodes))) 17 | self.graph.add_nodes(num_new_nodes, data=_g.ndata) 18 | sources, dests = _g.all_edges() 19 | sources += self.number_of_nodes 20 | dests += self.number_of_nodes 21 | self.graph.add_edges(sources, dests, data=_g.edata) 22 | self.number_of_nodes += num_new_nodes 23 | self.num_of_subgraphs += 1 24 | 25 | def cuda(self, device=None): 26 | for k in self.graphid_to_nodeids.keys(): 27 | self.graphid_to_nodeids[k] = self.graphid_to_nodeids[k].cuda(device=device) 28 | 29 | def de_batchify_graphs(self, features=None): 30 | if features is None: 31 | features = self.graph.ndata['features'] 32 | assert isinstance(features, torch.Tensor) 33 | vectors = [features.index_select(dim=0, index=self.graphid_to_nodeids[gid]) for gid in 34 | self.graphid_to_nodeids.keys()] 35 | lengths = [f.size(0) for f in vectors] 36 | max_len = max(lengths) 37 | for i, v in enumerate(vectors): 38 | vectors[i] = torch.cat( 39 | (v, torch.zeros(size=(max_len - v.size(0), *(v.shape[1:])), requires_grad=v.requires_grad, 40 | device=v.device)), dim=0) 41 | output_vectors = torch.stack(vectors) 42 | lengths = torch.LongTensor(lengths).to(device=output_vectors.device) 43 | return output_vectors, lengths 44 | 45 | def get_network_inputs(self, cuda=False): 46 | raise NotImplementedError('Must be implemented by 
subclasses.') 47 | 48 | 49 | class GGNNBatchGraph(BatchGraph): 50 | def get_network_inputs(self, cuda=True, device=None): 51 | features = self.graph.ndata['features'] 52 | edge_types = self.graph.edata['etype'] 53 | if cuda: 54 | self.cuda(device=device) 55 | return self.graph, features.cuda(device=device), edge_types.cuda(device=device) 56 | else: 57 | return self.graph, features, edge_types 58 | pass 59 | -------------------------------------------------------------------------------- /Devign_Reveal_model/data_loader/dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import sys 4 | 5 | import torch 6 | from dgl import DGLGraph 7 | from tqdm import tqdm 8 | 9 | from data_loader.batch_graph import GGNNBatchGraph 10 | from utils import load_default_identifiers, initialize_batch, debug 11 | 12 | 13 | class DataEntry: 14 | def __init__(self, datset, num_nodes, features, edges, target): 15 | self.dataset = datset 16 | self.num_nodes = num_nodes 17 | self.target = target 18 | self.graph = DGLGraph() 19 | self.features = torch.FloatTensor(features) 20 | self.graph.add_nodes(self.num_nodes, data={'features': self.features}) 21 | for s, _type, t in edges: 22 | etype_number = self.dataset.get_edge_type_number(_type) 23 | self.graph.add_edge(s, t, data={'etype': torch.LongTensor([etype_number])}) 24 | 25 | 26 | class DataSet: 27 | def __init__(self, train_src, valid_src=None, test_src=None, batch_size=32, n_ident=None, g_ident=None, l_ident=None): 28 | self.train_examples = [] 29 | self.valid_examples = [] 30 | self.test_examples = [] 31 | self.train_batches = [] 32 | self.valid_batches = [] 33 | self.test_batches = [] 34 | self.batch_size = batch_size 35 | self.edge_types = {} 36 | self.max_etype = 0 37 | self.feature_size = 0 38 | self.n_ident, self.g_ident, self.l_ident = load_default_identifiers(n_ident, g_ident, l_ident) 39 | self.read_dataset(test_src, train_src, valid_src) 40 | 
self.initialize_dataset() 41 | 42 | def initialize_dataset(self): 43 | self.initialize_train_batch() 44 | self.initialize_valid_batch() 45 | self.initialize_test_batch() 46 | 47 | def read_dataset(self, test_src, train_src, valid_src): 48 | debug('Reading Train File!') 49 | with open(train_src) as fp: 50 | train_data = json.load(fp) 51 | for entry in tqdm(train_data): 52 | # print("self.n_indent",self.n_ident) 53 | example = DataEntry(datset=self, num_nodes=len(entry[self.n_ident]), features=entry[self.n_ident], 54 | edges=entry[self.g_ident], target=entry[self.l_ident][0][0]) 55 | if self.feature_size == 0: 56 | self.feature_size = example.features.size(1) 57 | debug('Feature Size %d' % self.feature_size) 58 | self.train_examples.append(example) 59 | if valid_src is not None: 60 | debug('Reading Validation File!') 61 | with open(valid_src) as fp: 62 | valid_data = json.load(fp) 63 | for entry in tqdm(valid_data): 64 | example = DataEntry(datset=self, num_nodes=len(entry[self.n_ident]), 65 | features=entry[self.n_ident], 66 | edges=entry[self.g_ident], target=entry[self.l_ident][0][0]) 67 | self.valid_examples.append(example) 68 | if test_src is not None: 69 | debug('Reading Test File!') 70 | with open(test_src) as fp: 71 | test_data = json.load(fp) 72 | for entry in tqdm(test_data): 73 | example = DataEntry(datset=self, num_nodes=len(entry[self.n_ident]), 74 | features=entry[self.n_ident], 75 | edges=entry[self.g_ident], target=entry[self.l_ident][0][0]) 76 | self.test_examples.append(example) 77 | 78 | def get_edge_type_number(self, _type): 79 | if _type not in self.edge_types: 80 | self.edge_types[_type] = self.max_etype 81 | self.max_etype += 1 82 | return self.edge_types[_type] 83 | 84 | @property 85 | def max_edge_type(self): 86 | return self.max_etype 87 | 88 | def initialize_train_batch(self, batch_size=-1): 89 | if batch_size == -1: 90 | batch_size = self.batch_size 91 | self.train_batches = initialize_batch(self.train_examples, batch_size, shuffle=True) 
92 | return len(self.train_batches) 93 | pass 94 | 95 | def initialize_valid_batch(self, batch_size=-1): 96 | if batch_size == -1: 97 | batch_size = self.batch_size 98 | self.valid_batches = initialize_batch(self.valid_examples, batch_size) 99 | return len(self.valid_batches) 100 | pass 101 | 102 | def initialize_test_batch(self, batch_size=-1): 103 | if batch_size == -1: 104 | batch_size = self.batch_size 105 | self.test_batches = initialize_batch(self.test_examples, batch_size) 106 | return len(self.test_batches) 107 | pass 108 | 109 | def get_dataset_by_ids_for_GGNN(self, entries, ids): 110 | taken_entries = [entries[i] for i in ids] 111 | labels = [e.target for e in taken_entries] 112 | batch_graph = GGNNBatchGraph() 113 | for entry in taken_entries: 114 | batch_graph.add_subgraph(copy.deepcopy(entry.graph)) 115 | return batch_graph, torch.FloatTensor(labels) 116 | 117 | def get_next_train_batch(self): 118 | if len(self.train_batches) == 0: 119 | self.initialize_train_batch() 120 | ids = self.train_batches.pop() 121 | return self.get_dataset_by_ids_for_GGNN(self.train_examples, ids) 122 | 123 | def get_next_valid_batch(self): 124 | if len(self.valid_batches) == 0: 125 | self.initialize_valid_batch() 126 | ids = self.valid_batches.pop() 127 | return self.get_dataset_by_ids_for_GGNN(self.valid_examples, ids) 128 | 129 | def get_next_test_batch(self): 130 | if len(self.test_batches) == 0: 131 | self.initialize_test_batch() 132 | ids = self.test_batches.pop() 133 | return self.get_dataset_by_ids_for_GGNN(self.test_examples, ids) 134 | -------------------------------------------------------------------------------- /Devign_Reveal_model/data_sampler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from sklearn.model_selection import train_test_split 4 | import numpy as np 5 | from imblearn.over_sampling import RandomOverSampler 6 | from imblearn.under_sampling import RandomUnderSampler 7 | import 
argparse 8 | import pandas as pd 9 | import itertools 10 | 11 | if __name__ == "__main__": 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--sampling_type', type=str, help='Type of the sampling', 15 | choices=['ros', 'oss', 'rus'], default='rus') 16 | parser.add_argument('--json_dir', type=str, required=True) 17 | parser.add_argument('--out_dir', type=str, required=True) 18 | parser.add_argument('--data_split_number', type=int, default=1) 19 | args = parser.parse_args() 20 | 21 | df = pd.read_json(args.json_dir) 22 | print(df.info()) 23 | # merged = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(df['targets'].tolist())))) 24 | # print(merged) 25 | if args.sampling_type == 'ros': 26 | for i in range(args.data_split_number): 27 | train, test = train_test_split(df, random_state=i, test_size=0.2) 28 | train_targets = list( 29 | itertools.chain.from_iterable(list(itertools.chain.from_iterable(train['targets'].tolist())))) 30 | ros = RandomOverSampler() 31 | train_resampled, _ = ros.fit_resample(train, train_targets) 32 | print(train_resampled.info()) 33 | print(test.info()) 34 | if not os.path.exists(f'{args.out_dir}data_split_{i}'): 35 | os.makedirs(f'{args.out_dir}data_split_{i}') 36 | train_resampled.to_json(f'{args.out_dir}data_split_{i}/train_GGNNinput.json', orient='records') 37 | test.to_json(f'{args.out_dir}data_split_{i}/test_GGNNinput.json', orient='records') 38 | 39 | if args.sampling_type == 'rus': 40 | for i in range(args.data_split_number): 41 | train, test = train_test_split(df, random_state=i, test_size=0.2) 42 | train_targets = list( 43 | itertools.chain.from_iterable(list(itertools.chain.from_iterable(train['targets'].tolist())))) 44 | rus = RandomUnderSampler() 45 | train_resampled, _ = rus.fit_resample(train, train_targets) 46 | print(train_resampled.info()) 47 | print(test.info()) 48 | if not os.path.exists(f'{args.out_dir}data_split_{i}'): 49 | os.makedirs(f'{args.out_dir}data_split_{i}') 50 | 
train_resampled.to_json(f'{args.out_dir}data_split_{i}/train_GGNNinput.json', orient='records') 51 | test.to_json(f'{args.out_dir}data_split_{i}/test_GGNNinput.json', orient='records') -------------------------------------------------------------------------------- /Devign_Reveal_model/exp.bash: -------------------------------------------------------------------------------- 1 | # N=5 2 | # ( 3 | # for thing in {10..19}; do 4 | # ((i=i%N)); ((i++==0)) && wait 5 | # python main.py --dataset msr --sampling origin --input_dir reveal_model_data/msr_data/origin/data_split_"$thing" --node_tag node_features --graph_tag graph --label_tag targets --feature_size 100 --data_split "$thing" --model_type ggnn --batch_size 256 & 6 | # done 7 | # ) 8 | 9 | # N=5 10 | # ( 11 | # for thing in {0..19}; do 12 | # ((i=i%N)); ((i++==0)) && wait 13 | # python main.py --dataset reveal --sampling oss --input_dir reveal_model_data/reveal_data/actual_ros/data_split_"$thing" --node_tag node_features --graph_tag graph --label_tag targets --feature_size 100 --data_split "$thing" --model_type ggnn & 14 | # done 15 | # ) 16 | python main.py --dataset msr --sampling msr --input_dir reveal_model_data/msr_data/origin/data_split_0 --node_tag node_features --graph_tag graph --label_tag targets --feature_size 100 --data_split 0 --model_type ggnn --batch_size 256 -------------------------------------------------------------------------------- /Devign_Reveal_model/exp_latent.sh: -------------------------------------------------------------------------------- 1 | 2 | N=5 3 | ( 4 | for thing in {10..19}; do 5 | ((i=i%N)); ((i++==0)) && wait 6 | python backbone.py --model_state_dir msr_result/ggnn_model/origin/"$thing"/Model_ep_49.bin --dataset msr --input_dir reveal_model_data/msr_data/origin/data_split_"$thing" --node_tag node_features --graph_tag graph --label_tag targets --feature_size 100 --data_split "$thing" --model_type ggnn & 7 | done 8 | ) 9 | 
-------------------------------------------------------------------------------- /Devign_Reveal_model/gnn_explainer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import argparse\n", 10 | "from cmath import log\n", 11 | "import os\n", 12 | "import pickle\n", 13 | "import sys\n", 14 | "import numpy as np\n", 15 | "import torch\n", 16 | "from torch.nn import BCELoss\n", 17 | "from torch.optim import Adam\n", 18 | "from tqdm.notebook import tqdm\n", 19 | "from data_loader.dataset import DataSet\n", 20 | "from modules.model import DevignModel, GGNNSum" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import warnings\n", 30 | "warnings.filterwarnings(\"ignore\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# change to proceed dataset dir as the input dataset for interpretation\n", 40 | "input_dir = 'reveal_model_data/msr_data/ros_4x/'\n", 41 | "processed_data_path = os.path.join(input_dir, 'processed.bin')\n", 42 | "dataset = pickle.load(open(processed_data_path, 'rb'))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "dataset.batch_size = 1" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "27726" 63 | ] 64 | }, 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "dataset.initialize_test_batch()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "model = 
GGNNSum(input_dim=dataset.feature_size, output_dim=200,num_steps=6, max_edge_types=dataset.max_edge_type)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from torch import nn\n", 90 | "from data_loader.batch_graph import GGNNBatchGraph\n", 91 | "import copy" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "" 103 | ] 104 | }, 105 | "execution_count": 8, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "# load the trained-model to interpretation\n", 112 | "model.load_state_dict(torch.load('msr_result/ggnn_model/msr_4x/0/Model_ep_49.bin'))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 9, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "GGNNSum(\n", 124 | " (ggnn): GatedGraphConv(\n", 125 | " (linears): ModuleList(\n", 126 | " (0): Linear(in_features=200, out_features=200, bias=True)\n", 127 | " (1): Linear(in_features=200, out_features=200, bias=True)\n", 128 | " (2): Linear(in_features=200, out_features=200, bias=True)\n", 129 | " (3): Linear(in_features=200, out_features=200, bias=True)\n", 130 | " )\n", 131 | " (gru): GRUCell(200, 200)\n", 132 | " )\n", 133 | " (classifier): Linear(in_features=200, out_features=1, bias=True)\n", 134 | " (sigmoid): Sigmoid()\n", 135 | ")" 136 | ] 137 | }, 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "model.to('cuda:0')" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 10, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import numpy" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 
162 | "# use when explain model with Sampling_R\n", 163 | "class GGNNSum_single(nn.Module):\n", 164 | " def __init__(self, GGNNSum):\n", 165 | " super(GGNNSum_single, self).__init__()\n", 166 | " self.net = GGNNSum\n", 167 | "\n", 168 | " def forward(self, graph, feat, eweight=None):\n", 169 | " batch_graph = GGNNBatchGraph()\n", 170 | " batch_graph.add_subgraph(copy.deepcopy(graph))\n", 171 | " outputs = self.net(batch_graph,device='cuda:0')\n", 172 | " return torch.tensor([[1-outputs, outputs]])\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# use when explain model with Sampling_L\n", 182 | "class GGNNSum_latent(nn.Module):\n", 183 | " def __init__(self, GGNNSum,skMLP):\n", 184 | " super(GGNNSum_latent, self).__init__()\n", 185 | " self.net = GGNNSum\n", 186 | " self.clf = skMLP\n", 187 | " \n", 188 | " def forward(self,graph,feat,eweight=None):\n", 189 | " device = 'cuda:0'\n", 190 | " batch_graph = GGNNBatchGraph()\n", 191 | " batch_graph.add_subgraph(copy.deepcopy(graph))\n", 192 | " graph, features, edge_types = batch_graph.get_network_inputs(cuda=True,device=device)\n", 193 | " graph = graph.to(device)\n", 194 | " features = features.to(device)\n", 195 | " edge_types = edge_types.to(device)\n", 196 | " outputs = self.net.ggnn(graph, features, edge_types)\n", 197 | " h_i, _ = batch_graph.de_batchify_graphs(outputs)\n", 198 | " digit = h_i.sum(dim=1).cpu().detach().numpy()\n", 199 | " clf_output = self.clf.predict_proba(digit)\n", 200 | " del graph,edge_types,features\n", 201 | " return torch.tensor(clf_output)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 13, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# use when explain model with Sampling_L, load in the classifier you trained with sampling_L\n", 211 | "# clf = pickle.load(open('msr_result/backbone_ggnn/smote/sk_model.pkl', 'rb'))" 212 | ] 213 | }, 
214 | { 215 | "cell_type": "code", 216 | "execution_count": 14, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# switch between sampling_L and R\n", 221 | "exp_model = GGNNSum_single(model)\n", 222 | "# exp_model = GGNNSum_latent(model,clf)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 15, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "from dgl.nn.pytorch.explain import GNNExplainer" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 16, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "gnnexplainer = GNNExplainer(exp_model,num_hops=1,log =False)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 17, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "application/vnd.jupyter.widget-view+json": { 251 | "model_id": "36988f4744da4b50973a708ccb11d7c8", 252 | "version_major": 2, 253 | "version_minor": 0 254 | }, 255 | "text/plain": [ 256 | " 0%| | 0/27726 [00:00 10 and graph.num_nodes() > 10:\n", 271 | " features = graph.ndata['features']\n", 272 | " pred = exp_model(graph,features)\n", 273 | "# print(pred)\n", 274 | "# break\n", 275 | " if pred[0][1] > 0.5:\n", 276 | "# print(index,'tp')\n", 277 | " _ ,edge_mask = gnnexplainer.explain_graph(graph=graph,feat=features)\n", 278 | " top_10 = np.argpartition(edge_mask.numpy(), -10)[-10:]\n", 279 | " node_list = []\n", 280 | " for x in top_10:\n", 281 | " node_1,node_2 = graph.find_edges(x)\n", 282 | " node_list.append(node_1.numpy()[0])\n", 283 | " node_list.append(node_2.numpy()[0])\n", 284 | " TP_explaination_dict[index] = node_list" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 18, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "411" 296 | ] 297 | }, 298 | "execution_count": 18, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | 
"len(TP_explaination_dict)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 19, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "27726" 316 | ] 317 | }, 318 | "execution_count": 19, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "total_test_item" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 20, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "411" 336 | ] 337 | }, 338 | "execution_count": 20, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "len(TP_explaination_dict)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 21, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# save the explaination results for further analysis\n", 354 | "import pickle\n", 355 | "with open('gnnexplainer_result/msr_4x_split_0_hop_1.pkl', 'wb') as fp:\n", 356 | " pickle.dump(TP_explaination_dict, fp)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [] 365 | } 366 | ], 367 | "metadata": { 368 | "kernelspec": { 369 | "display_name": "transformer_env", 370 | "language": "python", 371 | "name": "transformer_env" 372 | }, 373 | "language_info": { 374 | "codemirror_mode": { 375 | "name": "ipython", 376 | "version": 3 377 | }, 378 | "file_extension": ".py", 379 | "mimetype": "text/x-python", 380 | "name": "python", 381 | "nbconvert_exporter": "python", 382 | "pygments_lexer": "ipython3", 383 | "version": "3.8.8" 384 | } 385 | }, 386 | "nbformat": 4, 387 | "nbformat_minor": 4 388 | } -------------------------------------------------------------------------------- /Devign_Reveal_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from cmath 
import log 3 | import os 4 | import pickle 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import BCELoss 10 | from torch.optim import Adam 11 | 12 | from data_loader.dataset import DataSet 13 | from modules.model import DevignModel, GGNNSum 14 | # from trainer import train 15 | from my_trainer import my_train 16 | from utils import tally_param, debug 17 | import logging 18 | 19 | if __name__ == '__main__': 20 | torch.manual_seed(1000) 21 | np.random.seed(1000) 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--model_type', type=str, help='Type of the model (devign/ggnn)', 24 | choices=['devign', 'ggnn'], default='devign') 25 | parser.add_argument('--dataset', type=str, required=True, help='Name of the dataset for experiment.') 26 | parser.add_argument('--sampling', type=str, required=True, help='sampling method') 27 | parser.add_argument('--input_dir', type=str, required=True, help='Input Directory of the parser') 28 | parser.add_argument('--node_tag', type=str, help='Name of the node feature.', default='node_features') 29 | parser.add_argument('--graph_tag', type=str, help='Name of the graph feature.', default='graph') 30 | parser.add_argument('--label_tag', type=str, help='Name of the label feature.', default='target') 31 | parser.add_argument('--data_split', type=str, default='1') 32 | parser.add_argument('--feature_size', type=int, help='Size of feature vector for each node', default=100) 33 | parser.add_argument('--graph_embed_size', type=int, help='Size of the Graph Embedding', default=200) 34 | parser.add_argument('--num_steps', type=int, help='Number of steps in GGNN', default=6) 35 | parser.add_argument('--batch_size', type=int, help='Batch Size for training', default=128) 36 | 37 | parser.add_argument('--device', type=str, default='cuda:0') 38 | args = parser.parse_args() 39 | 40 | model_dir = os.path.join(f'{args.dataset}_result', f'{args.model_type}_model', args.sampling, args.data_split) 41 | print('out 
dir ', model_dir) 42 | if not os.path.exists(model_dir): 43 | os.makedirs(model_dir) 44 | 45 | LOG_FORMAT = "%(asctime)s - %(message)s" 46 | logging.basicConfig(filename=f'{model_dir}.log', level=logging.INFO, format=LOG_FORMAT) 47 | if args.feature_size > args.graph_embed_size: 48 | print('Warning!!! Graph Embed dimension should be at least equal to the feature dimension.\n' 49 | 'Setting graph embedding size to feature size', file=sys.stderr) 50 | args.graph_embed_size = args.feature_size 51 | 52 | input_dir = args.input_dir 53 | processed_data_path = os.path.join(input_dir, 'processed.bin') 54 | if os.path.exists(processed_data_path): 55 | debug('Reading already processed data from %s!' % processed_data_path) 56 | logging.info('Reading already processed data from %s!' % processed_data_path) 57 | dataset = pickle.load(open(processed_data_path, 'rb')) 58 | debug(len(dataset.train_examples), len(dataset.valid_examples), len(dataset.test_examples)) 59 | logging.info(f'{len(dataset.train_examples)}, {len(dataset.valid_examples)}, {len(dataset.test_examples)}') 60 | else: 61 | debug('generate new dataset') 62 | logging.info('generate new dataset') 63 | dataset = DataSet(train_src=os.path.join(input_dir, 'train_GGNNinput.json'), 64 | valid_src=os.path.join(input_dir, 'test_GGNNinput.json'), 65 | test_src=os.path.join(input_dir, 'test_GGNNinput.json'), 66 | batch_size=args.batch_size, n_ident=args.node_tag, g_ident=args.graph_tag, 67 | l_ident=args.label_tag) 68 | file = open(processed_data_path, 'wb') 69 | pickle.dump(dataset, file) 70 | file.close() 71 | debug(f'processed file dump to {processed_data_path}') 72 | logging.info(f'processed file dump to {processed_data_path}') 73 | assert args.feature_size == dataset.feature_size, \ 74 | 'Dataset contains different feature vector than argument feature size. ' \ 75 | 'Either change the feature vector size in argument, or provide different dataset.' 
76 | if args.model_type == 'ggnn': 77 | debug('model: GGNN') 78 | logging.info('model: GGNN') 79 | model = GGNNSum(input_dim=dataset.feature_size, output_dim=args.graph_embed_size, 80 | num_steps=args.num_steps, max_edge_types=dataset.max_edge_type) 81 | else: 82 | debug('model: Devign') 83 | logging.info('model: Devign') 84 | model = DevignModel(input_dim=dataset.feature_size, output_dim=args.graph_embed_size, 85 | num_steps=args.num_steps, max_edge_types=dataset.max_edge_type) 86 | 87 | debug('Total Parameters : %d' % tally_param(model)) 88 | logging.info('Total Parameters : %d' % tally_param(model)) 89 | # model.cuda() 90 | model.to(args.device) 91 | # loss_function = BCELoss(reduction='sum') 92 | loss_function = BCELoss(reduction='mean') 93 | optim = Adam(model.parameters(), lr=0.0001, weight_decay=0.001) 94 | # train(model=model, dataset=dataset, max_steps=1000000, dev_every=128, 95 | # loss_function=loss_function, optimizer=optim, 96 | # save_path=model_dir + '/GGNNSumModel', max_patience=100, log_every=None, device=args.device) 97 | my_train(model=model, epochs=50, dataset=dataset, loss_function=loss_function, optimizer=optim, 98 | save_path=model_dir + '/Model', device=args.device) 99 | -------------------------------------------------------------------------------- /Devign_Reveal_model/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WIP2022/DataSampling4DLVD/a01d6cb246bfd4d2fd46448821bf457e9513b82a/Devign_Reveal_model/modules/__init__.py -------------------------------------------------------------------------------- /Devign_Reveal_model/modules/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dgl.nn import GatedGraphConv 3 | from torch import nn 4 | import torch.nn.functional as f 5 | 6 | 7 | class DevignModel(nn.Module): 8 | def __init__(self, input_dim, output_dim, max_edge_types, num_steps=8): 9 | 
super(DevignModel, self).__init__() 10 | self.inp_dim = input_dim 11 | self.out_dim = output_dim 12 | self.max_edge_types = max_edge_types 13 | self.num_timesteps = num_steps 14 | self.ggnn = GatedGraphConv(in_feats=input_dim, out_feats=output_dim, 15 | n_steps=num_steps, n_etypes=max_edge_types) 16 | self.conv_l1 = torch.nn.Conv1d(output_dim, output_dim, 3) 17 | self.maxpool1 = torch.nn.MaxPool1d(3, stride=2) 18 | self.conv_l2 = torch.nn.Conv1d(output_dim, output_dim, 1) 19 | self.maxpool2 = torch.nn.MaxPool1d(2, stride=2) 20 | 21 | self.concat_dim = input_dim + output_dim 22 | self.conv_l1_for_concat = torch.nn.Conv1d(self.concat_dim, self.concat_dim, 3) 23 | self.maxpool1_for_concat = torch.nn.MaxPool1d(3, stride=2) 24 | self.conv_l2_for_concat = torch.nn.Conv1d(self.concat_dim, self.concat_dim, 1) 25 | self.maxpool2_for_concat = torch.nn.MaxPool1d(2, stride=2) 26 | 27 | self.mlp_z = nn.Linear(in_features=self.concat_dim, out_features=1) 28 | self.mlp_y = nn.Linear(in_features=output_dim, out_features=1) 29 | self.sigmoid = nn.Sigmoid() 30 | 31 | def forward(self, batch, device): 32 | graph, features, edge_types = batch.get_network_inputs(cuda=True, device=device) 33 | graph = graph.to(device) 34 | features = features.to(device) 35 | edge_types = edge_types.to(device) 36 | outputs = self.ggnn(graph, features, edge_types) 37 | x_i, _ = batch.de_batchify_graphs(features) 38 | h_i, _ = batch.de_batchify_graphs(outputs) 39 | c_i = torch.cat((h_i, x_i), dim=-1) 40 | batch_size, num_node, _ = c_i.size() 41 | Y_1 = self.maxpool1( 42 | f.relu( 43 | self.conv_l1(h_i.transpose(1, 2)) 44 | ) 45 | ) 46 | Y_2 = self.maxpool2( 47 | f.relu( 48 | self.conv_l2(Y_1) 49 | ) 50 | ).transpose(1, 2) 51 | Z_1 = self.maxpool1_for_concat( 52 | f.relu( 53 | self.conv_l1_for_concat(c_i.transpose(1, 2)) 54 | ) 55 | ) 56 | Z_2 = self.maxpool2_for_concat( 57 | f.relu( 58 | self.conv_l2_for_concat(Z_1) 59 | ) 60 | ).transpose(1, 2) 61 | before_avg = torch.mul(self.mlp_y(Y_2), self.mlp_z(Z_2)) 
62 | avg = before_avg.mean(dim=1) 63 | result = self.sigmoid(avg).squeeze(dim=-1) 64 | return result 65 | 66 | 67 | class DevignModel_softmax(nn.Module): 68 | def __init__(self, input_dim, output_dim, max_edge_types, num_steps=8): 69 | super(DevignModel_softmax, self).__init__() 70 | self.inp_dim = input_dim 71 | self.out_dim = output_dim 72 | self.max_edge_types = max_edge_types 73 | self.num_timesteps = num_steps 74 | self.ggnn = GatedGraphConv(in_feats=input_dim, out_feats=output_dim, 75 | n_steps=num_steps, n_etypes=max_edge_types) 76 | self.conv_l1 = torch.nn.Conv1d(output_dim, output_dim, 3) 77 | self.maxpool1 = torch.nn.MaxPool1d(3, stride=2) 78 | self.conv_l2 = torch.nn.Conv1d(output_dim, output_dim, 1) 79 | self.maxpool2 = torch.nn.MaxPool1d(2, stride=2) 80 | 81 | self.concat_dim = input_dim + output_dim 82 | self.conv_l1_for_concat = torch.nn.Conv1d(self.concat_dim, self.concat_dim, 3) 83 | self.maxpool1_for_concat = torch.nn.MaxPool1d(3, stride=2) 84 | self.conv_l2_for_concat = torch.nn.Conv1d(self.concat_dim, self.concat_dim, 1) 85 | self.maxpool2_for_concat = torch.nn.MaxPool1d(2, stride=2) 86 | 87 | self.mlp_z = nn.Linear(in_features=self.concat_dim, out_features=2) 88 | self.mlp_y = nn.Linear(in_features=output_dim, out_features=2) 89 | # self.sigmoid = nn.Sigmoid() 90 | 91 | def forward(self, batch, device): 92 | graph, features, edge_types = batch.get_network_inputs(cuda=True, device=device) 93 | graph = graph.to(device) 94 | features = features.to(device) 95 | edge_types = edge_types.to(device) 96 | outputs = self.ggnn(graph, features, edge_types) 97 | x_i, _ = batch.de_batchify_graphs(features) 98 | h_i, _ = batch.de_batchify_graphs(outputs) 99 | c_i = torch.cat((h_i, x_i), dim=-1) 100 | batch_size, num_node, _ = c_i.size() 101 | 102 | Y_1 = self.maxpool1( 103 | f.relu( 104 | self.conv_l1(h_i.transpose(1, 2)) 105 | ) 106 | ) 107 | Y_2 = self.maxpool2( 108 | f.relu( 109 | self.conv_l2(Y_1) 110 | ) 111 | ).transpose(1, 2) 112 | # 
print('y2',Y_2.shape) 113 | Z_1 = self.maxpool1_for_concat( 114 | f.relu( 115 | self.conv_l1_for_concat(c_i.transpose(1, 2)) 116 | ) 117 | ) 118 | Z_2 = self.maxpool2_for_concat( 119 | f.relu( 120 | self.conv_l2_for_concat(Z_1) 121 | ) 122 | ).transpose(1, 2) 123 | # print('z2',Z_2.shape) 124 | before_avg = torch.mul(self.mlp_y(Y_2), self.mlp_z(Z_2)) 125 | # print('before ',before_avg.shape) 126 | avg = before_avg.mean(dim=1) 127 | # print('avg',avg.shape) 128 | result = nn.Softmax(dim=1)(avg) 129 | # print('result',result.shape) 130 | return result 131 | 132 | 133 | class GGNNSum(nn.Module): 134 | def __init__(self, input_dim, output_dim, max_edge_types, num_steps=8): 135 | super(GGNNSum, self).__init__() 136 | self.inp_dim = input_dim 137 | self.out_dim = output_dim 138 | self.max_edge_types = max_edge_types 139 | self.num_timesteps = num_steps 140 | self.ggnn = GatedGraphConv(in_feats=input_dim, out_feats=output_dim, n_steps=num_steps, 141 | n_etypes=max_edge_types) 142 | self.classifier = nn.Linear(in_features=output_dim, out_features=1) 143 | self.sigmoid = nn.Sigmoid() 144 | 145 | def forward(self, batch, device): 146 | graph, features, edge_types = batch.get_network_inputs(cuda=True, device=device) 147 | graph = graph.to(device) 148 | features = features.to(device) 149 | edge_types = edge_types.to(device) 150 | outputs = self.ggnn(graph, features, edge_types) 151 | h_i, _ = batch.de_batchify_graphs(outputs) 152 | ggnn_sum = self.classifier(h_i.sum(dim=1)) 153 | result = self.sigmoid(ggnn_sum).squeeze(dim=-1) 154 | return result 155 | 156 | 157 | class GGNNSum_softmax(nn.Module): 158 | def __init__(self, input_dim, output_dim, max_edge_types, num_steps=8): 159 | super(GGNNSum_softmax, self).__init__() 160 | self.inp_dim = input_dim 161 | self.out_dim = output_dim 162 | self.max_edge_types = max_edge_types 163 | self.num_timesteps = num_steps 164 | self.ggnn = GatedGraphConv(in_feats=input_dim, out_feats=output_dim, n_steps=num_steps, 165 | 
n_etypes=max_edge_types) 166 | self.classifier = nn.Linear(in_features=output_dim, out_features=2) 167 | 168 | def forward(self, batch, device): 169 | graph, features, edge_types = batch.get_network_inputs(cuda=True, device=device) 170 | graph = graph.to(device) 171 | features = features.to(device) 172 | edge_types = edge_types.to(device) 173 | outputs = self.ggnn(graph, features, edge_types) 174 | h_i, _ = batch.de_batchify_graphs(outputs) 175 | ggnn_sum = self.classifier(h_i.sum(dim=1)) 176 | result = nn.Softmax(dim=1)(ggnn_sum) 177 | # print(result.shape) 178 | return result -------------------------------------------------------------------------------- /Devign_Reveal_model/my_trainer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from sys import stderr 3 | 4 | import numpy as np 5 | import torch 6 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score,roc_curve, auc, confusion_matrix 7 | from tqdm import tqdm 8 | import logging 9 | from utils import debug 10 | from torch import nn 11 | from torch.utils.tensorboard import SummaryWriter 12 | def my_evaluate_metrics(model, loss_function, num_batches, dataset, device='cuda:0'): 13 | if type(loss_function) is nn.BCELoss: 14 | logging.info('sigmoid') 15 | model.eval() 16 | with torch.no_grad(): 17 | _loss = [] 18 | all_predictions, all_targets = [], [] 19 | all_probabilities = [] 20 | for _ in tqdm(range(num_batches),desc='valid >'): 21 | graph, targets = dataset.get_next_valid_batch() 22 | targets = targets.cuda() 23 | predictions = model(graph, device=device) 24 | all_probabilities.extend(predictions.detach().cpu().tolist()) 25 | batch_loss = loss_function(predictions, targets) 26 | _loss.append(batch_loss.detach().cpu().item()) 27 | predictions = predictions.detach().cpu() 28 | if predictions.ndim == 2: 29 | all_predictions.extend(np.argmax(predictions.numpy(), axis=-1).tolist()) 30 | else: 31 | all_predictions.extend( 32 | 
predictions.ge(torch.ones(size=predictions.size()).fill_(0.5)).to( 33 | dtype=torch.int32).numpy().tolist() 34 | ) 35 | all_targets.extend(targets.detach().cpu().numpy().tolist()) 36 | model.train() 37 | print(confusion_matrix(all_targets,all_predictions)) 38 | fpr, tpr, _ = roc_curve(all_targets, all_probabilities) 39 | return accuracy_score(all_targets, all_predictions) , \ 40 | precision_score(all_targets, all_predictions) , \ 41 | recall_score(all_targets, all_predictions) , \ 42 | f1_score(all_targets, all_predictions) , \ 43 | auc(fpr, tpr) 44 | else: 45 | print('softmax') 46 | with torch.no_grad(): 47 | all_predictions, all_targets = [], [] 48 | all_probabilities = [] 49 | for _ in tqdm(range(num_batches),desc='valid >'): 50 | graph, targets = dataset.get_next_valid_batch() 51 | # targets = targets.cuda() 52 | out = model(graph, device=device) 53 | predictions = out.argmax(dim=1).cpu().detach().numpy() 54 | all_predictions.extend(predictions) 55 | all_targets.extend(targets.numpy()) 56 | # print(out[0]) 57 | prob_1 = out.cpu().detach().numpy()[:,-1] 58 | all_probabilities.extend(prob_1) 59 | model.train() 60 | fpr, tpr, _ = roc_curve(all_targets, all_probabilities) 61 | return accuracy_score(all_targets, all_predictions) , \ 62 | precision_score(all_targets, all_predictions) , \ 63 | recall_score(all_targets, all_predictions) , \ 64 | f1_score(all_targets, all_predictions) , \ 65 | auc(fpr, tpr) 66 | 67 | 68 | 69 | 70 | def my_train(model,epochs, dataset, loss_function, optimizer, save_path,device='cuda:0'): 71 | # writer = SummaryWriter() 72 | # debug('Start Training') 73 | # logging.info('Start Training') 74 | best_model = None 75 | patience_counter = 0 76 | best_f1 = 0 77 | for e in range(epochs): 78 | train_losses = [] 79 | model.train() 80 | train_batch_len = dataset.initialize_train_batch() 81 | for i in tqdm(range(train_batch_len),desc=f'train {e}'): 82 | optimizer.zero_grad() 83 | graph, targets = dataset.get_next_train_batch() 84 | # if 
type(loss_function) is not nn.BCELoss: 85 | # targets = targets.type(torch.LongTensor) 86 | targets = targets.to(device) 87 | predictions = model(graph, device=device) 88 | batch_loss = loss_function(predictions, targets) 89 | train_losses.append(batch_loss.detach().cpu().item()) 90 | batch_loss.backward() 91 | optimizer.step() 92 | train_loss_avg = sum(train_losses) / len(train_losses) 93 | debug(f'training loss in epochs {e} -> {train_loss_avg}') 94 | # log every epoch and also save the model 95 | _save_file = open(save_path + f'_ep_{e}.bin', 'wb') 96 | torch.save(model.state_dict(), _save_file) 97 | # skip eval 98 | # number_valid_batch = dataset.initialize_valid_batch() 99 | # acc, pr, rc, f1, auc_score = my_evaluate_metrics(model, loss_function, number_valid_batch, dataset, device) 100 | # writer.add_scalar('Loss/train', train_loss_avg, e) 101 | # writer.add_scalar('auc/test', auc_score, e) 102 | # writer.add_scalar('f1/test', f1, e) 103 | # debug('%s\t Epochs %d\tTest Accuracy: %f\tPrecision: %f\tRecall: %f\tF1: %f\t AUC: %f' % (save_path, e, acc, pr, rc, f1,auc_score)) 104 | # logging.info('%s\t Epochs %d\tTest Accuracy: %f\tPrecision: %f\tRecall: %f\tF1: %f\t AUC: %f' % (save_path, e, acc, pr, rc, f1,auc_score)) 105 | # debug('=' * 100) 106 | -------------------------------------------------------------------------------- /Devign_Reveal_model/readme.md: -------------------------------------------------------------------------------- 1 | # Reveal model & Devign model readme 2 | 3 | This repo is developed based on the official GitHub repo of [Reveal](https://github.com/VulDetProject/ReVeal). 4 | 5 | The Reveal & Devign models share the same input/output format and shape; only the network architecture differs. 6 | 7 | ## 0 pre-process with Reveal repo (optional) 8 | For this step, please follow exactly what the Reveal repo does. We provide an example pipeline for processing raw code into processed data.
Please note that the embedding and graph extraction are also finished in this step. 9 | Please also note that for the BigVul (aka MSR) dataset, much of the code cannot be parsed by the Reveal tool, so the processed dataset will contain fewer data points than the raw dataset. 10 | 11 | **starting from here, you need the pre-processed dataset to continue.** 12 | 13 | ## 1 train-test split & sampling 14 | 1. use `data_sampler.py` to split the train-test set 15 | 1. change the parameter `--sampling_type` to do raw-code-level sampling **with ROS_R or RUS_R** 16 | 17 | ## 2 train model with **sampling_R** 18 | 1. see the bash file `exp.bash` for a quick-start example 19 | 2. see the `main.py` arguments for further modification of parameters 20 | 1. change the parameter `--model_type` to choose between **devign** and **reveal** 21 | ## 3 train model with **sampling_L** 22 | 1. This requires a model trained in the previous step with **noSampling**. 23 | 2. see `exp_latent.sh` for a quick start 24 | 3. see the `backbone.py` arguments for further modification of parameters 25 | 26 | ## 4 GNNExplainer 27 | 1. this step requires a model trained in the previous steps, either a Sampling_R or a Sampling_L model 28 | 2. this step requires your dataset to include the `vulnerable line` as extra ground-truth information, which is only provided in the BigVul (aka MSR) dataset; 29 | when processing the BigVul dataset in pre-processing step 0, please preserve this information, or cross-join the BigVul dataset (which carries the vulnerable lines) with the processed dataset by their tokens to add the lines to the processed dataset. 30 | 3. see & run `gnn_explainer.ipynb` to start the XAI analysis with GNNExplainer; change the model location and test data location as needed. 31 | 4. run `GNN_explainer_result_analysis.ipynb` to calculate the hit rate.
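The hit-rate calculation in step 4 can be sketched as follows. This is a minimal illustration, not the notebook's actual code: the `hit_rate` helper, its argument format, and the top-k rule are assumptions. We assume each explanation is a list of source-line numbers ranked by importance, the ground truth is the set of known vulnerable lines per function, and a sample counts as a hit if any of its top-k explained lines is truly vulnerable.

```python
def hit_rate(ranked_lines_per_sample, vul_lines_per_sample, k=10):
    """Fraction of samples whose top-k explained lines contain >= 1 vulnerable line."""
    hits = 0
    total = 0
    for ranked, vul in zip(ranked_lines_per_sample, vul_lines_per_sample):
        if not vul:  # skip samples without ground-truth vulnerable lines
            continue
        total += 1
        if set(ranked[:k]) & set(vul):  # any overlap in the top-k counts as a hit
            hits += 1
    return hits / total if total else 0.0


# toy example: sample 1 has line 7 in its top-2 (hit), sample 2 does not (miss)
ranked = [[3, 7, 1], [2, 4, 9]]
truth = [[7], [5]]
print(hit_rate(ranked, truth, k=2))  # 0.5
```

Samples whose ground truth is empty are excluded from the denominator, so the metric is only computed over functions that actually have labeled vulnerable lines.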
-------------------------------------------------------------------------------- /Devign_Reveal_model/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from data_loader import n_identifier, g_identifier, l_identifier 4 | import inspect 5 | from datetime import datetime 6 | 7 | 8 | def load_default_identifiers(n, g, l): 9 | if n is None: 10 | n = n_identifier 11 | if g is None: 12 | g = g_identifier 13 | if l is None: 14 | l = l_identifier 15 | return n, g, l 16 | 17 | 18 | def initialize_batch(entries, batch_size, shuffle=False): 19 | total = len(entries) 20 | indices = np.arange(total)  # one index per entry (np.arange's stop is exclusive) 21 | if shuffle: 22 | np.random.shuffle(indices) 23 | batch_indices = [] 24 | start = 0 25 | end = len(indices) 26 | curr = start 27 | while curr < end: 28 | c_end = curr + batch_size 29 | if c_end > end: 30 | c_end = end 31 | batch_indices.append(indices[curr:c_end]) 32 | curr = c_end 33 | return batch_indices[::-1] 34 | 35 | 36 | def tally_param(model): 37 | total = 0 38 | for param in model.parameters(): 39 | total += param.data.nelement() 40 | return total 41 | 42 | 43 | def debug(*msg, sep='\t'): 44 | caller = inspect.stack()[1] 45 | file_name = caller.filename 46 | ln = caller.lineno 47 | now = datetime.now() 48 | time = now.strftime("%m/%d/%Y - %H:%M:%S") 49 | print('[' + str(time) + '] File \"' + file_name + '\", line ' + str(ln) + ' ', end='\t') 50 | for m in msg: 51 | print(m, end=sep) 52 | print('') 53 | -------------------------------------------------------------------------------- /IVDetect_model/gen_graphs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from torch.nn.utils.rnn import pack_sequence 4 | from tqdm import tqdm 5 | 6 | import utils.process as process 7 | import pandas as pd 8 | import torch 9 | from torch_geometric.data import Data 10 | import vul_model 11 | 12 | 13 | def generate_glove_file(data):
14 | print("Generating glove file") 15 | sample_big_data_pdg = process.collect_code_data(data) 16 | sample_big_data_ast = process.collect_tree_info(data) 17 | with open('glove/msr_pdg_word.txt', 'w') as file: 18 | for sentence in sample_big_data_pdg: 19 | for token in sentence: 20 | file.write(f'{token} ') 21 | with open('glove/msr_ast_word.txt', 'w') as file: 22 | for sentence in sample_big_data_ast: 23 | for token in sentence: 24 | file.write(f'{token} ') 25 | 26 | 27 | # this function check all nodes in pdg graphs, if src and des 28 | # all in ast nodes,then keep it, else discard 29 | # return a new graph + mapping of old_node_index -> new_node_index 30 | def clean_graph(_pdg_graph, _ast_nodes): 31 | node_list = [] 32 | all_nodes = torch.flatten(_pdg_graph) 33 | for node in all_nodes: 34 | if node in _ast_nodes: 35 | if node not in node_list: 36 | node_list.append(node.item()) 37 | # sort all ast nodes 38 | node_list.sort() 39 | index_list = list(range(0, len(node_list))) 40 | new_dict = dict(zip(node_list, index_list)) 41 | new_src_nodes = [] 42 | new_des_nodes = [] 43 | for i in range(len(_pdg_graph[0])): 44 | src_node = _pdg_graph[0][i] 45 | des_node = _pdg_graph[1][i] 46 | # source and destination all in ast nodes 47 | if src_node in node_list and des_node in node_list: 48 | new_src_nodes.append(new_dict[src_node.item()]) 49 | new_des_nodes.append(new_dict[des_node.item()]) 50 | edge_index = torch.tensor([new_src_nodes, new_des_nodes], dtype=torch.long) 51 | return edge_index, new_dict 52 | 53 | 54 | if __name__ == '__main__': 55 | data_file = "MSR_data/msr_with_vul_type.csv" 56 | # data = process.read_data(data_file, 100) 57 | data = pd.read_csv(data_file) 58 | print(data.info()) 59 | # generate_glove_file(data) 60 | # exit() 61 | from gensim.test.utils import get_tmpfile 62 | from gensim.models import KeyedVectors 63 | from gensim.scripts.glove2word2vec import glove2word2vec 64 | 65 | embedding_size = 100 66 | 67 | ast_glove_file_dir = 
f'{os.getcwd()}/glove/msr_ast_vectors.txt' 68 | ast_tmp_file_dir = f'{os.getcwd()}/glove/msr_ast_gensim.txt' 69 | pdg_glove_file_dir = f'{os.getcwd()}/glove/msr_pdg_vectors.txt' 70 | pdg_tmp_file_dir = f'{os.getcwd()}/glove/msr_pdg_gensim.txt' 71 | ast_temp_file = get_tmpfile(ast_tmp_file_dir) 72 | pdg_temp_file = get_tmpfile(pdg_tmp_file_dir) 73 | if not os.path.isfile(ast_tmp_file_dir): 74 | glove2word2vec(ast_glove_file_dir, ast_temp_file) 75 | if not os.path.isfile(pdg_tmp_file_dir): 76 | glove2word2vec(pdg_glove_file_dir, pdg_temp_file) 77 | 78 | ast_glove_vector = KeyedVectors.load_word2vec_format(ast_temp_file) 79 | pdg_glove_vector = KeyedVectors.load_word2vec_format(pdg_temp_file) 80 | 81 | # model = vul_model.Vulnerability(h_size=100, num_node_feature=5, num_classes=2, 82 | # feature_representation_size=100, 83 | # drop_out_rate=0, num_conv_layers=2) 84 | # model.eval() 85 | # torch.no_grad() 86 | 87 | # generate graphs for torch-geometric 88 | pdg_graphs = process.collect_pdg(data) 89 | 90 | # fea 1: sub token of each line of code -> embedding -> average 91 | feas_1 = process.generate_feature_1(data, pdg_glove_vector, embedding_size) 92 | # fea 2: tree-lstm of each line of code in AST 93 | feas_2 = process.generate_feature_2(data, ast_glove_vector, embedding_size) 94 | # fea 3: variable name and variable type of each line in AST 95 | # feas_3 = process.generate_feature_3(data, pdg_glove_vector, 100) 96 | feas_3 = process.generate_feature_3(data, ast_glove_vector, embedding_size) 97 | # fea 4: control dependency CDG context in AST 98 | feas_4 = process.generate_feature_4(data, pdg_glove_vector, embedding_size) 99 | # fea 5: data dependency DDG context in AST 100 | feas_5 = process.generate_feature_5(data, pdg_glove_vector, embedding_size) 101 | 102 | starting_index = 0 # some graph can't proceed, use a extra index to generate and store graph 103 | wrong_cumulate = 1 104 | for i in tqdm(range(0, len(data))): 105 | try: 106 | # get each feature 107 | fea_1 
= feas_1[i] 108 | fea_2 = feas_2[i] 109 | fea_3 = feas_3[i] 110 | fea_4 = feas_4[i] 111 | fea_5 = feas_5[i] 112 | 113 | valid_fea = True 114 | for fea_index, fea in enumerate([fea_1, fea_2, fea_3, fea_4, fea_5]): 115 | if fea is None: 116 | valid_fea = False 117 | print(f'feature {fea_index} in data {i} is None, total wrong {wrong_cumulate}') 118 | if not valid_fea: 119 | wrong_cumulate += 1 120 | continue 121 | 122 | code = data.at[i, "code"] 123 | loc = len(code.splitlines()) 124 | # skip large file 125 | if loc > 500 or loc < 5: 126 | continue 127 | # get all nodes from ast graph 128 | ast_nodes = list(fea_2.keys()) 129 | raw_pdg_graph = pdg_graphs[i] 130 | # clean the pdg graph, all node in pdg graph must also shown in the AST nodes 131 | # point exist in ast+pdg will be preserved 132 | new_pdg_graph, mapping = clean_graph(raw_pdg_graph, ast_nodes) 133 | # Create a PyG Data object 134 | graph_i = Data(edge_index=new_pdg_graph) 135 | # clean generated features 136 | new_fea_1 = [] 137 | new_fea_2 = [] 138 | new_fea_3 = [] 139 | new_fea_4 = [] 140 | new_fea_5 = [] 141 | mapping_key_list = list(mapping.keys()) 142 | if len(mapping_key_list) == 0: 143 | continue 144 | # calculate the new_pdg/ast ratio, discard if too small 145 | ratio = len(mapping) / len(fea_2) 146 | if ratio < 0.3: 147 | continue 148 | # for each line in feature, if line also in the cleaned pdg graph, add its feature to the list 149 | for j in range(len(mapping)): 150 | # get key from the mapping 151 | key = mapping_key_list[j] 152 | # get feature representation of line j 153 | fea1_j = fea_1[key - 1] 154 | if len(fea1_j) == 0: 155 | new_fea_1.append(torch.from_numpy(np.stack(np.zeros(embedding_size)))) 156 | else: 157 | new_fea_1.append(torch.from_numpy(np.stack(fea1_j))) 158 | new_fea_2.append(fea_2[key]) 159 | # feature 3,4,5 have different key 160 | key_in_str = f'{key}" ' 161 | if key_in_str in fea_3.keys(): 162 | new_fea_3.append(torch.from_numpy(np.stack(fea_3[key_in_str]))) 163 | else: 
164 | new_fea_3.append(torch.from_numpy(np.stack([np.zeros(embedding_size)]))) 165 | if key_in_str in fea_4.keys(): 166 | fea4_j = fea_4[key_in_str] 167 | if len(fea4_j) == 0: 168 | new_fea_4.append(torch.from_numpy(np.stack([np.zeros(embedding_size)]))) 169 | else: 170 | new_fea_4.append(torch.from_numpy(np.stack(fea4_j))) 171 | else: 172 | new_fea_4.append(torch.from_numpy(np.stack([np.zeros(embedding_size)]))) 173 | if key_in_str in fea_5.keys(): 174 | fea5_j = fea_5[key_in_str] 175 | if len(fea5_j) == 0: 176 | new_fea_5.append(torch.from_numpy(np.stack([np.zeros(embedding_size)]))) 177 | else: 178 | new_fea_5.append(torch.from_numpy(np.stack(fea5_j))) 179 | else: 180 | new_fea_5.append(torch.from_numpy(np.stack([np.zeros(embedding_size)]))) 181 | # padding 182 | new_fea_1 = pack_sequence(new_fea_1, enforce_sorted=False) 183 | new_fea_3 = pack_sequence(new_fea_3, enforce_sorted=False) 184 | new_fea_4 = pack_sequence(new_fea_4, enforce_sorted=False) 185 | new_fea_5 = pack_sequence(new_fea_5, enforce_sorted=False) 186 | # now all features have the same number of lines, format: [lines,sequences,embedding] 187 | graph_i.my_data = [new_fea_1, new_fea_2, new_fea_3, new_fea_4, new_fea_5] 188 | vul = data.at[i, "bug"] 189 | graph_i.y = torch.tensor([vul], dtype=int) 190 | graph_i.code = code 191 | graph_i.cve_type = data.at[i, "cve"] 192 | graph_i.cwe_type = data.at[i, "cwe"] 193 | 194 | # valid data by passing to sample model 195 | # model(graph_i.my_data,graph_i.edge_index) 196 | 197 | # no train-test set here, save each data point one by one 198 | torch.save(graph_i, f'{os.getcwd()}/MSR_data/pyg_graph/data_{starting_index}.pt') 199 | starting_index += 1 200 | # run train_test_valid.py to split the graphs 201 | except: 202 | print(f'error in {i}') -------------------------------------------------------------------------------- /IVDetect_model/joern.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WIP2022/DataSampling4DLVD/a01d6cb246bfd4d2fd46448821bf457e9513b82a/IVDetect_model/joern.zip -------------------------------------------------------------------------------- /IVDetect_model/latent_fine_tune.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2cf15078", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch\n", 11 | "import torch.nn as nn" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "d81b485e", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \\\n", 22 | " average_precision_score, classification_report, roc_curve, auc, top_k_accuracy_score, ndcg_score" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "4ecb9aa5", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from tqdm import tqdm" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "51360350", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from torch.utils.data import TensorDataset,Dataset,DataLoader" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "b7dc0cfb", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "def my_metric(all_predictions, all_targets, all_probs):\n", 53 | " fpr, tpr, _ = roc_curve(all_targets, all_probs)\n", 54 | " auc_score = round(auc(fpr, tpr) * 100, 2)\n", 55 | " acc = round(accuracy_score(all_targets, all_predictions) * 100, 2)\n", 56 | " precision = round(precision_score(all_targets, all_predictions) * 100, 2)\n", 57 | " f1 = round(f1_score(all_targets, all_predictions) * 100, 2)\n", 58 | " recall = round(recall_score(all_targets, all_predictions) * 100, 2)\n", 59 | " matrix = 
confusion_matrix(all_targets, all_predictions)\n", 60 | " target_names = ['non-vul', 'vul']\n", 61 | " report = classification_report(all_targets, all_predictions, target_names=target_names)\n", 62 | " result = f'auc: {auc_score} acc: {acc} precision: {precision} recall: {recall} f1: {f1} \\nreport:\\n{report}\\nmatrix:\\n{matrix}'\n", 63 | " print(result) " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "7032fa75", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "ep = 9" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "409a0a8a", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "train_X = torch.load(f'smote_data/stage_1_output/train_X_ep_{ep}.dt')\n", 84 | "train_y = torch.load(f'smote_data/stage_1_output/train_y_ep_{ep}.dt')" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "d7cf7a4e", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "test_X = torch.load(f'smote_data/stage_1_output/test_X_ep_{ep}.dt')\n", 95 | "test_y = torch.load(f'smote_data/stage_1_output/test_y_ep_{ep}.dt')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "6bbb5958", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "print(train_X.shape,train_y.shape,test_X.shape,test_y.shape)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "4d917761", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from imblearn.over_sampling import SMOTE\n", 116 | "sm = SMOTE(random_state=42)\n", 117 | "train_X_sampled, train_y_sampled = sm.fit_resample(train_X, train_y)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "a0d3e728", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "print(train_X_sampled.shape,train_y_sampled.shape)" 128 | ] 129 | }, 130 | { 131 | 
"cell_type": "code", 132 | "execution_count": null, 133 | "id": "ecafba6a", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from sklearn.neural_network import MLPClassifier \n", 138 | "mlp = MLPClassifier(solver='adam',alpha=1e-5,hidden_layer_sizes=100,random_state=42,max_iter=1000)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "47e9db2a", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "mlp.fit(train_X_sampled,train_y_sampled)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "11c51ba1", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "predictions_mlp = mlp.predict(test_X)\n", 159 | "probs_mlp = mlp.predict_proba(test_X)\n", 160 | "probs_mlp = probs_mlp[:,1]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "57e28e79", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "my_metric(predictions_mlp, test_y.squeeze(1), probs_mlp)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "4a4fe0bf", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.8.8" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /IVDetect_model/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from sklearn.metrics 
import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ 4 | classification_report, roc_curve, auc 5 | from torch import nn 6 | from IVDetect_model import vul_model 7 | from tqdm import tqdm 8 | from torch.utils.data import DataLoader, Dataset 9 | import argparse 10 | import numpy as np 11 | from imblearn.over_sampling import RandomOverSampler 12 | from imblearn.under_sampling import RandomUnderSampler, OneSidedSelection 13 | import logging 14 | from torch.nn.utils.rnn import pad_packed_sequence 15 | from operator import itemgetter 16 | 17 | 18 | def extractDigits(lst): 19 | res = [] 20 | for el in lst: 21 | sub = el.split(', ') 22 | res.append(sub) 23 | return (res) 24 | 25 | 26 | class MyDatset(Dataset): 27 | def __init__(self, _datapoint_files, sampling=None): 28 | if sampling == None: 29 | self.datapoint_files = _datapoint_files 30 | elif sampling == 'ros': 31 | print("sample ros") 32 | list_of_targets = [] 33 | for file in tqdm(_datapoint_files): 34 | graph = torch.load(file) 35 | list_of_targets.append(graph.y.numpy()[0]) 36 | print(f'before sample 0:{(list_of_targets.count(0))},1:{(list_of_targets.count(1))}') 37 | train_files_list = extractDigits(_datapoint_files) 38 | ros = RandomOverSampler(random_state=42) 39 | train_files_list_resampled, list_of_targets_resampled = ros.fit_resample(train_files_list, list_of_targets) 40 | print(f'after sample 0:{(list_of_targets_resampled.count(0))},1:{(list_of_targets_resampled.count(1))}') 41 | flat_list = [item for sublist in train_files_list_resampled for item in sublist] 42 | self.datapoint_files = flat_list 43 | elif sampling == 'rus': 44 | print("sample rus") 45 | list_of_targets = [] 46 | for file in tqdm(_datapoint_files): 47 | graph = torch.load(file) 48 | list_of_targets.append(graph.y.numpy()[0]) 49 | print(f'before sample 0:{(list_of_targets.count(0))},1:{(list_of_targets.count(1))}') 50 | train_files_list = extractDigits(_datapoint_files) 51 | rus = 
RandomUnderSampler(random_state=42) 52 | train_files_list_resampled, list_of_targets_resampled = rus.fit_resample(train_files_list, list_of_targets) 53 | print(f'after sample 0:{(list_of_targets_resampled.count(0))},1:{(list_of_targets_resampled.count(1))}') 54 | flat_list = [item for sublist in train_files_list_resampled for item in sublist] 55 | self.datapoint_files = flat_list 56 | elif sampling == 'oss': 57 | print("sample oss") 58 | list_of_targets = [] 59 | list_of_features = [] 60 | for file in tqdm(_datapoint_files): 61 | graph = torch.load(file) 62 | list_of_targets.append(graph.y.numpy()[0]) 63 | feature = graph.my_data[0] 64 | feature, _ = pad_packed_sequence(feature) 65 | feature = feature.numpy() 66 | feature = np.average(feature, axis=1) 67 | feature = np.average(feature, axis=0) 68 | list_of_features.append(feature) 69 | feature_list_np = np.array(list_of_features) 70 | print(f'before sample 0:{(list_of_targets.count(0))},1:{(list_of_targets.count(1))}') 71 | oss = OneSidedSelection(random_state=42) 72 | _, list_of_targets_resampled = oss.fit_resample(feature_list_np, list_of_targets) 73 | print(f'after sample 0:{(list_of_targets_resampled.count(0))},1:{(list_of_targets_resampled.count(1))}') 74 | 75 | flat_list = list(itemgetter(*(oss.sample_indices_))(_datapoint_files)) 76 | self.datapoint_files = flat_list 77 | 78 | def __getitem__(self, index): 79 | graph_file = f'{self.datapoint_files[index]}' 80 | graph = torch.load(graph_file) 81 | return graph 82 | 83 | def __len__(self): 84 | return len(self.datapoint_files) 85 | 86 | 87 | def my_metric(all_predictions, all_targets, all_probs): 88 | fpr, tpr, _ = roc_curve(all_targets, all_probs) 89 | auc_score = round(auc(fpr, tpr) * 100, 2) 90 | acc = round(accuracy_score(all_targets, all_predictions) * 100, 2) 91 | precision = round(precision_score(all_targets, all_predictions) * 100, 2) 92 | f1 = round(f1_score(all_targets, all_predictions) * 100, 2) 93 | recall = round(recall_score(all_targets, 
all_predictions) * 100, 2) 94 | matrix = confusion_matrix(all_targets, all_predictions) 95 | target_names = ['non-vul', 'vul'] 96 | report = classification_report(all_targets, all_predictions, target_names=target_names) 97 | result = f'auc: {auc_score} acc: {acc} precision: {precision} recall: {recall} f1: {f1} \nreport:\n{report}\nmatrix:\n{matrix}' 98 | return result 99 | 100 | 101 | if __name__ == '__main__': 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument('--processed_dir', type=str, help='dir of processed split datapoints', 104 | default='data_1/') 105 | parser.add_argument('--sampling', type=str, help='sampling method', 106 | default=None) 107 | parser.add_argument('--out_dir', type=str, help='output of trained model state', 108 | default='MSR_data/') 109 | # parser.add_argument('--data_split', type=str, help='data split id', 110 | # default='0') 111 | parser.add_argument("--train_shards", nargs="*", type=int, default=[0, 1, 2, 3]) 112 | parser.add_argument("--test_shards", nargs="*", type=int, default=[4]) 113 | parser.add_argument('-d', '--device', type=str, default='cuda:0') 114 | args = parser.parse_args() 115 | params = {'hidden_size': 100, 'lr': 1e-4, 'dropout_rate': 0.5, 'epochs': 20, 'num_conv_layers': 3} 116 | logging.basicConfig(filename=args.out_dir + f'/train_{args.train_shards}_test_{args.test_shards}.log', 117 | encoding='utf-8', level=logging.DEBUG) 118 | print(f'device use:', args.device) 119 | print(f'sampling : {args.sampling}') 120 | logging.info(f'sampling : {args.sampling}') 121 | print(f'curr dir: {os.getcwd()} ,reading processed datas from {args.processed_dir}') 122 | logging.info(f'curr dir: {os.getcwd()} ,reading processed datas from {args.processed_dir}') 123 | 124 | train_files = [] 125 | test_files = [] 126 | for i in args.train_shards: 127 | train_path = args.processed_dir + f'{i}/' 128 | train_files.extend([train_path + f for f in os.listdir(train_path) if 129 | os.path.isfile(os.path.join(train_path, f))]) 130 
| train_dataset = MyDatset(train_files, sampling=args.sampling) 131 | for i in args.test_shards: 132 | test_path = args.processed_dir + f'{i}/' 133 | test_files.extend([test_path + f for f in os.listdir(test_path) if 134 | os.path.isfile(os.path.join(test_path, f))]) 135 | test_dataset = MyDatset(test_files) 136 | 137 | print(f'train {len(train_dataset)} test {len(test_dataset)}') 138 | # print(f'{args.out_dir}') 139 | logging.info(f'train {len(train_dataset)} test {len(test_dataset)}') 140 | train_loader = DataLoader(train_dataset, batch_size=None, batch_sampler=None, 141 | shuffle=True) # todo: shuffle set to False for testing 142 | test_loader = DataLoader(test_dataset, batch_size=None, batch_sampler=None, shuffle=False) 143 | max_epochs = params['epochs'] 144 | device = args.device 145 | backbone_model = vul_model.Vulnerability_backbone(h_size=params['hidden_size'], num_node_feature=5, num_classes=2, 146 | feature_representation_size=params['hidden_size'], 147 | drop_out_rate=params['dropout_rate'], 148 | num_conv_layers=params['num_conv_layers']) 149 | 150 | pretrain_model = vul_model.backbone_mlp(backbone_model, params['hidden_size']) 151 | 152 | pytorch_total_params = sum(p.numel() for p in pretrain_model.parameters()) 153 | print('total parameter:', pytorch_total_params) 154 | pretrain_optimizer = torch.optim.Adam(pretrain_model.parameters(), lr=params['lr']) 155 | learning_rate = params['lr'] 156 | # exit() 157 | criterion = nn.CrossEntropyLoss() 158 | if device != 'cpu': 159 | pretrain_model.to(device) 160 | for epoch in range(1, max_epochs): 161 | print(f'pretrain epochs {epoch}') 162 | logging.info(f'pretrain epochs {epoch}') 163 | pretrain_model.train() 164 | train_code = [] 165 | train_X = [] 166 | train_y = [] 167 | for index, graph in enumerate(tqdm(train_loader, desc='train')): 168 | # print(graph) 169 | # exit() 170 | if device != 'cpu': 171 | graph = graph.to(device) 172 | target = graph.y 173 | pretrain_optimizer.zero_grad() 174 | out = 
pretrain_model(graph.my_data, graph.edge_index) 175 | loss = criterion(out, target) 176 | loss.backward() 177 | pretrain_optimizer.step() 178 | train_X.append(out.cpu().detach().numpy()) 179 | train_y.append(target.cpu().detach().numpy()) 180 | print(f'pretrain epochs {epoch} finish') 181 | logging.info(f'pretrain epochs {epoch} finish') 182 | torch.save(pretrain_model.state_dict(), f'{args.out_dir}_pretrain_ep_{epoch}.dt') 183 | train_y_np = np.array(train_y) 184 | train_X_np = np.array(train_X) 185 | train_X_np = np.squeeze(train_X_np, axis=1) 186 | torch.save(train_code, f'{args.out_dir}_trainCode_ep_{epoch}.dt') 187 | torch.save(train_X_np, f'{args.out_dir}_trainX_ep_{epoch}.dt') 188 | torch.save(train_y_np, f'{args.out_dir}_trainy_ep_{epoch}.dt') 189 | with torch.no_grad(): 190 | pretrain_model.eval() 191 | # train_code = [] 192 | # train_X = [] 193 | # train_y = [] 194 | # for index, graph in enumerate(tqdm(train_loader),desc="train"): 195 | # train_code.append(graph.code) 196 | # if device != 'cpu': 197 | # graph = graph.to(device) 198 | # target = graph.y 199 | # out = pretrain_model.backbone(graph.my_data, graph.edge_index) 200 | # # print(out.shape) 201 | # train_X.append(out.cpu().detach().numpy()) 202 | # train_y.append(target.cpu().detach().numpy()) 203 | # train_y_np = np.array(train_y) 204 | # train_X_np = np.array(train_X) 205 | # train_X_np = np.squeeze(train_X_np,axis = 1) 206 | # torch.save(train_code,f'{args.out_dir}_trainCode_ep_{epoch}.dt') 207 | # torch.save(train_X_np,f'{args.out_dir}_trainX_ep_{epoch}.dt') 208 | # torch.save(train_y_np,f'{args.out_dir}_trainy_ep_{epoch}.dt') 209 | test_code = [] 210 | # test_cve = [] 211 | # test_cwe = [] 212 | test_X = [] 213 | test_y = [] 214 | all_predictions, all_targets, all_probs = [], [], [] 215 | for index, graph in enumerate(tqdm(test_loader, desc='test')): 216 | test_code.append(graph.code) 217 | # test_cve.append(graph.cve_type) 218 | # test_cwe.append(graph.cwe_type) 219 | if device != 'cpu': 
220 | graph = graph.to(device) 221 | target = graph.y 222 | latent_out = pretrain_model.backbone(graph.my_data, graph.edge_index) 223 | out = pretrain_model(graph.my_data, graph.edge_index) 224 | pred = out.argmax(dim=1).cpu().detach().numpy() 225 | prob_1 = out.cpu().detach().numpy()[0][1] 226 | # print(out.shape) 227 | test_X.append(latent_out.cpu().detach().numpy()) 228 | test_y.append(target.cpu().detach().numpy()) 229 | all_probs.append(prob_1) 230 | all_predictions.append(pred) 231 | all_targets.append(target.cpu().detach().numpy()) 232 | test_y = np.array(test_y) 233 | test_X = np.array(test_X) 234 | test_X = np.squeeze(test_X, axis=1) 235 | torch.save(test_code, f'{args.out_dir}_testCode_ep_{epoch}.dt') 236 | torch.save(test_X, f'{args.out_dir}_testX_ep_{epoch}.dt') 237 | torch.save(test_y, f'{args.out_dir}_testy_ep_{epoch}.dt') 238 | torch.save(all_predictions, f'{args.out_dir}_testPreds_ep_{epoch}.dt') 239 | torch.save(all_targets, f'{args.out_dir}_testTargets_ep_{epoch}.dt') 240 | torch.save(all_probs, f'{args.out_dir}_testProbs_ep_{epoch}.dt') 241 | # torch.save(# test_cve,f'{args.out_dir}_testCve_ep_{epoch}.dt') 242 | # torch.save(# test_cwe,f'{args.out_dir}_testCwe_ep_{epoch}.dt') 243 | -------------------------------------------------------------------------------- /IVDetect_model/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json as js 3 | import re 4 | import os, sys, string, re, glob 5 | import subprocess 6 | import tempfile 7 | import pickle 8 | from multiprocessing import Pool 9 | 10 | def generate_prolog(testcase, id_num, project): 11 | # change joern home dir here 12 | joern_home = "/joern_bc/" 13 | tmp_dir = tempfile.TemporaryDirectory() 14 | short_filename = str(id_num) + ".cpp" 15 | with open(tmp_dir.name + "/" + short_filename, 'w') as f: 16 | f.write(testcase) 17 | # print(short_filename) 18 | subprocess.check_call( 19 | "cd " + joern_home + "&& 
./joern-parse " + tmp_dir.name + " --out " + tmp_dir.name + "/cpg.bin.zip", 20 | shell=True, 21 | stdout=subprocess.DEVNULL, 22 | stderr=subprocess.STDOUT) 23 | 24 | tree = subprocess.check_output( 25 | "cd "+joern_home +"&& ./joern --script joern_cfg_to_dot.sc --params cpgFile=" + tmp_dir.name + "/cpg.bin.zip", 26 | shell=True, 27 | universal_newlines=True) 28 | # subprocess.check_call( 29 | # "cd " + joern_home + "&& ./joern-export " + tmp_dir.name + "/cpg.bin.zip" + " --repr pdg --out " + os.getcwd() + "/pdg/" + project + "/" + str( 30 | # id_num),shell=True) 31 | # pos = tree.find("% FEATURE") 32 | pos = tree.find("digraph g") 33 | print(pos) 34 | if pos > 0: 35 | tree = tree[pos:] 36 | tmp_dir.cleanup() 37 | return tree 38 | 39 | 40 | def gen(_data,_i): 41 | # change file name here 42 | file_name = f'/MSR_data/raw/{_i}.pkl' 43 | if os.path.isfile(file_name): 44 | return 45 | print(f'IN -> {_i}') 46 | try: 47 | tree = generate_prolog(_data[1], _i, "Fan") 48 | _data.append(tree) 49 | with open(file_name, 'wb') as f: 50 | pickle.dump(_data, f) 51 | except: 52 | print(f'fail -> {_i}') 53 | 54 | if __name__ == '__main__': 55 | 56 | # # Reveal Dataset 57 | Rule1 = "\/\*[\s\S]*\*\/" 58 | Rule2 = "\/\/.*" 59 | # c1 = re.compile(Rule1) 60 | # data_1 = open("./raw_data/vulnerables.json") 61 | # all_functions_1 = js.load(data_1) 62 | # data_1_storage = [] 63 | # for function_1 in all_functions_1: 64 | # code = function_1["code"] 65 | # code = re.sub(Rule1, "", re.sub(Rule2, "", code)) 66 | # data_line = [1, code, ""] 67 | # data_1_storage.append(data_line) 68 | # data_1_ = open("./raw_data/non-vulnerables.json") 69 | # all_functions_1_ = js.load(data_1_) 70 | # for function_1_ in all_functions_1_: 71 | # code = function_1_["code"] 72 | # code = re.sub(Rule1, "", re.sub(Rule2, "", code)) 73 | # data_line = [0, code, ""] 74 | # data_1_storage.append(data_line) 75 | 76 | #Fan et al. 
dataset 77 | all_functions_2 = pd.read_csv("./raw_data/MSR_data_cleaned.csv") 78 | print(all_functions_2.info()) 79 | print(all_functions_2.vul.value_counts()) 80 | data_2_storage = [] 81 | # exit() 82 | for i, j in all_functions_2.iterrows(): 83 | code_1 = j[25] 84 | # code_2 = j[26] 85 | cve = j[5] 86 | cwe = j[7] 87 | # assert (code_1 != code_2) 88 | code_1 = re.sub(Rule1, "", re.sub(Rule2, "", code_1)) 89 | # code_2 = re.sub(Rule1, "", re.sub(Rule2, "", code_2)) 90 | data_2_storage.append([int(j[34]), code_1,cve,cwe]) 91 | 92 | # # FFMpeg+Qemu dataset 93 | # data_3 = open("./data/function.json") 94 | # all_functions_3 = js.load(data_3) 95 | # data_3_storage = [] 96 | # for function_3 in all_functions_3: 97 | # code = function_3["func"] 98 | # code = re.sub(Rule1, "", re.sub(Rule2, "", code)) 99 | # label = function_3["target"] 100 | # data_line = [label, code, ""] 101 | # data_3_storage.append(data_line) 102 | # 103 | # for i in range(len(data_1_storage)): 104 | # tree = generate_prolog(data_1_storage[i][1], i, "Reveal") 105 | # data_1_storage[i].append(tree) 106 | # print(data_1_storage[0]) 107 | 108 | 109 | 110 | # pool = Pool() 111 | # pool.starmap(gen, zip(data_2_storage, range(0,len(data_2_storage)))) 112 | for i in range(len(data_2_storage)): 113 | tree = generate_prolog(data_2_storage[i][1], i, "Fan") 114 | data_2_storage[i].append(tree) 115 | # change file name here 116 | file_name = f'IVdetect/MSR_data/raw/{i}_raw.pkl' 117 | print(f'in -> {i}') 118 | with open(file_name, 'wb') as f: 119 | pickle.dump(data_2_storage[i], f) 120 | # for i in range(len(data_3_storage)): 121 | # tree = generate_prolog(data_3_storage[i][1], i, "FFMpeg") 122 | # data_3_storage[i].append(tree) 123 | # df = pd.DataFrame(data_2_storage, columns=['bug', 'code', 'trees']) 124 | # 125 | # df.to_csv('all_msr_data.csv',index=True) -------------------------------------------------------------------------------- /IVDetect_model/readme.md:
-------------------------------------------------------------------------------- 1 | # IVDetect readme 2 | This repo is based on the official GitHub repo of [IVDetect](https://github.com/vulnerabilitydetection/VulnerabilityDetectionResearch). 3 | 4 | This implementation also requires a specific version of Joern and a specific Joern script to extract the graphs; please see the link above to the official IVDetect GitHub. 5 | However, we provide a pre-processed dataset (as stated in the main readme file) on Zenodo so that you can skip the Joern step. 6 | 7 | ## 0 pre-process with joern (optional) 8 | This step extracts graphs from code. We provide a pre-processed dataset, but if you want to use your own dataset, you need this step. 9 | 1. unzip `joern.zip` 10 | 2. prepare the raw datasets 11 | 3. check the code in `preprocess.py` and change the dataset directory, Joern directory, and destination folder 12 | 4. run `preprocess.py` 13 | 14 | **Starting from here, you need the pre-processed dataset to continue.** 15 | ## 1 generate glove 16 | 17 | Embed word->vec with GloVe: 18 | 1. download GloVe to this dir 19 | 2. uncomment line 75 in `gen_graphs.py` 20 | 3. run `gen_graphs.py` 21 | 4. run `glove/ast.sh` and `glove/phg.sh` to generate the GloVe embeddings 22 | 23 | ## 2 generate graph file 24 | 1. run `gen_graphs.py` 25 | 1. line#171 for the output dir 26 | 27 | ## 3 train/test split 28 | 1. run `shard_splitter.py` 29 | 30 | The shard splitter will split the dataset into 5 shards; use 4 for training and 1 for testing (or any split you prefer). 31 | 32 | ## 4. model training & **raw code level sampling** 33 | 1. `--proceed_dir` is where the shards are 34 | 2. `--sampling` to do **Sampling_R (ROS_R or RUS_R)** 35 | 36 | ## 5. parameter 37 | 1. line#175 for parameters 38 | 2. uncomment all `#nni` lines and comment line#195 for NNI autoML 39 | 1. `nnictl create --config config.yml` etc. 40 | 2. see Microsoft NNI for more autoML fine-tuning details 41 | 42 | ## 6. **latent level sampling** + retrain final classifier 43 | 1.
use the `train_X, train_y, test_X, test_y` vectors (in numpy format) obtained from the previous step, which should be trained on the original imbalanced data. 44 | 2. see the example notebook in `latent_fine_tune.ipynb` 45 | 3. change the imbalanced-learn class to another latent level sampling approach to do **Sampling_L** 46 | 47 | ## 7. example result 48 | ``` 49 | train loss: 0.6700113380164868 acc: 0.579321892005023 50 | evaluate > 51 | 54.24 52 | auc: 59.92 acc: 54.24 precision: 49.77 recall: 72.11 f1: 58.89 53 | report: 54 | precision recall f1-score support 55 | 56 | non-vul 0.63 0.39 0.48 1433 57 | vul 0.50 0.72 0.59 1194 58 | 59 | accuracy 0.54 2627 60 | macro avg 0.56 0.56 0.54 2627 61 | weighted avg 0.57 0.54 0.53 2627 62 | 63 | matrix: 64 | [[564 869] 65 | [333 861]] 66 | ``` -------------------------------------------------------------------------------- /IVDetect_model/shard_splitter.py: -------------------------------------------------------------------------------- 1 | import os # Used to do path manipulations 2 | import shutil # Used to copy files 3 | import random 4 | 5 | 6 | # Taken from https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks 7 | def chunks(lst: list, n: int) -> list: 8 | """Yield successive n-sized chunks from lst.""" 9 | for i in range(0, len(lst), n): 10 | yield lst[i:i + n] 11 | 12 | 13 | def get_abspath_files_in_directory(directory: str) -> list: 14 | """Takes in a directory and returns the abspath of all the files in the directory as a list 15 | 16 | Parameters 17 | ---------- 18 | directory : str 19 | The directory to get the abspaths for 20 | 21 | Returns 22 | ------- 23 | list 24 | A list of the abspaths of all the files in the directory 25 | """ 26 | return [os.path.abspath(os.path.join(directory, path)) for path in os.listdir(directory)] 27 | 28 | 29 | def split_to_subdirectories(file_paths: list, amount_per_folder: int): 30 | """Take in a list of file absolute paths, and copy them to folders 31 | 32
| Parameters 33 | ---------- 34 | file_paths : list 35 | The list of abspaths to the file folders 36 | 37 | amount_per_folder : int 38 | The amount of files per folder to split the files into 39 | """ 40 | file_paths = chunks(file_paths, amount_per_folder) 41 | 42 | for index, chunk in enumerate(file_paths): 43 | os.mkdir(str(index)) # Create a folder with a name of the current iteration index 44 | for file_path in chunk: 45 | file_name = file_path.split(os.sep)[-1] 46 | shutil.copy(file_path, os.path.join(str(index), file_name)) 47 | 48 | 49 | if __name__ == "__main__": 50 | file_paths = get_abspath_files_in_directory( 51 | "pyg_graph") # Replace "original_folder" with the directory where your files are stored 52 | random.shuffle(file_paths) 53 | shard_size = int(len(file_paths) / 5) 54 | print(shard_size) 55 | split_to_subdirectories(file_paths, shard_size) -------------------------------------------------------------------------------- /IVDetect_model/vul_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import GRU, Dropout, ReLU, ModuleList 4 | from torch.autograd import Variable as Var 5 | from torch.nn.utils.rnn import pad_packed_sequence 6 | from torch_geometric.nn import GCNConv 7 | from torch_geometric.nn import global_max_pool 8 | from torch_geometric.nn.models import GCN 9 | 10 | 11 | class ChildSumTreeLSTM(nn.Module): 12 | def __init__(self, in_dim, mem_dim, dropout1): 13 | # in_dim is the input dim and mem_dim is the output dim 14 | super(ChildSumTreeLSTM, self).__init__() 15 | self.in_dim = in_dim 16 | self.mem_dim = mem_dim 17 | self.ioux = nn.Linear(self.in_dim, 3 * self.mem_dim) 18 | self.iouh = nn.Linear(self.mem_dim, 3 * self.mem_dim) 19 | self.fx = nn.Linear(self.in_dim, self.mem_dim) 20 | self.fh = nn.Linear(self.mem_dim, self.mem_dim) 21 | # self.H = [] 22 | self.drop = nn.Dropout(dropout1) 23 | 24 | def node_forward(self, inputs, child_c, 
child_h): 25 | # print("input", inputs.shape) 26 | inputs = torch.unsqueeze(inputs, 0) 27 | # print("input unsqueeze",inputs.shape) 28 | child_h_sum = torch.sum(child_h, dim=0) 29 | iou = self.ioux(inputs) + self.iouh(child_h_sum) 30 | i, o, u = torch.split(iou, iou.size(1) // 3, dim=1) 31 | i, o, u = torch.sigmoid(i), torch.sigmoid(o), torch.tanh(u) 32 | f = torch.sigmoid(self.fh(child_h) + self.fx(inputs).repeat(len(child_h), 1)) 33 | fc = torch.mul(f, child_c) 34 | c = torch.mul(i, u) + torch.sum(fc, dim=0) 35 | h = torch.mul(o, torch.tanh(c)) 36 | # print(h.size()) 37 | # self.H.append(h) 38 | return c, h 39 | 40 | def forward(self, data): 41 | tree = data[0] 42 | inputs = data[1] 43 | # The inputs here are the tree structure built from class Tree and the input is a list of values with the 44 | # node ids as the key to store the tree values 45 | _ = [self.forward([tree.children[idx], inputs]) for idx in range(tree.num_children)] 46 | 47 | if tree.num_children == 0: 48 | # print("jere",type(inputs[0])) 49 | # print("before crash",inputs) 50 | child_c = Var(inputs[tree.id].data.new(1, self.mem_dim).fill_(0.)) 51 | child_h = Var(inputs[tree.id].data.new(1, self.mem_dim).fill_(0.)) 52 | else: 53 | child_c, child_h = zip(*map(lambda x: x.state, tree.children)) 54 | child_c, child_h = torch.cat(child_c, dim=0), torch.cat(child_h, dim=0) 55 | # print("id",tree.id,"input len",len(inputs)) 56 | tree.state = self.node_forward(inputs[tree.id], child_c, child_h) 57 | return tree.state 58 | 59 | 60 | class Vulnerability(torch.nn.Module): 61 | def __init__(self, h_size, num_node_feature, num_classes, feature_representation_size, drop_out_rate, 62 | num_conv_layers): 63 | super(Vulnerability, self).__init__() 64 | self.h_size = h_size 65 | self.num_node_feature = num_node_feature 66 | self.num_classes = num_classes 67 | self.feature_representation_size = feature_representation_size 68 | self.drop_out_rate = drop_out_rate 69 | self.layer_num = num_conv_layers 70 | # The 1th 
feature input (tree) 71 | self.tree_lstm = ChildSumTreeLSTM(self.feature_representation_size, self.h_size, self.drop_out_rate) 72 | # The 2th feature input (sequence) 73 | self.gru_1 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 74 | # The 3th feature input (sequence) 75 | self.gru_2 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 76 | # The 4th feature input (sequence) 77 | self.gru_3 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 78 | # The 5th feature input (sequence) 79 | self.gru_4 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 80 | # This layer is the bi-directional GRU layer 81 | self.gru_combine = GRU(input_size=self.h_size, hidden_size=self.h_size, bidirectional=True, batch_first=True) 82 | self.dropout = Dropout(self.drop_out_rate) 83 | # This layer is the GCN Model layer 84 | # h_size : in channels. self.num_classes: out channels(2) 85 | self.connect = nn.Linear(self.h_size * self.num_node_feature * 2, self.h_size) 86 | # modified: 从单层gcn换成多层gcn 87 | # self.convs = ModuleList() 88 | for i in range(self.layer_num): 89 | if i < self.layer_num - 1: 90 | # self.convs.append(GCNConv(self.h_size, self.h_size)) 91 | exec('self.conv_{} = GCNConv(self.h_size, self.h_size)'.format(i)) 92 | if i == self.layer_num - 1: 93 | # self.convs.append(GCNConv(self.h_size, self.num_node_feature)) 94 | exec('self.conv_{} = GCNConv(self.h_size, self.num_classes)'.format(i)) 95 | self.relu = ReLU(inplace=True) 96 | # self.conv = GCN(in_channels=self.h_size,hidden_channels=self.h_size,out_channels=self.num_classes,num_layers=1) 97 | 98 | def forward(self, my_data, edge_index): 99 | # Input data format: a list that contains main graph, feature 1, ..., feature 5. The feature 1 is tree 100 | # structured and features 2-5 are sequences. 
101 | feature_1 = my_data[1] 102 | feature_2 = my_data[0] 103 | feature_3 = my_data[2] 104 | feature_4 = my_data[3] 105 | feature_5 = my_data[4] 106 | feature_vec1 = None 107 | # for every statement, get its AST subtree 108 | for i in range(len(feature_1)): 109 | if i == 0: 110 | _, feature_vec1 = self.tree_lstm(feature_1[i]) 111 | else: 112 | _, feature_vec_temp = self.tree_lstm(feature_1[i]) 113 | feature_vec1 = torch.cat((feature_vec1, feature_vec_temp), 0) 114 | feature_vec1 = torch.reshape(feature_vec1, (-1, 1, self.h_size)) 115 | 116 | # pack for feature 2-5 is done in gen_graphs 117 | feature_2, _ = self.gru_1(feature_2.float()) 118 | feature_2, out_len = pad_packed_sequence(feature_2, batch_first=True) 119 | 120 | feature_3, _ = self.gru_2(feature_3.float()) 121 | feature_3, out_len = pad_packed_sequence(feature_3, batch_first=True) 122 | 123 | feature_4, _ = self.gru_3(feature_4.float()) 124 | feature_4, out_len = pad_packed_sequence(feature_4, batch_first=True) 125 | 126 | feature_5, _ = self.gru_4(feature_5.float()) 127 | feature_5, out_len = pad_packed_sequence(feature_5, batch_first=True) 128 | 129 | feature_input = torch.cat( 130 | (feature_vec1, feature_2[:, -1:, :], feature_3[:, -1:, :], feature_4[:, -1:, :], feature_5[:, -1:, :]), 1) 131 | 132 | feature_vec, _ = self.gru_combine(feature_input) 133 | feature_vec = self.dropout(feature_vec) 134 | feature_vec = torch.flatten(feature_vec, 1) 135 | feature_vec = self.connect(feature_vec) 136 | 137 | for i in range(self.layer_num): 138 | if i < self.layer_num - 1: 139 | feature_vec = eval('self.conv_{}(feature_vec, edge_index)'.format(i)) 140 | # modified: 每一层gcn后加入了relu层 141 | feature_vec = self.relu(feature_vec) 142 | if i == self.layer_num - 1: 143 | conv_output = eval('self.conv_{}(feature_vec, edge_index)'.format(i)) 144 | # conv_output = self.conv(feature_vec, edge_index) 145 | # modified: 从golval mean pooling换成 gloval max pooling 146 | pooled = global_max_pool(conv_output, 
torch.zeros(conv_output.shape[0], dtype=int, device=conv_output.device)) 147 | pooled = nn.Softmax(dim=1)(pooled) 148 | return pooled 149 | 150 | 151 | class Vulnerability_backbone(torch.nn.Module): 152 | def __init__(self, h_size, num_node_feature, num_classes, feature_representation_size, drop_out_rate, 153 | num_conv_layers): 154 | super(Vulnerability_backbone, self).__init__() 155 | self.h_size = h_size 156 | self.num_node_feature = num_node_feature 157 | self.num_classes = num_classes 158 | self.feature_representation_size = feature_representation_size 159 | self.drop_out_rate = drop_out_rate 160 | self.layer_num = num_conv_layers 161 | # The 1th feature input (tree) 162 | self.tree_lstm = ChildSumTreeLSTM(self.feature_representation_size, self.h_size, self.drop_out_rate) 163 | # The 2th feature input (sequence) 164 | self.gru_1 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 165 | # The 3th feature input (sequence) 166 | self.gru_2 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 167 | # The 4th feature input (sequence) 168 | self.gru_3 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 169 | # The 5th feature input (sequence) 170 | self.gru_4 = GRU(input_size=self.feature_representation_size, hidden_size=self.h_size, batch_first=True) 171 | # This layer is the bi-directional GRU layer 172 | self.gru_combine = GRU(input_size=self.h_size, hidden_size=self.h_size, bidirectional=True, batch_first=True) 173 | self.dropout = Dropout(self.drop_out_rate) 174 | # This layer is the GCN Model layer 175 | # h_size : in channels. 
self.num_classes: out channels(2) 176 | self.connect = nn.Linear(self.h_size * self.num_node_feature * 2, self.h_size) 177 | # modified: 从单层gcn换成多层gcn 178 | self.convs = ModuleList() 179 | for i in range(self.layer_num): 180 | if i < self.layer_num - 1: 181 | self.convs.append(GCNConv(self.h_size, self.h_size)) 182 | # exec('self.conv_{} = GCNConv(self.h_size, self.h_size)'.format(i)) 183 | if i == self.layer_num - 1: 184 | self.convs.append(GCNConv(self.h_size, self.h_size)) 185 | # exec('self.conv_{} = GCNConv(self.h_size, self.h_size)'.format(i)) 186 | self.relu = ReLU(inplace=True) 187 | # self.conv = GCN(in_channels=self.h_size,hidden_channels=self.h_size,out_channels=self.num_classes,num_layers=1) 188 | 189 | def forward(self, my_data, edge_index, mask=None): 190 | # Input data format: a list that contains main graph, feature 1, ..., feature 5. The feature 1 is tree 191 | # structured and features 2-5 are sequences. 192 | feature_1 = my_data[1] 193 | feature_2 = my_data[0] 194 | feature_3 = my_data[2] 195 | feature_4 = my_data[3] 196 | feature_5 = my_data[4] 197 | feature_vec1 = None 198 | # for every statement, get its AST subtree 199 | for i in range(len(feature_1)): 200 | if i == 0: 201 | _, feature_vec1 = self.tree_lstm(feature_1[i]) 202 | else: 203 | _, feature_vec_temp = self.tree_lstm(feature_1[i]) 204 | feature_vec1 = torch.cat((feature_vec1, feature_vec_temp), 0) 205 | feature_vec1 = torch.reshape(feature_vec1, (-1, 1, self.h_size)) 206 | 207 | # pack for feature 2-5 is done in gen_graphs 208 | feature_2, _ = self.gru_1(feature_2.float()) 209 | feature_2, out_len = pad_packed_sequence(feature_2, batch_first=True) 210 | 211 | feature_3, _ = self.gru_2(feature_3.float()) 212 | feature_3, out_len = pad_packed_sequence(feature_3, batch_first=True) 213 | 214 | feature_4, _ = self.gru_3(feature_4.float()) 215 | feature_4, out_len = pad_packed_sequence(feature_4, batch_first=True) 216 | 217 | feature_5, _ = self.gru_4(feature_5.float()) 218 | feature_5, 
out_len = pad_packed_sequence(feature_5, batch_first=True) 219 | 220 | feature_input = torch.cat( 221 | (feature_vec1, feature_2[:, -1:, :], feature_3[:, -1:, :], feature_4[:, -1:, :], feature_5[:, -1:, :]), 1) 222 | 223 | feature_vec, _ = self.gru_combine(feature_input) 224 | feature_vec = self.dropout(feature_vec) 225 | feature_vec = torch.flatten(feature_vec, 1) 226 | feature_vec = self.connect(feature_vec) 227 | # print(feature_vec.shape) 228 | if mask != None: 229 | for m in mask: 230 | feature_vec[m:m + 1, :] = 0 231 | 232 | for i in range(self.layer_num): 233 | if i < self.layer_num: 234 | feature_vec = self.convs[i](feature_vec, edge_index) 235 | # feature_vec = eval('self.conv_{}(feature_vec, edge_index)'.format(i)) 236 | # modified: 每一层gcn后加入了relu层 237 | feature_vec = self.relu(feature_vec) 238 | if i == self.layer_num - 1: 239 | conv_output = self.convs[i](feature_vec, edge_index) 240 | conv_output = self.relu(conv_output) 241 | # conv_output = eval('self.conv_{}(feature_vec, edge_index)'.format(i)) 242 | # conv_output = self.conv(feature_vec, edge_index) 243 | # modified: 从golval mean pooling换成 gloval max pooling 244 | pooled = global_max_pool(conv_output, torch.zeros(conv_output.shape[0], dtype=int, device=conv_output.device)) 245 | return pooled 246 | 247 | 248 | class backbone_mlp(torch.nn.Module): 249 | def __init__(self, backbone_model, h_size): 250 | super(backbone_mlp, self).__init__() 251 | self.backbone = backbone_model 252 | self.mlp = nn.Linear(h_size, 2) 253 | 254 | def forward(self, my_data, edge_index, mask=None): 255 | backcone_output = self.backbone(my_data, edge_index, mask) 256 | out = self.mlp(backcone_output) 257 | out = nn.Softmax(dim=1)(out) 258 | return out 259 | 260 | 261 | class Vulnerability_final(torch.nn.Module): 262 | def __init__(self, backbone_model, MLP_model): 263 | super(Vulnerability_final, self).__init__() 264 | self.backbone = backbone_model 265 | self.mlp = MLP_model 266 | 267 | def forward(self, my_data, 
edge_index): 268 | backcone_output = self.backbone(my_data, edge_index) 269 | out = self.mlp(backcone_output) 270 | return out 271 | 272 | 273 | class simpleMLP(torch.nn.Module): 274 | def __init__(self, h_size): 275 | super(simpleMLP, self).__init__() 276 | self.mlp = nn.Linear(h_size, 2) 277 | 278 | def forward(self, x): 279 | x = self.mlp(x) 280 | x = nn.Softmax(dim=1)(x) 281 | return x 282 | -------------------------------------------------------------------------------- /LineVul_model/data_splitter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from sklearn.model_selection import train_test_split 4 | import numpy as np 5 | from imblearn.over_sampling import RandomOverSampler 6 | from imblearn.under_sampling import RandomUnderSampler 7 | from imblearn.under_sampling import OneSidedSelection 8 | import argparse 9 | import pandas as pd 10 | import itertools 11 | from imblearn.under_sampling import OneSidedSelection 12 | from tqdm import tqdm 13 | tqdm.pandas() 14 | def get_emb(model,tokenizer,code): 15 | code_token = tokenizer(code, padding=True, truncation=True, return_tensors='pt') 16 | # code_token.to(device) 17 | input_ids = code_token.input_ids.to('cuda') 18 | # print(input_ids) 19 | # context_embeddings=model(torch.tensor(code_token.input_ids)[None,:])[0] 20 | context_embeddings=model(input_ids)[0] 21 | mean_context_embeddings = torch.mean(context_embeddings,1).squeeze().cpu().detach().numpy() 22 | return mean_context_embeddings 23 | 24 | if __name__ == "__main__": 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--sampling_type', type=str, help='Type of the sampling', 28 | choices=['origin','ros', 'oss','rus'], default='origin') 29 | parser.add_argument('--json_dir', type=str,required=True) 30 | parser.add_argument('--out_dir', type=str,required=True) 31 | parser.add_argument('--data_split_number', type=int, default=20) 32 | args = parser.parse_args() 33 | 34 | 
df = pd.read_json(args.json_dir) 35 | print(df.info()) 36 | print(df.head()) 37 | # merged = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(df['targets'].tolist())))) 38 | # print(merged) 39 | if args.sampling_type == 'origin': 40 | for i in range(args.data_split_number): 41 | train, test = train_test_split(df,random_state=i,test_size=0.2) 42 | if not os.path.exists(f'{args.out_dir}data_split_{i}'): 43 | os.makedirs(f'{args.out_dir}data_split_{i}') 44 | train.to_json(f'{args.out_dir}data_split_{i}/train.jsonl',orient='records', lines=True) 45 | test.to_json(f'{args.out_dir}data_split_{i}/test.jsonl',orient='records', lines=True) 46 | if args.sampling_type == 'ros': 47 | for i in range(args.data_split_number): 48 | train, test = train_test_split(df,random_state=i,test_size=0.2) 49 | # print(train['target'].tolist()) 50 | # train_targets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(train['target'].tolist())))) 51 | train_targets = train['target'].tolist() 52 | ros = RandomOverSampler() 53 | train_resampled, _ = ros.fit_resample(train, train_targets) 54 | print(train_resampled.info()) 55 | print(test.info()) 56 | if not os.path.exists(f'{args.out_dir}data_split_{i}'): 57 | os.makedirs(f'{args.out_dir}data_split_{i}') 58 | train_resampled.to_json(f'{args.out_dir}data_split_{i}/train.jsonl',orient='records', lines=True) 59 | test.to_json(f'{args.out_dir}data_split_{i}/test.jsonl',orient='records', lines=True) 60 | 61 | if args.sampling_type == 'rus': 62 | for i in range(args.data_split_number): 63 | train, test = train_test_split(df,random_state=i,test_size=0.2) 64 | train_targets = train['target'].tolist() 65 | rus = RandomUnderSampler() 66 | train_resampled, _ = rus.fit_resample(train, train_targets) 67 | print(train_resampled.info()) 68 | print(test.info()) 69 | if not os.path.exists(f'{args.out_dir}data_split_{i}'): 70 | os.makedirs(f'{args.out_dir}data_split_{i}') 71 | 
train_resampled.to_json(f'{args.out_dir}data_split_{i}/train.jsonl',orient='records', lines=True) 72 | test.to_json(f'{args.out_dir}data_split_{i}/test.jsonl',orient='records', lines=True) 73 | if args.sampling_type == 'oss': 74 | import torch 75 | from transformers import AutoTokenizer, AutoModel 76 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 77 | tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base",return_tensors="pt") 78 | # tokenizer.to(device) 79 | model = AutoModel.from_pretrained("microsoft/codebert-base") 80 | model.to(device) 81 | for i in range(args.data_split_number): 82 | train, test = train_test_split(df,random_state=i,test_size=0.2) 83 | train_targets = train.target.tolist() 84 | oss = OneSidedSelection() 85 | fea_mean = [] 86 | train.progress_apply(lambda row: fea_mean.append(get_emb(model,tokenizer,row['func'])),axis=1) 87 | _, _ = oss.fit_resample(np.array(fea_mean), train_targets) 88 | train_resampled = train.iloc[oss.sample_indices_] 89 | print(train_resampled.info()) 90 | print(test.info()) 91 | if not os.path.exists(f'{args.out_dir}data_split_{i}'): 92 | os.makedirs(f'{args.out_dir}data_split_{i}') 93 | train_resampled.to_json(f'{args.out_dir}data_split_{i}/train.jsonl',orient='records', lines=True) 94 | test.to_json(f'{args.out_dir}data_split_{i}/test.jsonl',orient='records', lines=True) 95 | 96 | 97 | -------------------------------------------------------------------------------- /LineVul_model/evaluator/my_evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | import logging 4 | import sys 5 | import json 6 | import numpy as np 7 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_curve, auc, confusion_matrix 8 | 9 | 10 | def read_answers(filename): 11 | answers = {} 12 | with open(filename) as f: 13 | for line in f: 14 | line = line.strip() 15 | js = json.loads(line) 16 | answers[js['idx']] = js['target'] 17 | return answers 18 | 19 | 20 | def read_predictions(filename): 21 | predictions = {} 22 | with open(filename) as f: 23 | for line in f: 24 | line = line.strip() 25 | idx, label = line.split() 26 | predictions[int(idx)] = int(label) 27 | return predictions 28 | 29 | 30 | def read_predictions_prob(filename): 31 | predictions_prob = {} 32 | with open(filename) as f: 33 | for line in f: 34 | line = line.strip() 35 | idx, label = line.split() 36 | predictions_prob[int(idx)] = float(label) 37 | return predictions_prob 38 | 39 | 40 | def calculate_scores(answers, predictions, predictions_prob): 41 | Acc = [] 42 | Ans = [] 43 | Pred = [] 44 | Pred_prob = [] 45 | for key in answers: 46 | Ans.append(answers[key]) 47 | if key not in predictions: 48 | logging.error("Missing prediction for index {}.".format(key)) 49 | sys.exit() 50 | Acc.append(answers[key] == predictions[key]) 51 | for key in predictions: 52 | Pred.append(predictions[key]) 53 | for key in predictions_prob: 54 | Pred_prob.append(predictions_prob[key]) 55 | scores = {} 56 | scores['Acc'] = np.mean(Acc) 57 | fpr, tpr, _ = roc_curve(Ans, Pred_prob) 58 | print('auc\t', auc(fpr, tpr)) 59 | print('acc\t', accuracy_score(Ans, Pred)) 60 | print('f1\t', f1_score(Ans, Pred)) 61 | print('recall\t', recall_score(Ans, Pred)) 62 | print('precision\t', precision_score(Ans, Pred)) 63 | return scores 64 | 65 | 66 | def main(): 67 | import argparse 68 | parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for Defect Detection dataset.') 69 | parser.add_argument('--answers', '-a', help="filename of the labels, 
in txt format.") 70 | parser.add_argument('--predictions', '-p', help="filename of the leaderboard predictions, in txt format.") 71 | parser.add_argument('--predictions_prob', '-b', help="filename of the leaderboard predictions prob, in txt format.") 72 | 73 | args = parser.parse_args() 74 | answers = read_answers(args.answers) 75 | predictions = read_predictions(args.predictions) 76 | predictions_prob = read_predictions_prob(args.predictions_prob) 77 | scores = calculate_scores(answers, predictions, predictions_prob) 78 | print(scores) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /LineVul_model/exp_latent.sh: -------------------------------------------------------------------------------- 1 | for index in {0..4}; do 2 | CUDA_VISIBLE_DEVICES=1, python latent.py msr_output/origin/saved_models_"$index"/checkpoint-best/model.bin ../msr_dataset/origin/data_split_"$index"/train.jsonl ../msr_dataset/origin/data_split_"$index"/test.jsonl msr_output/latent/data_split_"$index" 3 | done -------------------------------------------------------------------------------- /LineVul_model/latent_result_analyser.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, \ 2 | classification_report, confusion_matrix 3 | import numpy as np 4 | from sklearn.neural_network import MLPClassifier 5 | # from autosklearn.classification import AutoSklearnClassifier as ASC 6 | # from autosklearn.metrics import balanced_accuracy, precision, recall, f1 7 | # import autoPyTorch 8 | # from autoPyTorch.api.tabular_classification import TabularClassificationTask 9 | from imblearn.over_sampling import RandomOverSampler, SMOTE 10 | from imblearn.under_sampling import RandomUnderSampler, OneSidedSelection 11 | import tensorflow as tf 12 | from tensorflow.keras import layers 13 | 
from tensorflow.keras import losses 14 | import pickle 15 | 16 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, \ 17 | classification_report, confusion_matrix 18 | # from autosklearn.classification import AutoSklearnClassifier as ASC 19 | # from autosklearn.metrics import balanced_accuracy, precision, recall, f1 20 | # import autoPyTorch 21 | # from autoPyTorch.api.tabular_classification import TabularClassificationTask 22 | from imblearn.over_sampling import RandomOverSampler, SMOTE 23 | from sklearn.neural_network import MLPClassifier 24 | from imblearn.under_sampling import RandomUnderSampler, OneSidedSelection 25 | import tensorflow as tf 26 | 27 | gpu_devices = tf.config.experimental.list_physical_devices('GPU') 28 | for device in gpu_devices: 29 | tf.config.experimental.set_memory_growth(device, True) 30 | 31 | 32 | def calculate_backbone_metric(trainX, trainy, testX, testy, sampling_method): 33 | if sampling_method == 'smote': 34 | smote = SMOTE(n_jobs=-1, random_state=42) 35 | trainX_res, trainy_res = smote.fit_resample(trainX, trainy) 36 | elif sampling_method == 'rus': 37 | rus = RandomUnderSampler(random_state=42) 38 | trainX_res, trainy_res = rus.fit_resample(trainX, trainy) 39 | elif sampling_method == 'ros': 40 | ros = RandomOverSampler(random_state=42) 41 | trainX_res, trainy_res = ros.fit_resample(trainX, trainy) 42 | elif sampling_method == 'oss': 43 | oss = OneSidedSelection(n_jobs=-1, random_state=42) 44 | trainX_res, trainy_res = oss.fit_resample(trainX, trainy) 45 | 46 | clf = tf.keras.Sequential([ 47 | tf.keras.layers.Dropout(0.2), 48 | tf.keras.layers.Dense(768, activation='tanh'), 49 | tf.keras.layers.Dropout(0.2), 50 | tf.keras.layers.Dense(1, activation='sigmoid') 51 | ]) 52 | clf.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy']) 53 | clf.fit(trainX_res, trainy_res, batch_size=128, epochs=100, verbose=0) 54 | print(clf.evaluate(testX, testy)) 55 
| pred_prob = clf.predict(testX) 56 | # print(pred_prob) 57 | asign = lambda t: 0 if t < 0.5 else 1 58 | pred = list(map(asign, pred_prob)) 59 | # print(pred) 60 | fpr, tpr, _ = roc_curve(testy, pred_prob) 61 | auc_score = auc(fpr, tpr) * 100 62 | f1 = f1_score(testy, pred, zero_division=0) * 100 63 | recall = recall_score(testy, pred, zero_division=0) * 100 64 | precision = precision_score(testy, pred, zero_division=0) * 100 65 | acc = accuracy_score(testy, pred) * 100 66 | print(classification_report(testy, pred)) 67 | print(confusion_matrix(testy, pred)) 68 | print(f'auc: {auc_score} acc: {acc} precision: {precision} recall: {recall} f1: {f1}') 69 | zipped_result = zip(testy, pred, pred_prob) 70 | sorted_zip = sorted(zipped_result, key=lambda x: x[2], reverse=True) 71 | return sorted_zip 72 | 73 | 74 | for i in range(20): 75 | train_X = np.load(f'devign_output/latent/data_split_{i}/train_X.npy') 76 | train_y = np.load(f'devign_output/latent/data_split_{i}/train_y.npy') 77 | test_X = np.load(f'devign_output/latent/data_split_{i}/test_X.npy') 78 | test_y = np.load(f'devign_output/latent/data_split_{i}/test_y.npy') 79 | for sampling_method in ['smote', 'rus', 'ros', 'oss']: 80 | sorted_zip = calculate_backbone_metric(train_X, train_y, test_X, test_y, sampling_method=sampling_method) 81 | with open(f'devign_output/latent/data_split_{i}/{sampling_method}_sorted_zip.pkl', 'wb') as f: 82 | pickle.dump(sorted_zip, f) 83 | -------------------------------------------------------------------------------- /LineVul_model/lime_result_analyze.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "1686b7bd-6d0c-475c-8016-4424849df157", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pickle" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 10, 16 | "id": "81eeeb8d-83a5-473c-aef3-d7cc71da0345", 17 | "metadata": {}, 18 
| "outputs": [], 19 | "source": [ 20 | "with open(f'lime_output/msr_4x_zip_exp_line_prob.pkl', \"rb\" ) as f:\n", 21 | " x = pickle.load(f)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 11, 27 | "id": "89419654-3251-4007-82d9-a35df273d843", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "652" 34 | ] 35 | }, 36 | "execution_count": 11, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "len(x)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 15, 48 | "id": "b616cf56-0e12-4374-98b5-8531e66b75c2", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "0.45245398773006135\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "counter = 0\n", 61 | "for exp,line,prob,_ in x:\n", 62 | " exp = exp[:1]\n", 63 | " for focus_line, importance_score in exp:\n", 64 | " if importance_score > 0:\n", 65 | " if focus_line in line:\n", 66 | " # print(focus_line)\n", 67 | " counter += 1\n", 68 | " break\n", 69 | " # print('focus:',focus_line)\n", 70 | " # print('vul line',line)\n", 71 | "print(counter/len(x))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 46, 77 | "id": "34ed3125-c9db-4a03-9e62-fc923d60b394", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "with open(f'lime_output/msr_ros_zip_exp_line_prob.pkl', \"rb\" ) as f:\n", 82 | " x2 = pickle.load(f)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 47, 88 | "id": "ddfab645-b9da-4542-b67e-13fb0134bd82", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "606" 95 | ] 96 | }, 97 | "execution_count": 47, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "len(x2)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 32, 109 | "id": "d80ef8cd-ce9b-439a-9ea7-d7d9c6450016", 
110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "0.8201320132013201\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "counter = 0\n", 122 | "for exp,line,prob in x2:\n", 123 | " exp = exp[:10]\n", 124 | " for focus_line, importance_score in exp:\n", 125 | " if importance_score > 0:\n", 126 | " if focus_line in line:\n", 127 | " counter += 1\n", 128 | " break\n", 129 | " # print('focus:',focus_line)\n", 130 | " # print('vul line',line)\n", 131 | "print(counter/len(x2))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 33, 137 | "id": "8cd6ee3f-feb7-4bfc-a556-700f0d961121", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "with open(f'lime_output/msr_origin_zip_exp_line_prob.pkl', \"rb\" ) as f:\n", 142 | " x3 = pickle.load(f)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 39, 148 | "id": "86d631e9-8a6d-411d-9b8e-e6aac4cb60af", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "18" 155 | ] 156 | }, 157 | "execution_count": 39, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "len(x3)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 37, 169 | "id": "1428d308-40a4-4b1d-85da-5b76c35b296a", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "0.6666666666666666\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "counter = 0\n", 182 | "for exp,line,prob in x3:\n", 183 | " exp = exp[:10]\n", 184 | " for focus_line, importance_score in exp:\n", 185 | " if importance_score > 0:\n", 186 | " if focus_line in line:\n", 187 | " counter += 1\n", 188 | " break\n", 189 | " # print('focus:',focus_line)\n", 190 | " # print('vul line',line)\n", 191 | " \n", 192 | "print(counter/len(x3))" 193 | ] 194 | }, 195 | { 196 | "cell_type": 
"code", 197 | "execution_count": 45, 198 | "id": "06ab81b2-9b99-4f7b-b4e9-f9a50e5dbb98", 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "1270" 205 | ] 206 | }, 207 | "execution_count": 45, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "len(x3)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 17, 219 | "id": "a4c00c63-369f-471b-b586-1800810f9f7b", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "with open(f'lime_output/msr_rus_zip_exp_line_prob.pkl', \"rb\" ) as f:\n", 224 | " x3 = pickle.load(f)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 20, 230 | "id": "3244d639-77a4-428c-9aa5-d8fa49a02eed", 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "name": "stdout", 235 | "output_type": "stream", 236 | "text": [ 237 | "0.31683168316831684\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "index = 0\n", 243 | "counter = 0\n", 244 | "for exp,line,prob,_ in x3:\n", 245 | " index += 1\n", 246 | " if index > 606:\n", 247 | " break\n", 248 | " exp = exp[:1]\n", 249 | " for focus_line, importance_score in exp:\n", 250 | " if importance_score > 0:\n", 251 | " if focus_line in line:\n", 252 | " counter += 1\n", 253 | " break\n", 254 | " # print('focus:',focus_line)\n", 255 | " # print('vul line',line)\n", 256 | " \n", 257 | "print(counter/606)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "id": "1f2f2e7e-6944-4414-8bdd-ff2390722dfe", 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "id": "98611d03-fec0-4bec-bd15-65b10f06376c", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "transformer_env", 280 | "language": "python", 281 | "name": "transformer_env" 282 | }, 283 
| "language_info": { 284 | "codemirror_mode": { 285 | "name": "ipython", 286 | "version": 3 287 | }, 288 | "file_extension": ".py", 289 | "mimetype": "text/x-python", 290 | "name": "python", 291 | "nbconvert_exporter": "python", 292 | "pygments_lexer": "ipython3", 293 | "version": "3.8.8" 294 | } 295 | }, 296 | "nbformat": 4, 297 | "nbformat_minor": 5 298 | } 299 | -------------------------------------------------------------------------------- /LineVul_model/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | import torch 4 | import torch.nn as nn 5 | import torch 6 | from torch.autograd import Variable 7 | import copy 8 | from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss 9 | from transformers.models.roberta.modeling_roberta import RobertaClassificationHead 10 | import torch.nn.functional as F 11 | 12 | 13 | class Model(nn.Module): 14 | def __init__(self, encoder, config, tokenizer, args): 15 | super(Model, self).__init__() 16 | self.encoder = encoder 17 | self.config = config 18 | self.tokenizer = tokenizer 19 | self.args = args 20 | 21 | def forward(self, input_ids=None, labels=None, loss_func=None, class_weights=None): 22 | outputs = self.encoder(input_ids, attention_mask=input_ids.ne(1)) 23 | print(outputs) 24 | logits = outputs[0] 25 | print(logits) 26 | prob = torch.sigmoid(logits) 27 | if labels is not None: 28 | 29 | labels = labels.float() 30 | loss = torch.log(prob[:, 0] + 1e-10) * labels + torch.log((1 - prob)[:, 0] + 1e-10) * (1 - labels) 31 | loss = -loss.mean() 32 | return loss, prob 33 | else: 34 | return prob 35 | 36 | 37 | class Model_BCE(nn.Module): 38 | def __init__(self, encoder, config, tokenizer, args): 39 | super(Model_BCE, self).__init__() 40 | self.encoder = encoder 41 | self.config = config 42 | self.tokenizer = tokenizer 43 | self.args = args 44 | 45 | def forward(self, input_ids=None, labels=None, 
loss_func=None, class_weights=None, get_latent=False): 46 | if get_latent: 47 | outputs = self.encoder.roberta(input_ids,output_attentions=True,output_hidden_states=True,return_dict=True) 48 | return outputs 49 | else: 50 | outputs = self.encoder(input_ids, attention_mask=input_ids.ne(1),output_hidden_states =get_latent) 51 | logits = outputs[0] 52 | probs = torch.sigmoid(logits) 53 | if labels is not None: 54 | labels = labels.float() 55 | loss_fct = BCEWithLogitsLoss() 56 | loss = loss_fct(logits.squeeze(), labels) 57 | return loss, probs 58 | else: 59 | return probs 60 | -------------------------------------------------------------------------------- /LineVul_model/myrun.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 
20 | """ 21 | 22 | from __future__ import absolute_import, division, print_function 23 | 24 | import argparse 25 | 26 | import glob 27 | import logging 28 | import os 29 | from collections import Counter 30 | 31 | import pickle 32 | import random 33 | import re 34 | import shutil 35 | 36 | from sklearn.metrics import f1_score 37 | import numpy as np 38 | import torch 39 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset 40 | from torch.utils.data.distributed import DistributedSampler 41 | import torch.nn.functional as F 42 | 43 | import json 44 | 45 | try: 46 | from torch.utils.tensorboard import SummaryWriter 47 | except ImportError: 48 | from tensorboardX import SummaryWriter 49 | 50 | from tqdm import tqdm, trange 51 | import multiprocessing 52 | from model import Model, Model_BCE 53 | 54 | cpu_cont = multiprocessing.cpu_count() 55 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, 56 | BertConfig, BertForMaskedLM, BertTokenizer, 57 | GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, 58 | OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, 59 | RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, 60 | DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer) 61 | 62 | logger = logging.getLogger(__name__) 63 | 64 | MODEL_CLASSES = { 65 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), 66 | 'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), 67 | 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), 68 | 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 69 | 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer) 70 | } 71 | 72 | 73 | class InputFeatures(object): 74 | """A single training/test features for an example.""" 75 | 76 | def __init__(self, 77 | input_tokens, 78 | input_ids, 79 | idx, 80 | label, 81 | 82 | ): 83 | self.input_tokens = input_tokens 84 | self.input_ids = 
input_ids 85 | self.idx = str(idx) 86 | self.label = label 87 | 88 | 89 | def convert_examples_to_features(js, tokenizer, args): 90 | # source 91 | code = ' '.join(js['func'].split()) 92 | code_tokens = tokenizer.tokenize(code)[:args.block_size - 2] 93 | source_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token] 94 | source_ids = tokenizer.convert_tokens_to_ids(source_tokens) 95 | padding_length = args.block_size - len(source_ids) 96 | source_ids += [tokenizer.pad_token_id] * padding_length 97 | return InputFeatures(source_tokens, source_ids, js['idx'], js['target']) 98 | 99 | 100 | class TextDataset(Dataset): 101 | def __init__(self, tokenizer, args, file_path=None): 102 | label_count = [] 103 | self.examples = [] 104 | with open(file_path) as f: 105 | for line in f: 106 | js = json.loads(line.strip()) 107 | example = convert_examples_to_features(js, tokenizer, args) 108 | label_count.append(example.label) 109 | self.examples.append(example) 110 | logger.info("label ratio: {}".format(Counter(label_count))) 111 | # if 'train' in file_path: 112 | # for idx, example in enumerate(self.examples[:3]): 113 | # logger.info("*** Example ***") 114 | # logger.info("idx: {}".format(idx)) 115 | # logger.info("label: {}".format(example.label)) 116 | # logger.info("input_tokens: {}".format([x.replace('\u0120', '_') for x in example.input_tokens])) 117 | # logger.info("input_ids: {}".format(' '.join(map(str, example.input_ids)))) 118 | 119 | def __len__(self): 120 | return len(self.examples) 121 | 122 | def __getitem__(self, i): 123 | return torch.tensor(self.examples[i].input_ids), torch.tensor(self.examples[i].label) 124 | 125 | 126 | def set_seed(seed=42): 127 | random.seed(seed) 128 | os.environ['PYTHONHASHSEED'] = str(seed) 129 | np.random.seed(seed) 130 | torch.manual_seed(seed) 131 | torch.cuda.manual_seed(seed) 132 | torch.backends.cudnn.deterministic = True 133 | 134 | 135 | def train(args, train_dataset, model, tokenizer): 136 | """ Train the model """ 
137 | if args.tensorboard: 138 | args.writer = SummaryWriter() 139 | 140 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 141 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 142 | 143 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, 144 | batch_size=args.train_batch_size, num_workers=4, pin_memory=True) 145 | args.max_steps = args.epoch * len(train_dataloader) 146 | args.save_steps = len(train_dataloader) 147 | args.warmup_steps = len(train_dataloader) 148 | # args.logging_steps = len(train_dataloader) 149 | # args.logging_steps= 100 150 | args.num_train_epochs = args.epoch 151 | model.to(args.device) 152 | # Prepare optimizer and schedule (linear warmup and decay) 153 | no_decay = ['bias', 'LayerNorm.weight'] 154 | optimizer_grouped_parameters = [ 155 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 156 | 'weight_decay': args.weight_decay}, 157 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 158 | ] 159 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 160 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.max_steps * 0.1, 161 | num_training_steps=args.max_steps) 162 | if args.fp16: 163 | try: 164 | from apex import amp 165 | except ImportError: 166 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 167 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 168 | 169 | # multi-gpu training (should be after apex fp16 initialization) 170 | if args.n_gpu > 1: 171 | model = torch.nn.DataParallel(model) 172 | 173 | # Distributed training (should be after apex fp16 initialization) 174 | if args.local_rank != -1: 175 | model = torch.nn.parallel.DistributedDataParallel(model, 
device_ids=[args.local_rank], 176 | output_device=args.local_rank, 177 | find_unused_parameters=True) 178 | 179 | checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last') 180 | scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt') 181 | optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt') 182 | if os.path.exists(scheduler_last): 183 | scheduler.load_state_dict(torch.load(scheduler_last)) 184 | if os.path.exists(optimizer_last): 185 | optimizer.load_state_dict(torch.load(optimizer_last)) 186 | # Train! 187 | logger.info("***** Running training *****") 188 | logger.info(" Num examples = %d", len(train_dataset)) 189 | logger.info(" Num Epochs = %d", args.num_train_epochs) 190 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 191 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 192 | args.train_batch_size * args.gradient_accumulation_steps * ( 193 | torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 194 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 195 | logger.info(" Total optimization steps = %d", args.max_steps) 196 | 197 | global_step = args.start_step 198 | tr_loss, logging_loss, avg_loss, tr_nb, tr_num, train_loss = 0.0, 0.0, 0.0, 0, 0, 0 199 | best_mrr = 0.0 200 | best_acc = 0.0 201 | # model.resize_token_embeddings(len(tokenizer)) 202 | model.zero_grad() 203 | 204 | for idx in range(args.start_epoch, int(args.num_train_epochs)): 205 | bar = tqdm(train_dataloader, total=len(train_dataloader)) 206 | tr_num = 0 207 | train_loss = 0 208 | for step, batch in enumerate(bar): 209 | inputs = batch[0].to(args.device) 210 | labels = batch[1].to(args.device) 211 | model.train() 212 | loss, logits = model(inputs, labels) 213 | # print(logits.size()) 214 | # print(loss) 215 | # exit() 216 | if args.n_gpu > 1: 217 | loss = loss.mean() # mean() to average on multi-gpu parallel training 218 | if 
args.gradient_accumulation_steps > 1: 219 | loss = loss / args.gradient_accumulation_steps 220 | 221 | if args.fp16: 222 | with amp.scale_loss(loss, optimizer) as scaled_loss: 223 | scaled_loss.backward() 224 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 225 | else: 226 | loss.backward() 227 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 228 | 229 | tr_loss += loss.item() 230 | tr_num += 1 231 | train_loss += loss.item() 232 | if avg_loss == 0: 233 | avg_loss = tr_loss 234 | avg_loss = round(train_loss / tr_num, 5) 235 | bar.set_description("epoch {} loss {}".format(idx, avg_loss)) 236 | 237 | # args.writer.add_scalar(f'loss', loss.item(), global_step) 238 | 239 | if (step + 1) % args.gradient_accumulation_steps == 0: 240 | optimizer.step() 241 | optimizer.zero_grad() 242 | scheduler.step() 243 | global_step += 1 244 | output_flag = True 245 | avg_loss = round(np.exp((tr_loss - logging_loss) / (global_step - tr_nb)), 4) 246 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 247 | logging_loss = tr_loss 248 | tr_nb = global_step 249 | if args.tensorboard: 250 | args.writer.add_scalar(f'train_loss', avg_loss, global_step) 251 | 252 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: 253 | 254 | if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well 255 | results = evaluate(args, model, tokenizer, eval_when_training=True) 256 | if args.tensorboard: 257 | args.writer.add_scalar(f'metrics/f1', results['f1_score'], global_step) 258 | args.writer.add_scalar(f'metrics/acc', results['eval_acc'], global_step) 259 | args.writer.add_scalar(f'loss', avg_loss, global_step) 260 | for key, value in results.items(): 261 | logger.info(" %s = %s", key, round(value, 4)) 262 | # Save model checkpoint 263 | # dataset 264 | checkpoint_prefix = 
'checkpoint-best' 265 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 266 | if not os.path.exists(output_dir): 267 | os.makedirs(output_dir) 268 | model_to_save = model.module if hasattr(model, 'module') else model 269 | output_dir = os.path.join(output_dir, '{}'.format('model.bin')) 270 | torch.save(model_to_save.state_dict(), output_dir) 271 | logger.info("Saving model checkpoint to %s", output_dir) 272 | # if results['eval_acc']>best_acc: 273 | # best_acc=results['eval_acc'] 274 | # logger.info(" "+"*"*20) 275 | # logger.info(" Best acc:%s",round(best_acc,4)) 276 | # logger.info(" "+"*"*20) 277 | 278 | # checkpoint_prefix = 'checkpoint-best-acc' 279 | # output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 280 | # if not os.path.exists(output_dir): 281 | # os.makedirs(output_dir) 282 | # model_to_save = model.module if hasattr(model,'module') else model 283 | # output_dir = os.path.join(output_dir, '{}'.format('model.bin')) 284 | # torch.save(model_to_save.state_dict(), output_dir) 285 | # logger.info("Saving model checkpoint to %s", output_dir) 286 | if args.tensorboard: 287 | args.writer.close() 288 | 289 | 290 | def evaluate(args, model, tokenizer, eval_when_training=False): 291 | # Loop to handle MNLI double evaluation (matched, mis-matched) 292 | eval_output_dir = args.output_dir 293 | 294 | eval_dataset = TextDataset(tokenizer, args, args.eval_data_file) 295 | 296 | if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: 297 | os.makedirs(eval_output_dir) 298 | 299 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 300 | # Note that DistributedSampler samples randomly 301 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 302 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=4, 303 | pin_memory=True) 304 | 305 | # multi-gpu evaluate 306 | if 
args.n_gpu > 1 and eval_when_training is False: 307 | model = torch.nn.DataParallel(model) 308 | 309 | # Eval! 310 | logger.info("***** Running evaluation *****") 311 | logger.info(" Num examples = %d", len(eval_dataset)) 312 | logger.info(" Batch size = %d", args.eval_batch_size) 313 | eval_loss = 0.0 314 | nb_eval_steps = 0 315 | model.eval() 316 | logits = [] 317 | labels = [] 318 | for batch in eval_dataloader: 319 | inputs = batch[0].to(args.device) 320 | label = batch[1].to(args.device) 321 | with torch.no_grad(): 322 | lm_loss, logit = model(inputs, label) 323 | eval_loss += lm_loss.mean().item() 324 | logits.append(logit.cpu().numpy()) 325 | labels.append(label.cpu().numpy()) 326 | nb_eval_steps += 1 327 | logits = np.concatenate(logits, 0) 328 | labels = np.concatenate(labels, 0) 329 | preds = logits[:, 0] > 0.5 330 | eval_acc = np.mean(labels == preds) 331 | eval_loss = eval_loss / nb_eval_steps 332 | perplexity = torch.tensor(eval_loss) 333 | f1 = f1_score(labels, preds) 334 | result = { 335 | "eval_loss": float(perplexity), 336 | "eval_acc": round(eval_acc, 4), 337 | "f1_score": round(f1, 4), 338 | } 339 | return result 340 | 341 | 342 | def test(args, model, tokenizer): 343 | # Loop to handle MNLI double evaluation (matched, mis-matched) 344 | eval_dataset = TextDataset(tokenizer, args, args.test_data_file) 345 | 346 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 347 | # Note that DistributedSampler samples randomly 348 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 349 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) 350 | 351 | # multi-gpu evaluate 352 | if args.n_gpu > 1: 353 | model = torch.nn.DataParallel(model) 354 | 355 | # Eval! 
356 | logger.info("***** Running Test *****") 357 | logger.info(" Num examples = %d", len(eval_dataset)) 358 | logger.info(" Batch size = %d", args.eval_batch_size) 359 | eval_loss = 0.0 360 | nb_eval_steps = 0 361 | model.eval() 362 | logits = [] 363 | labels = [] 364 | for batch in tqdm(eval_dataloader, total=len(eval_dataloader)): 365 | inputs = batch[0].to(args.device) 366 | label = batch[1].to(args.device) 367 | with torch.no_grad(): 368 | logit = model(inputs) 369 | logits.append(logit.cpu().numpy()) 370 | labels.append(label.cpu().numpy()) 371 | 372 | logits = np.concatenate(logits, 0) 373 | labels = np.concatenate(labels, 0) 374 | preds = logits[:, 0] > 0.5 375 | with open(os.path.join(args.output_dir, "predictions.txt"), 'w') as f: 376 | for example, pred in zip(eval_dataset.examples, preds): 377 | if pred: 378 | f.write(example.idx + '\t1\n') 379 | else: 380 | f.write(example.idx + '\t0\n') 381 | # record prob 382 | with open(os.path.join(args.output_dir, "predictions_prob.txt"), 'w') as f: 383 | for i, example in enumerate(eval_dataset.examples): 384 | f.write(example.idx + f'\t{logits[i][0]}\n') 385 | 386 | 387 | def main(): 388 | parser = argparse.ArgumentParser() 389 | 390 | ## Required parameters 391 | parser.add_argument("--train_data_file", default=None, type=str, required=True, 392 | help="The input training data file (a text file).") 393 | parser.add_argument("--output_dir", default=None, type=str, required=True, 394 | help="The output directory where the model predictions and checkpoints will be written.") 395 | 396 | ## Other parameters 397 | parser.add_argument("--eval_data_file", default=None, type=str, 398 | help="An optional input evaluation data file to evaluate the perplexity on (a text file).") 399 | parser.add_argument("--test_data_file", default=None, type=str, 400 | help="An optional input evaluation data file to evaluate the perplexity on (a text file).") 401 | 402 | parser.add_argument("--model_type", default="bert", type=str, 403 | 
help="The model architecture to be fine-tuned.") 404 | parser.add_argument("--model_name_or_path", default=None, type=str, 405 | help="The model checkpoint for weights initialization.") 406 | 407 | parser.add_argument("--mlm", action='store_true', 408 | help="Train with masked-language modeling loss instead of language modeling.") 409 | parser.add_argument("--mlm_probability", type=float, default=0.15, 410 | help="Ratio of tokens to mask for masked language modeling loss") 411 | 412 | parser.add_argument("--config_name", default="", type=str, 413 | help="Optional pretrained config name or path if not the same as model_name_or_path") 414 | parser.add_argument("--tokenizer_name", default="", type=str, 415 | help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") 416 | parser.add_argument("--cache_dir", default="", type=str, 417 | help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)") 418 | parser.add_argument("--block_size", default=-1, type=int, 419 | help="Optional input sequence length after tokenization." 420 | "The training dataset will be truncated in blocks of this size for training." 
421 | "Default to the model max input length for single sentence inputs (take into account special tokens).") 422 | parser.add_argument("--do_train", action='store_true', 423 | help="Whether to run training.") 424 | parser.add_argument("--do_eval", action='store_true', 425 | help="Whether to run eval on the dev set.") 426 | parser.add_argument("--do_test", action='store_true', 427 | help="Whether to run testing on the test set.") 428 | parser.add_argument("--evaluate_during_training", action='store_true', 429 | help="Run evaluation during training at each logging step.") 430 | parser.add_argument("--do_lower_case", action='store_true', 431 | help="Set this flag if you are using an uncased model.") 432 | 433 | parser.add_argument("--train_batch_size", default=4, type=int, 434 | help="Batch size per GPU/CPU for training.") 435 | parser.add_argument("--eval_batch_size", default=4, type=int, 436 | help="Batch size per GPU/CPU for evaluation.") 437 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 438 | help="Number of update steps to accumulate before performing a backward/update pass.") 439 | parser.add_argument("--learning_rate", default=5e-5, type=float, 440 | help="The initial learning rate for Adam.") 441 | parser.add_argument("--weight_decay", default=0.0, type=float, 442 | help="Weight decay if we apply some.") 443 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 444 | help="Epsilon for Adam optimizer.") 445 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 446 | help="Max gradient norm.") 447 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 448 | help="Total number of training epochs to perform.") 449 | parser.add_argument("--max_steps", default=-1, type=int, 450 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 451 | parser.add_argument("--warmup_steps", default=0, type=int, 452 | help="Linear warmup over warmup_steps.") 453 | 454 | parser.add_argument('--logging_steps', type=int, default=50, 455 | help="Log every X update steps.") 456 | parser.add_argument('--save_steps', type=int, default=50, 457 | help="Save checkpoint every X update steps.") 458 | parser.add_argument('--save_total_limit', type=int, default=None, 459 | help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default') 460 | parser.add_argument("--eval_all_checkpoints", action='store_true', 461 | help="Evaluate all checkpoints starting with the same prefix as model_name_or_path and ending with step number") 462 | parser.add_argument("--no_cuda", action='store_true', 463 | help="Avoid using CUDA when available") 464 | parser.add_argument('--overwrite_output_dir', action='store_true', 465 | help="Overwrite the content of the output directory") 466 | parser.add_argument('--overwrite_cache', action='store_true', 467 | help="Overwrite the cached training and evaluation sets") 468 | parser.add_argument('--seed', type=int, default=42, 469 | help="random seed for initialization") 470 | parser.add_argument('--epoch', type=int, default=42, 471 | help="total number of training epochs to perform") 472 | parser.add_argument('--fp16', action='store_true', 473 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 474 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 475 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
476 | "See details at https://nvidia.github.io/apex/amp.html") 477 | parser.add_argument("--local_rank", type=int, default=-1, 478 | help="For distributed training: local_rank") 479 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 480 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 481 | parser.add_argument("--tensorboard", action='store_true', 482 | help="use tensorboard") 483 | parser.add_argument("--do_logits", action='store_true', 484 | help="Whether to get logits on the dev set.") 485 | 486 | args = parser.parse_args() 487 | 488 | # Setup CUDA, GPU & distributed training 489 | if args.local_rank == -1 or args.no_cuda: 490 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 491 | args.n_gpu = torch.cuda.device_count() 492 | else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs 493 | torch.cuda.set_device(args.local_rank) 494 | device = torch.device("cuda", args.local_rank) 495 | torch.distributed.init_process_group(backend='nccl') 496 | args.n_gpu = 1 497 | args.device = device 498 | args.per_gpu_train_batch_size = args.train_batch_size // args.n_gpu 499 | args.per_gpu_eval_batch_size = args.eval_batch_size // args.n_gpu 500 | # Setup logging 501 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 502 | datefmt='%m/%d/%Y %H:%M:%S', 503 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 504 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 505 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 506 | 507 | set_seed(args.seed) 508 | 509 | # Load pretrained model and tokenizer 510 | if args.local_rank not in [-1, 0]: 511 | torch.distributed.barrier() # Barrier to make sure only the first process in distributed training downloads model & vocab 512 | 513 | 
args.start_epoch = 0 514 | args.start_step = 0 515 | checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last') 516 | if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last): 517 | args.model_name_or_path = os.path.join(checkpoint_last, 'pytorch_model.bin') 518 | args.config_name = os.path.join(checkpoint_last, 'config.json') 519 | idx_file = os.path.join(checkpoint_last, 'idx_file.txt') 520 | with open(idx_file, encoding='utf-8') as idxf: 521 | args.start_epoch = int(idxf.readlines()[0].strip()) + 1 522 | 523 | step_file = os.path.join(checkpoint_last, 'step_file.txt') 524 | if os.path.exists(step_file): 525 | with open(step_file, encoding='utf-8') as stepf: 526 | args.start_step = int(stepf.readlines()[0].strip()) 527 | 528 | logger.info("reload model from {}, resume from {} epoch".format(checkpoint_last, args.start_epoch)) 529 | 530 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 531 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 532 | cache_dir=args.cache_dir if args.cache_dir else None) 533 | config.num_labels = 1 534 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, 535 | do_lower_case=args.do_lower_case, 536 | cache_dir=args.cache_dir if args.cache_dir else None) 537 | if args.block_size <= 0: 538 | args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model 539 | args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) 540 | if args.model_name_or_path: 541 | model = model_class.from_pretrained(args.model_name_or_path, 542 | from_tf=bool('.ckpt' in args.model_name_or_path), 543 | config=config, 544 | cache_dir=args.cache_dir if args.cache_dir else None) 545 | else: 546 | model = model_class(config) 547 | # print(model) 548 | model = Model_BCE(model, config, tokenizer, args) 549 | if args.local_rank == 0: 550 | torch.distributed.barrier() # End of barrier to make 
sure only the first process in distributed training download model & vocab 551 | 552 | logger.info("Training/evaluation parameters %s", args) 553 | 554 | # Training 555 | if args.do_train: 556 | if args.local_rank not in [-1, 0]: 557 | torch.distributed.barrier() # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache 558 | 559 | train_dataset = TextDataset(tokenizer, args, args.train_data_file) 560 | if args.local_rank == 0: 561 | torch.distributed.barrier() 562 | 563 | train(args, train_dataset, model, tokenizer) 564 | # args.writer.close() 565 | 566 | # Evaluation 567 | results = {} 568 | if args.do_eval and args.local_rank in [-1, 0]: 569 | checkpoint_prefix = 'checkpoint-best/model.bin' 570 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 571 | model.load_state_dict(torch.load(output_dir)) 572 | model.to(args.device) 573 | result = evaluate(args, model, tokenizer) 574 | logger.info("***** Eval results *****") 575 | for key in sorted(result.keys()): 576 | logger.info(" %s = %s", key, str(round(result[key], 4))) 577 | 578 | if args.do_test and args.local_rank in [-1, 0]: 579 | checkpoint_prefix = 'checkpoint-best/model.bin' 580 | output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 581 | model.load_state_dict(torch.load(output_dir)) 582 | model.to(args.device) 583 | # model = model.module.to(device) 584 | test(args, model, tokenizer) 585 | 586 | # if args.do_test and args.local_rank in [-1, 0]: 587 | # checkpoint_prefix = 'checkpoint-best/model.bin' 588 | # output_dir = os.path.join(args.output_dir, '{}'.format(checkpoint_prefix)) 589 | # model.load_state_dict(torch.load(output_dir)) 590 | # model.to(args.device) 591 | # get_logits(args, model, tokenizer) 592 | 593 | return results 594 | 595 | 596 | if __name__ == "__main__": 597 | main() 598 | 599 | 600 | -------------------------------------------------------------------------------- 
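The checkpoint-resume bookkeeping in `main()` above (reading `idx_file.txt` and `step_file.txt` out of `checkpoint-last`) can be sketched in isolation. This is a minimal sketch, not the training script itself: the file names follow the code above, but the directory and its contents here are fabricated for illustration.

```python
import os
import tempfile

def resume_state(checkpoint_last):
    """Mirror the resume logic in main(): read the last finished epoch
    and the global step from the bookkeeping files, defaulting to a
    fresh start when no checkpoint directory exists."""
    start_epoch, start_step = 0, 0
    if os.path.isdir(checkpoint_last) and os.listdir(checkpoint_last):
        idx_file = os.path.join(checkpoint_last, 'idx_file.txt')
        with open(idx_file, encoding='utf-8') as idxf:
            # idx_file.txt stores the last completed epoch; resume at the next one
            start_epoch = int(idxf.readlines()[0].strip()) + 1
        step_file = os.path.join(checkpoint_last, 'step_file.txt')
        if os.path.exists(step_file):
            with open(step_file, encoding='utf-8') as stepf:
                start_step = int(stepf.readlines()[0].strip())
    return start_epoch, start_step

with tempfile.TemporaryDirectory() as tmp:
    ckpt = os.path.join(tmp, 'checkpoint-last')
    os.makedirs(ckpt)
    with open(os.path.join(ckpt, 'idx_file.txt'), 'w') as f:
        f.write('2\n')
    with open(os.path.join(ckpt, 'step_file.txt'), 'w') as f:
        f.write('1500\n')
    print(resume_state(ckpt))  # -> (3, 1500)
```

Note that, as in the script, epoch 2 recorded in `idx_file.txt` means epoch 2 finished, so training resumes at epoch 3.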
/LineVul_model/readme.md: -------------------------------------------------------------------------------- 1 | This readme file and the repo are based on [CodeXGLUE](https://microsoft.github.io/CodeXGLUE/), which uses the same backbone network as the model proposed in [LineVul](https://github.com/awsm-research/LineVul). 2 | # CodeXGLUE -- Defect Detection 3 | 4 | ## Task Definition 5 | 6 | Given a piece of source code, the task is to identify whether it is insecure code that may expose software systems to attacks such as resource leaks, use-after-free vulnerabilities, and DoS attacks. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code. 7 | 8 | 9 | ### Data Format 10 | 11 | After preprocessing the dataset, you obtain three .jsonl files: train.jsonl, valid.jsonl, and test.jsonl. 12 | 13 | In each file, every line represents one function. One row is illustrated below. 14 | 15 | - **func:** the source code 16 | - **target:** 0 or 1 (vulnerable or not) 17 | - **idx:** the index of the example 18 | 19 | ### Input predictions 20 | 21 | A predictions file that contains predictions in TXT format, such as evaluator/predictions.txt. For example: 22 | 23 | ```shell 24 | 0 0 25 | 1 1 26 | 2 1 27 | 3 0 28 | 4 0 29 | ``` 30 | 31 | A prediction-probability file in the same format is produced alongside the one above. 32 | 33 | ## Pipeline-CodeBERT 34 | 35 | We also provide a pipeline that fine-tunes [CodeBERT](https://arxiv.org/pdf/2002.08155.pdf) on this task. 36 | 37 | ## data split & data sampling 38 | Run `data_splitter.py` to do the data splitting and data sampling. 39 | 1. Change the `--sampling_type` parameter to do **raw code level sampling**. 40 | 41 | ### run with **Sampling_R** 42 | See the example in `exp.bash`, or below: 43 | ```shell 44 | cd code 45 | CUDA_VISIBLE_DEVICES=3, python my_run.py \ 46 | --output_dir=./devign_output/origin/saved_models_0 \ 47 | --model_type=roberta \ 48 | --tokenizer_name=microsoft/codebert-base \ 49 | --model_name_or_path=microsoft/codebert-base \ 50 | --do_train \ 51 | --do_test\ 52 | --train_data_file=../devign_dataset/origin/data_split_0/train.jsonl \ 53 | --test_data_file=../devign_dataset/origin/data_split_0/test.jsonl \ 54 | --epoch 4 \ 55 | --block_size 400 \ 56 | --train_batch_size 32 \ 57 | --eval_batch_size 16 \ 58 | --learning_rate 5e-5 \ 59 | --max_grad_norm 1.0 60 | ``` 61 | 62 | ### run with **Sampling_L** 63 | Requires the NoSampling-trained model from the previous step. 64 | See the example run script in `exp_latent.sh`. 65 | 66 | ### Explanation: 67 | 1. CUDA_VISIBLE_DEVICES -> which GPU to use, counting from 0. I usually use 0; you can use another. 68 | 2. do_train -> runs the actual fine-tuning process 69 | 3. do_test -> used together with do_train; it runs the test set and calculates the performance 70 | 4. epoch -> usually 4 is enough; you can try other values 71 | 5. train_batch_size -> don't modify; a 3090 GPU can only hold 32 72 | 6. learning_rate 5e-5 -> you can try values from 2e-5 to 5e-5 73 | 7. data_split_x -> used in my project to measure performance variance; you can just use split 0 for now 74 | 8. To validate whether your method works (i.e., augmentation/transformation on code), change only the data in the train file and keep the test file unmodified, for a fair comparison with the baseline. 75 | 9. The model might take hours or days to run. 76 | ### Result 77 | 1. Results will be printed to the console. 78 | 2. The fine-tuned model will be saved at out_dir/model.bin. 79 | 3. Prediction and prediction-probability files will also be in the output dir; you can run `python ../evaluator/evaluator.py -a ../dataset/test.jsonl -p outdir/predictions.txt -b outdir/prediction_prob.txt` to evaluate again. 80 | 81 | ### Lime 82 | 1. For Sampling_R, run the Jupyter notebook `lime_explainer.ipynb`. 83 | 2. For Sampling_L, run the Jupyter notebook `lime_explainer-latent.ipynb`. 84 | 3. Run `lime_result_analyze.ipynb` to calculate the hit rate. 85 | 86 | 87 | ## Reference 88 |

89 | @inproceedings{fu2022linevul,
90 |   title={LineVul: A Transformer-based Line-Level Vulnerability Prediction},
91 |   author={Fu, Michael and Tantithamthavorn, Chakkrit},
92 |   booktitle={2022 IEEE/ACM 19th International Conference on Mining Software Repositories (MSR)},
93 |   year={2022},
94 |   organization={IEEE}
95 | }
96 | 
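The evaluator command shown in the Result section above consumes the test `.jsonl` answers (with `idx`/`target` fields) and the space-separated `predictions.txt` format illustrated earlier. A minimal stdlib-only sketch of that scoring step might look like the following; the field and file formats follow this readme, while the toy data is made up for illustration:

```python
import json

def read_answers(lines):
    # each test.jsonl line holds at least 'idx' and 'target'
    return {js['idx']: js['target'] for js in map(json.loads, lines)}

def read_predictions(lines):
    # predictions.txt format: "<idx> <label>" per line
    return {int(i): int(l) for i, l in (ln.split() for ln in lines)}

def score(answers, predictions):
    # accuracy, precision, and recall for the positive (vulnerable) class
    tp = sum(1 for k, a in answers.items() if a == 1 and predictions[k] == 1)
    fp = sum(1 for k, a in answers.items() if a == 0 and predictions[k] == 1)
    fn = sum(1 for k, a in answers.items() if a == 1 and predictions[k] == 0)
    acc = sum(1 for k, a in answers.items() if predictions[k] == a) / len(answers)
    prec = tp / (tp + fp) if tp + fp else 0.0
    rec = tp / (tp + fn) if tp + fn else 0.0
    return acc, prec, rec

answers = read_answers([json.dumps({'idx': i, 'target': t})
                        for i, t in enumerate([0, 1, 1, 0, 0])])
preds = read_predictions(['0 0', '1 1', '2 1', '3 0', '4 1'])
print(score(answers, preds))  # -> (0.8, 0.666..., 1.0) on this toy split
```

The real `evaluator.py` additionally uses the probability file to compute AUC; this sketch covers only the label-based metrics.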
97 | -------------------------------------------------------------------------------- /LineVul_model/result_analyser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "pycharm": { 8 | "name": "#%%\n" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import logging\n", 14 | "import sys\n", 15 | "import json\n", 16 | "import numpy as np\n", 17 | "from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score,roc_curve, auc, confusion_matrix,classification_report\n", 18 | "\n", 19 | "def read_answers(filename):\n", 20 | " answers={}\n", 21 | " with open(filename) as f:\n", 22 | " for line in f:\n", 23 | " line=line.strip()\n", 24 | " js=json.loads(line)\n", 25 | " answers[js['idx']]=js['target']\n", 26 | " return answers\n", 27 | "\n", 28 | "def read_predictions(filename):\n", 29 | " predictions={}\n", 30 | " with open(filename) as f:\n", 31 | " for line in f:\n", 32 | " line=line.strip()\n", 33 | " idx,label=line.split()\n", 34 | " predictions[int(idx)]=int(label)\n", 35 | " return predictions\n", 36 | "\n", 37 | "def read_predictions_prob(filename):\n", 38 | " predictions_prob={}\n", 39 | " with open(filename) as f:\n", 40 | " for line in f:\n", 41 | " line=line.strip()\n", 42 | " idx,label=line.split()\n", 43 | " predictions_prob[int(idx)]= float(label)\n", 44 | " return predictions_prob\n", 45 | "\n", 46 | "def calculate_scores(answers,predictions,predictions_prob):\n", 47 | " Acc=[]\n", 48 | " Ans=[]\n", 49 | " Pred=[]\n", 50 | " Pred_prob=[]\n", 51 | " for key in answers:\n", 52 | " Ans.append(answers[key])\n", 53 | " if key not in predictions:\n", 54 | " logging.error(\"Missing prediction for index {}.\".format(key))\n", 55 | " sys.exit()\n", 56 | " Acc.append(answers[key]==predictions[key])\n", 57 | " for key in predictions:\n", 58 | " Pred.append(predictions[key])\n", 59 | " for key in 
predictions_prob:\n", 60 | " Pred_prob.append(predictions_prob[key])\n", 61 | " scores={}\n", 62 | " results = []\n", 63 | "# scores['acc']=np.mean(Acc)\n", 64 | " fpr, tpr, _ = roc_curve(Ans, Pred_prob)\n", 65 | " results.append(auc(fpr, tpr)*100)\n", 66 | " results.append(accuracy_score(Ans,Pred)*100)\n", 67 | " results.append(recall_score(Ans,Pred)*100)\n", 68 | " results.append(precision_score(Ans,Pred,zero_division=0)*100)\n", 69 | " results.append(f1_score(Ans,Pred,zero_division=0)*100)\n", 70 | " zipped_result = zip(Ans,Pred,Pred_prob)\n", 71 | " sorted_zip = sorted(zipped_result, key=lambda x: x[2],reverse=True)\n", 72 | " print(confusion_matrix(Ans,Pred))\n", 73 | "# print('auc\\t',auc(fpr, tpr))\n", 74 | "# print('acc\\t',accuracy_score(Ans,Pred))\n", 75 | "# print('f1\\t',f1_score(Ans,Pred))\n", 76 | "# print('recall\\t',recall_score(Ans,Pred))\n", 77 | "# print('precision\\t',precision_score(Ans,Pred))\n", 78 | " print(results)\n", 79 | " return results,sorted_zip,Pred_prob\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 2, 85 | "metadata": { 86 | "pycharm": { 87 | "name": "#%%\n" 88 | } 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "def read_output(test_dir,result_dir):\n", 93 | " answers=read_answers(test_dir+'test.jsonl')\n", 94 | " predictions=read_predictions(result_dir+'predictions.txt')\n", 95 | " predictions_prob = read_predictions_prob(result_dir+'predictions_prob.txt')\n", 96 | " scores,sorted_zip, Pred_prob=calculate_scores(answers,predictions,predictions_prob)\n", 97 | " return scores,sorted_zip,Pred_prob" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 3, 103 | "metadata": { 104 | "pycharm": { 105 | "name": "#%%\n" 106 | } 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "msr_output/ros_msr/saved_models_0/\n", 114 | "[[34518 1005]\n", 115 | " [ 1384 821]]\n", 116 | "[76.60132239704429, 93.66783290924512, 
37.23356009070295, 44.961664841182916, 40.734309104440584]\n", 117 | "msr_output/ros_msr/saved_models_1/\n", 118 | "[[34550 952]\n", 119 | " [ 1402 824]]\n", 120 | "[80.29656200986969, 93.76060220525869, 37.01707097933513, 46.3963963963964, 41.17941029485258]\n", 121 | "msr_output/ros_msr/saved_models_2/\n", 122 | "[[34610 964]\n", 123 | " [ 1389 765]]\n", 124 | "[74.40449763551453, 93.76325275657337, 35.515320334261844, 44.24522845575477, 39.40252382178728]\n", 125 | "msr_output/ros_msr/saved_models_3/\n", 126 | "[[34615 941]\n", 127 | " [ 1361 811]]\n", 128 | "[80.82965498670217, 93.89843087362172, 37.338858195211785, 46.289954337899545, 41.335372069317025]\n", 129 | "msr_output/ros_msr/saved_models_4/\n", 130 | "[[34665 864]\n", 131 | " [ 1394 805]]\n", 132 | "[70.99250026920473, 94.01505513146734, 36.607548885857206, 48.23247453565009, 41.623578076525334]\n", 133 | "msr_output/ros_msr/saved_models_5/\n", 134 | "[[34676 888]\n", 135 | " [ 1400 764]]\n", 136 | "[74.2297515858006, 93.93553859202714, 35.304990757855826, 46.246973365617436, 40.04192872117401]\n", 137 | "msr_output/ros_msr/saved_models_6/\n", 138 | "[[34531 1004]\n", 139 | " [ 1343 850]]\n", 140 | "[75.72716276015676, 93.7791560644614, 38.759689922480625, 45.84681769147789, 42.006424511984186]\n", 141 | "msr_output/ros_msr/saved_models_7/\n", 142 | "[[34515 1041]\n", 143 | " [ 1382 790]]\n", 144 | "[70.13264954181166, 93.57771416454622, 36.37200736648251, 43.14582195521573, 39.470397202098425]\n", 145 | "msr_output/ros_msr/saved_models_8/\n", 146 | "[[34619 950]\n", 147 | " [ 1387 772]]\n", 148 | "[74.38341665790833, 93.80566157760815, 35.75729504400185, 44.83159117305458, 39.78356093790259]\n", 149 | "msr_output/ros_msr/saved_models_9/\n", 150 | "[[34570 961]\n", 151 | " [ 1394 803]]\n", 152 | "[74.86113436019835, 93.75795165394402, 36.54984069185252, 45.52154195011338, 40.54531683918202]\n", 153 | "msr_output/ros_msr/saved_models_10/\n", 154 | "[[34602 968]\n", 155 | " [ 1334 824]]\n", 156 | 
"[78.09118049152124, 93.89843087362172, 38.18350324374421, 45.982142857142854, 41.721518987341774]\n", 157 | "msr_output/ros_msr/saved_models_11/\n", 158 | "[[34695 932]\n", 159 | " [ 1346 755]]\n", 160 | "[60.05300877286018, 93.96204410517387, 35.93526891956211, 44.75400118553646, 39.86272439281943]\n", 161 | "msr_output/ros_msr/saved_models_12/\n", 162 | "[[34671 945]\n", 163 | " [ 1374 738]]\n", 164 | "[76.57451326884922, 93.85337150127226, 34.94318181818182, 43.85026737967914, 38.89328063241107]\n", 165 | "msr_output/ros_msr/saved_models_13/\n", 166 | "[[34653 935]\n", 167 | " [ 1353 787]]\n", 168 | "[74.31879865522244, 93.93553859202714, 36.77570093457944, 45.70267131242741, 40.75608493008804]\n", 169 | "msr_output/ros_msr/saved_models_14/\n", 170 | "[[34569 942]\n", 171 | " [ 1378 839]]\n", 172 | "[76.4742981098934, 93.85072094995759, 37.843933243121334, 47.10836608646828, 41.97098549274638]\n", 173 | "msr_output/ros_msr/saved_models_15/\n", 174 | "[[34560 960]\n", 175 | " [ 1384 824]]\n", 176 | "[79.01079408212561, 93.78710771840542, 37.31884057971014, 46.18834080717489, 41.28256513026052]\n", 177 | "msr_output/ros_msr/saved_models_16/\n", 178 | "[[34574 994]\n", 179 | " [ 1364 796]]\n", 180 | "[68.79657874952099, 93.75, 36.851851851851855, 44.46927374301676, 40.30379746835443]\n", 181 | "msr_output/ros_msr/saved_models_17/\n", 182 | "[[34658 964]\n", 183 | " [ 1287 819]]\n", 184 | "[77.0780650667612, 94.03360899067006, 38.88888888888889, 45.93381940549636, 42.11879660581126]\n", 185 | "msr_output/ros_msr/saved_models_18/\n", 186 | "[[34730 897]\n", 187 | " [ 1373 728]]\n", 188 | "[72.78113745214627, 93.98324851569126, 34.65016658733936, 44.800000000000004, 39.07675791733764]\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "result_list = [] \n", 194 | "for i in range(0,19):\n", 195 | " try:\n", 196 | " test_dir = f'../msr_dataset/ros_msr/data_split_{i}/'\n", 197 | " result_dir = f'msr_output/ros_msr/saved_models_{i}/'\n", 198 | " print(result_dir)\n", 
199 | " result,sorted_zip,pred_prob = read_output(test_dir,result_dir)\n", 200 | " # import pickle\n", 201 | " # pickle.dump(sorted_zip, open( f'../../metrics_exp/data/Codebert_model/reveal/oss/{i}_zip_ans_pred_prob.pkl', \"wb\" ))\n", 202 | " # result_list.append(result)\n", 203 | " except:\n", 204 | " print('error',i)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 3, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "[[34612 911]\n", 217 | " [ 1380 825]]\n", 218 | "[70.96231479295169, 93.92758693808312, 37.41496598639456, 47.52304147465438, 41.86754630804364]\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "i=0\n", 224 | "test_dir = f'../msr_dataset/origin/data_split_{i}/'\n", 225 | "result_dir = f'msr_outout/ros_4x/'\n", 226 | "result,sorted_zip,pred_prob_2 = read_output(test_dir,result_dir)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "import seaborn as sns\n", 236 | "sns.displot(pred_prob, kde=False, \n", 237 | " bins=100, color = 'blue')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "import seaborn as sns\n", 247 | "sns.displot(pred_prob_2, kde=False, \n", 248 | " bins=100, color = 'blue')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": { 255 | "pycharm": { 256 | "name": "#%%\n" 257 | } 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "import pandas as pd\n", 262 | "df = pd.DataFrame(result_list, columns = ['auc', 'acc','recall','precision','f1'])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "raw", 267 | "metadata": { 268 | "pycharm": { 269 | "name": "#%%\n" 270 | } 271 | }, 272 | "source": [ 273 | "zipped_result = zip(Ans,Pred,Pred_prob)\n", 274 | "sorted_zip = sorted(zipped_result, 
key=lambda x: x[2],reverse=True)\n", 275 | "import pickle\n", 276 | "pickle.dump(sorted_zip, open( f'{result_dir}zip_ans_pred_prob.pkl', \"wb\" ))" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "transformer_env", 290 | "language": "python", 291 | "name": "transformer_env" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.8.8" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 4 308 | } 309 | -------------------------------------------------------------------------------- /LineVul_model/run.bash: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0,1 python my_run.py \ 2 | --output_dir=./msr_outout/ros_2x \ 3 | --model_type=roberta \ 4 | --tokenizer_name=microsoft/codebert-base \ 5 | --model_name_or_path=microsoft/codebert-base \ 6 | --do_train \ 7 | --do_test\ 8 | --train_data_file=../msr_dataset/ros_2x/train.jsonl \ 9 | --test_data_file=../msr_dataset/origin/data_split_0/test.jsonl \ 10 | --epoch 4 \ 11 | --block_size 400 \ 12 | --train_batch_size 64 \ 13 | --eval_batch_size 32 \ 14 | --learning_rate 5e-5 \ 15 | --max_grad_norm 1.0 -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # DataSampling4DLVD 2 | This is the official replication repository for our paper 3 | > *Does data sampling improve deep learning-based 4 | vulnerability detection? Yeas! 
and Nays!* 5 | 6 | ## 0.datasets 7 | ### 0.1 processed datasets dump available in Zenodo: 8 | Please see our Zenodo site for our processed datasets at: 9 | 10 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7057996.svg)](https://doi.org/10.5281/zenodo.7057996) 11 | 12 | 13 | ### 0.2 raw datasets: 14 | This repo consists of three models that are developed based on their official release GitHub repos. 15 | 1. [IVDetect](https://github.com/vulnerabilitydetection/VulnerabilityDetectionResearch) 16 | 2. [Reveal](https://github.com/VulDetProject/ReVeal) 17 | 3. [LineVul/codebert](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection) 18 | 19 | We thank the researchers for their hard work. 20 | ## 1 models 21 | The repository consists of 3 model replication folders. The required packages are also listed. 22 | ### 1.1 IVDetect_model 23 | Package: 24 | 1. Pytorch 25 | 2. Pytorch-geometric 26 | 3. imblearn 27 | 4. sklearn 28 | 5. gensim 29 | 6. nni 30 | ### 1.2 Reveal_model 31 | >**Devign_model** is also inside the Reveal_model folder. This is because the authors of Devign didn't open-source their code; our implementation of Devign is based on the replication written by Reveal's authors. 32 | 33 | Package: 34 | 1. pytorch 35 | 2. dgl (which includes the GNNExplainer implementation for XAI) 36 | 3. imblearn 37 | 4. sklearn 38 | 39 | ### 1.3 LineVul_model 40 | package: 41 | 1. Pytorch 42 | 2. Transformers (by Huggingface) 43 | 3. Lime (if you want to use the XAI tool Lime) 44 | 4. tensorflow 45 | 5. imblearn 46 | 6. sklearn 47 | 48 | ## 2 Datasets 49 | We use three datasets in the experiment. We provide only the links to the **raw** datasets here; the **processed** datasets (as model input) are provided in Zenodo. 50 | 1. [Devign](https://drive.google.com/file/d/1x6hoF7G-tSYxg8AFybggypLZgMGDNHfF/view) dataset 51 | 1. for more detailed info about the Devign dataset, check [Devign's official webpage](https://sites.google.com/view/devign) 52 | 2. [Reveal](https://drive.google.com/drive/folders/1KuIYgFcvWUXheDhT--cBALsfy1I4utOy) dataset 53 | 1. for more detailed info about the Reveal dataset, check the [Reveal github](https://github.com/VulDetProject/ReVeal) 54 | 3. [BigVul](https://drive.google.com/file/d/1-0VhnHBp9IGh90s2wCNjeCMuy70HPl8X/view) dataset 55 | 1. we use a cleaned version of BigVul; the original BigVul contains much more information to digest, so we suggest researchers check the original BigVul dataset at the [BigVul](https://github.com/ZeoVan/MSR_20_Code_vulnerability_CSV_Dataset) official repo 56 | 57 | ## 3 Interpretable Tools 58 | We provide the code/jupyter-notebooks that we use in our RQs and discussion for future study. 59 | 1. **Lime** is in the LineVul_model folder 60 | 2. **GNNExplainer** is in the Reveal_model folder 61 | 62 | 63 | ## 4 The whole pipeline 64 | We tried our best to describe how to conduct the experiment from **raw data -> processed data -> model training -> evaluation** in the readme file in each model folder --------------------------------------------------------------------------------