├── README.md ├── convert_parsed.py ├── evaluate_vgraph.py ├── find_vulns.py ├── gen_target_graph_db.sh ├── gen_vgraph.py ├── gen_vgraph_db.sh ├── mine.sh ├── parsed_to_networkx.py ├── repos.config ├── requirements.txt └── src ├── __init__.py ├── __init__.pyc ├── code └── gen_src_files.sh ├── graph ├── utils.py ├── vgraph.py └── vgraph.pyc └── matching ├── __init__.py ├── __init__.pyc ├── __pycache__ └── exact_matcher.cpython-37.pyc ├── exact_matcher.py ├── exact_matcher.pyc ├── matcher.py ├── matcher.pyc ├── path_matcher.py ├── path_matcher.pyc ├── tale_matcher.py ├── tale_matcher.pyc ├── test.py └── triplet_match.py /README.md: -------------------------------------------------------------------------------- 1 | # VGraph: A Robust Vulnerable Code Clone Detection System Using Code Property Triplets 2 | 3 | This is the code for our paper published in EuroS&P 2020. This tool mines GitHub for vulnerable and patched code samples, converts them to code property graphs, and ultimately into our VGraph structure, which contains elements from the vulnerable code, the patched code, and the surrounding contextual code for each vulnerability. 4 | 5 | VGraphs can then be used to find new vulnerable code clones, even ones with a significant amount of modification from the original. 6 | 7 | # Prerequisites for running the code 8 | 9 | VGraph relies on Joern to generate code property graphs. We were using Joern v. 0.2.5 (really old by now). If you plan to use a newer version, the code may need some tweaking. We were using the `joern-parse` utility, which skips the neo4j database and just parses the code, generating the CPGs as text files. Once you have a suitable version of Joern installed, you need to modify the `mine.sh` file and update the `JOERN` variable to point to the location of `joern-parse`. 10 | 11 | The majority of the code is Python + Bash. There is a requirements.txt file that can be used to install the required Python packages: `pip install -r requirements.txt`. 12 | 13 | # Running the code 14 | 15 | Most of the code has been streamlined into a couple of scripts: 16 | 17 | ```#> ./mine.sh``` 18 | 19 | This will clone the repositories listed in `repos.config` from GitHub and then scan through their commits looking for references to CVE numbers. It will then download the raw source code associated with those commits, as well as historic versions from both before and after the relevant commits. Next it will generate the graphs with Joern, and finally convert the Joern graphs to NetworkX format. 20 | 21 | The result will be a ton of useful data in the `data` directory. The raw source code is stored in directories that indicate which CVE each file is associated with, whether it is vulnerable or patched with respect to that CVE, the commit hash that created it, and more. 22 | 23 | Next, we will actually build the VGraph database: 24 | 25 | ```#> ./gen_vgraph_db.sh``` 26 | 27 | This will scan through the `data` directory and build a VGraph for each vulnerable/patched code sample we extracted from GitHub. The resulting VGraphs will be placed in an appropriately named subdirectory of `data`. 28 | 29 | Also included is an evaluation script, `evaluate_vgraph.py`, which allows you to see how well the VGraph representation is able to detect and differentiate between the original vulnerable/patched code samples, as well as the historic versions downloaded for testing.
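For reference, each VGraph on disk is just a set of pickled code-property triples split into three components (see `gen_vgraph.py`): a contextual component shared by both versions (`*_cvg.pkl`), a positive component found only in the vulnerable version (`*_pvg.pkl`), and a negative component found only in the patched version (`*_nvg.pkl`). The sketch below is only illustrative: the paths are placeholders, and the percentage-overlap scoring is a simplification standing in for the real matchers in `src/matching/triplet_match.py`. It shows the general shape of how a target function's triples are compared against a VGraph, mirroring the decision rule used in `evaluate_vgraph.py`:

```python
import pickle as pkl

# Placeholder paths -- substitute a real entry from data/vgraph_db and a real
# *.triples file produced by convert_parsed.py / parsed_to_networkx.py.
base = "data/vgraph_db/openssl/CVE-XXXX-YYYY/hash_timestamp/file.c/func"
cvg = pkl.load(open(base + "_cvg.pkl", "rb"))  # triples shared by vuln and patch
pvg = pkl.load(open(base + "_pvg.pkl", "rb"))  # triples only in the vulnerable version
nvg = pkl.load(open(base + "_nvg.pkl", "rb"))  # triples only in the patched version
target_trips = pkl.load(open("data/target_graph_db/some_func.triples", "rb"))

def pct_matched(component, target):
    # Percentage of the component's triples that also appear in the target function
    return 100.0 * len(component & target) / max(len(component), 1)

cvg_score = pct_matched(cvg, target_trips)
pvg_score = pct_matched(pvg, target_trips)
nvg_score = pct_matched(nvg, target_trips)

# Same shape as decision_function() in evaluate_vgraph.py (thresholds are tunable)
if cvg_score >= 25 and pvg_score >= 60 and pvg_score > nvg_score:
    print("Potential vulnerable code clone")
```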
30 | 31 | 32 | -------------------------------------------------------------------------------- /convert_parsed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import networkx as nx 5 | import pickle as pkl 6 | from tqdm import tqdm 7 | 8 | from src.graph.utils import joern_to_networkx, tripleize, vectorize 9 | 10 | 11 | def write_graph(graph, base_dir, repo, cve, v_or_p, file_name, func_name): 12 | path = "%s/%s/%s/%s/%s/graph" % (base_dir, repo, cve, v_or_p, file_name) 13 | name = "%s.gpickle" % func_name 14 | if not os.path.exists(path): 15 | os.makedirs(path) 16 | nx.write_gpickle(graph, path + '/' + name) 17 | 18 | trips = tripleize(graph) 19 | pkl.dump(trips, open(path + '/' + func_name + '.triples', 'wb')) 20 | 21 | vec = vectorize(graph) 22 | pkl.dump(vec, open(path + '/' + func_name + '.vec', 'wb')) 23 | 24 | def write_code(char_buf, base_dir, repo, cve, v_or_p, file_name, func_name): 25 | path = "%s/%s/%s/%s/%s/code" % (base_dir, repo, cve, v_or_p, file_name) 26 | name = "%s.%s" % (func_name, file_name.split('.')[-1]) # same extension as original file 27 | if not os.path.exists(path): 28 | os.makedirs(path) 29 | with open(path + '/' + name, 'w') as f: 30 | for c in char_buf: 31 | f.write(c) 32 | 33 | def extract_func(from_file, to_file, location): 34 | buf_start = int(location.split(':')[2]) 35 | buf_end = int(location.split(':')[3]) 36 | with open(from_file, 'r') as f: 37 | char_list = list(f.read()) 38 | return char_list[buf_start:buf_end+1] 39 | 40 | 41 | vuln_code_dir=sys.argv[1] # location of source code files 42 | parsed_dir=sys.argv[2] # Location of Joern parsed data 43 | output_dir=sys.argv[3] # Location to write our final database containing code, graphs 44 | 45 | repos = os.listdir(vuln_code_dir) 46 | all_cves = [] 47 | for repo in repos: 48 | all_cves = all_cves + os.listdir(vuln_code_dir + '/' + repo) 49 | 50 | 51 | 52 | pbar = tqdm(total=len(all_cves)) 53 | 54 | graphs_written=0 55 | # For every code repository... 56 | for repo in os.listdir(vuln_code_dir): 57 | # For every CVE... 58 | for cve in os.listdir(vuln_code_dir + '/' + repo): 59 | pbar.update(1) 60 | # Inside here we have funcname, vuln, patch, before, after 61 | 62 | # Get names of functions of interest 63 | function_names = [] 64 | try: 65 | with open(vuln_code_dir + '/' + repo + '/' + cve + '/funcnames') as fp: 66 | for f_name in fp.readlines(): 67 | f_name = f_name.rstrip() 68 | if f_name: 69 | function_names.append(f_name) 70 | except: 71 | # Error opening function names file. Skip 72 | print("Error opening funcnames file for %s/%s...Skipping" % (repo, cve)) 73 | continue 74 | 75 | # Get list of vuln files 76 | vuln_file_names = [] 77 | for h in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/vuln/'): 78 | for f in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/vuln/' + h): 79 | vuln_file_names.append('%s/%s' % (h,f)) 80 | 81 | # Get list of patch files 82 | patch_file_names = [] 83 | for h in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/patch/'): 84 | for f in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/patch/' + h): 85 | patch_file_names.append('%s/%s' % (h,f)) 86 | 87 | # Must have been an error generating these files. Skip. 
88 | if len(vuln_file_names) == 0 or len(patch_file_names) == 0: 89 | print("Missing vulnerable or patched files for %s/%s...Skipping" % (repo, cve)) 90 | continue 91 | 92 | # Get list of before patch files (also vuln) 93 | before_file_names = [] 94 | for h in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/before/'): 95 | for f in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/before/'+h+'/'): 96 | before_file_names.append('%s/%s' % (h,f)) 97 | 98 | # Get list of after patch files (also patched) 99 | after_file_names = [] 100 | for h in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/after/'): 101 | for f in os.listdir(vuln_code_dir + '/' + repo + '/' + cve + '/after/'+h+'/'): 102 | after_file_names.append('%s/%s' % (h,f)) 103 | 104 | # Now need to: 105 | # 1) Find those functions in parsed directory 106 | # 2) Build Networkx graph from .csv files 107 | # 3) Extract source code of specific functions from orig source code 108 | for (f_names, d) in [(vuln_file_names, 'vuln'), 109 | (patch_file_names, 'patch'), 110 | (before_file_names, 'before'), 111 | (after_file_names, 'after')]: 112 | for f in f_names: 113 | parsed_file_nodes = "%s/%s/%s/%s/%s/%s/nodes.csv" % (parsed_dir,vuln_code_dir,repo,cve,d,f) 114 | parsed_file_edges = "%s/%s/%s/%s/%s/%s/edges.csv" % (parsed_dir,vuln_code_dir,repo,cve,d,f) 115 | graphs, num = joern_to_networkx(parsed_file_nodes, parsed_file_edges, func_names=function_names) 116 | print(parsed_file_nodes, num) 117 | # Now need to write out data 118 | for g in graphs: 119 | graphs_written += 1 120 | write_graph(g['graph'], output_dir, repo, cve, d, f, g['name']) 121 | just_the_func = extract_func("%s/%s/%s/%s/%s" % (vuln_code_dir,repo,cve,d,f), 'to_file', g['location']) 122 | write_code(just_the_func, output_dir, repo, cve, d, f, g['name']) 123 | 124 | 125 | pbar.close() 126 | 127 | print(graphs_written) 128 | 129 | -------------------------------------------------------------------------------- /evaluate_vgraph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import filecmp 4 | from tqdm import tqdm 5 | import numpy as np 6 | import pickle as pkl 7 | from src.graph.utils import load_vgraph_db, load_target_db 8 | from src.matching.triplet_match import * 9 | from multiprocessing import Pool,Process, Queue, SimpleQueue 10 | import time 11 | 12 | def decision_function(cvg_score, pvg_score, nvg_score): 13 | return cvg_score >= CVG_THRESH and pvg_score >= PVG_THRESH and pvg_score > nvg_score 14 | 15 | def consume(work): 16 | (target_id, vg, t_trips) = work 17 | cvg_score, pvg_score, nvg_score = triplet_match_exact(vg, t_trips) 18 | return (target_id, vg, cvg_score, pvg_score, nvg_score) 19 | 20 | def generate_ground_truth(target_graphs): 21 | NUM_VULN=0 22 | NUM_PATCH=0 23 | NUM_BEFORE=0 24 | NUM_AFTER=0 25 | 26 | for g in target_graphs: 27 | repo,cve,t,hash,f,_,_=g['path'].split('/')[-7:] 28 | func=g['base_name'] 29 | 30 | 31 | if t == 'vuln': 32 | NUM_VULN += 1 33 | elif t == 'patch': 34 | NUM_PATCH += 1 35 | elif t == 'before': 36 | NUM_BEFORE += 1 37 | elif t == 'after': 38 | NUM_AFTER += 1 39 | 40 | 41 | print("Ground truth stats:") 42 | print("NUM_VULN: %d" % NUM_VULN) 43 | print("NUM_PATCH: %d" % NUM_PATCH) 44 | print("NUM_BEFORE: %d" % NUM_BEFORE) 45 | print("NUM_AFTER: %d" % NUM_AFTER) 46 | print("TOT_VULN: %d" % (NUM_VULN + NUM_BEFORE)) 47 | print("TOT_NOT_VULN: %d" % (NUM_PATCH + NUM_AFTER)) 48 | return NUM_VULN, NUM_PATCH, NUM_BEFORE, NUM_AFTER 49 | 50 | def 
get_hits_multi(vgraph_db, target_db): 51 | work=[] 52 | scores = [] 53 | for i, tg in enumerate(target_db): 54 | t_trips = tg['triples'] 55 | tg['hits'] = [] # place to put hits 56 | for vg in vgraph_db: 57 | work.append((i,vg, t_trips)) 58 | print("Work size: %d" % len(work)) 59 | print("Applying pool...") 60 | p = Pool(NUM_PROCS) 61 | res = p.map(consume, work) 62 | print("done..") 63 | 64 | for (target_id, vg, cvg_score, pvg_score, nvg_score) in res: 65 | scores.append((vg['cve'],vg['repo'],vg['func'],target_db[target_id]['path'],cvg_score, pvg_score, nvg_score)) 66 | if decision_function(cvg_score, pvg_score, nvg_score): 67 | target_db[target_id]['hits'].append(vg) 68 | 69 | 70 | def get_hits(vgraph_db, target_db): 71 | skipped=0 72 | scores = [] 73 | for i, tg in tqdm(enumerate(target_db)): 74 | tg['hits'] = [] # place to store our hits 75 | t_trips = tg['triples'] 76 | t_vec = np.array(tg['vec']) 77 | for vg in vgraph_db: 78 | cvg_score, pvg_score, nvg_score = triplet_match_exact(vg, t_trips) 79 | scores.append((vg['cve'],vg['repo'],vg['func'],tg['path'],cvg_score, pvg_score, nvg_score)) 80 | if decision_function(cvg_score, pvg_score, nvg_score): 81 | # we have a hit 82 | tg['hits'].append(vg) 83 | return scores 84 | 85 | 86 | def eval_vgraph(vgraph_db, target_db, gt, manual_labels): 87 | # Loop through target graphs. Get any vGraph hits and evaluate for truthiness 88 | # Now we score 89 | print("Scoring results...") 90 | 91 | # Score all: 92 | TP=0. 93 | FP=0. 94 | TN=0. 95 | FN=0. 96 | UNK=0 97 | for tg in target_db: 98 | if len(tg['hits']) == 0: # nothing hit on this target 99 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 100 | TN +=1 101 | else: # something should have hit 102 | FN += 1 103 | if(PRINT_FN): 104 | print("FN: %s" % (tg['path'])) 105 | else: 106 | # something hit 107 | #accounted_for = False 108 | for vg in tg['hits']: 109 | if vg['cve'] in tg['path'] and ('/vuln/' in tg['path'] or '/before/' in tg['path']): 110 | TP += 1 111 | #accounted_for = True 112 | #break 113 | elif vg['cve'] in tg['path'] and ('/patch/' in tg['path'] or '/after/' in tg['path']): 114 | FP += 1 115 | if PRINT_FP: 116 | print("FP: %s %s" % (vg['cve'], tg['path'])) 117 | #accounted_for = True 118 | #break 119 | else: 120 | #continue # until we find either TP or FP 121 | # Need to check 122 | tg_name = tg['path'].split('/')[-1][:-len('.gpickle')] 123 | vg_name = vg['func'] 124 | tg_time = int(tg['path'].split('/')[-4].split('_')[1]) 125 | vg_time = int(vg['hsh'].split('_')[1]) 126 | if tg_name == vg_name and tg_time < vg_time: 127 | TP += 1 128 | else: 129 | # check in manual labels 130 | found=False 131 | for label in manual_labels: 132 | label_split = label.rstrip().split(' ') 133 | if label_split[1] == vg['cve'] and label_split[2] in tg['path']: 134 | if label_split[0] == 'TP': 135 | TP += 1 136 | else: 137 | FP += 1 138 | if PRINT_FP: 139 | print("FP: %s %s" % (vg['cve'], tg['path'])) 140 | found=True 141 | break 142 | if not found: 143 | UNK += 1 144 | if(PRINT_UNK): 145 | print("UNK: %s %s/%s/%s/%s/%s)" % (tg['path'], vg['repo'], vg['cve'], vg['hsh'],vg['file'], vg['func'])) 146 | 147 | #if not accounted_for: 148 | # if '/patch/' in tg['path'] or '/after/' in tg['path']: 149 | # TN +=1 150 | # else: # something should have hit 151 | # FN += 1 152 | 153 | 154 | P = TP/(TP+FP) 155 | R = TP/(TP+FN) 156 | F1 = 2*(P*R)/(P+R) 157 | print("All Score:") 158 | print("TP\tFP\tTN\tFN\tUNK\tP\tR\tF1") 159 | print("%d\t%d\t%d\t%d\t%d\t%02f\t%02f\t%02f"%(TP,FP,TN,FN,UNK,P,R,F1)) 160 | 161 | 
print("Worst Case:") 162 | P = TP/(TP+FP + UNK) 163 | R = TP/(TP+FN) 164 | F1 = 2*(P*R)/(P+R) 165 | print("%02f\t%02f\t%02f"%(P,R,F1)) 166 | print("Best Case:") 167 | P = (TP+UNK)/(TP+FP+UNK) 168 | R = (TP+UNK)/(TP+UNK+FN) 169 | F1 = 2*(P*R)/(P+R) 170 | print("%02f\t%02f\t%02f"%(P,R,F1)) 171 | 172 | 173 | # Score train data: 174 | TP=0. 175 | FP=0. 176 | TN=0. 177 | FN=0. 178 | UNK=0 179 | for tg in target_db: 180 | if not ('/vuln/' in tg['path'] or '/patch/' in tg['path']): 181 | continue # only vuln/patch 182 | if len(tg['hits']) == 0: # nothing hit on this target 183 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 184 | TN +=1 185 | else: # something should have hit 186 | if(PRINT_FN): 187 | print("FN: %s" % (tg['path'])) 188 | FN += 1 189 | else: 190 | # something hit 191 | accounted_for = False 192 | for vg in tg['hits']: 193 | if vg['cve'] in tg['path'] and ('/vuln/' in tg['path'] or '/before/' in tg['path']): 194 | TP += 1 195 | #accounted_for = True 196 | #break 197 | elif vg['cve'] in tg['path'] and ('/patch/' in tg['path'] or '/after/' in tg['path']): 198 | FP += 1 199 | if PRINT_FP: 200 | print("FP: %s %s" % (vg['cve'], tg['path'])) 201 | #accounted_for = True 202 | #break 203 | else: 204 | #continue 205 | # Need to check 206 | tg_name = tg['path'].split('/')[-1][:-len('.gpickle')] 207 | vg_name = vg['func'] 208 | tg_time = int(tg['path'].split('/')[-4].split('_')[1]) 209 | vg_time = int(vg['hsh'].split('_')[1]) 210 | if tg_name == vg_name and tg_time < vg_time: 211 | TP += 1 212 | else: 213 | # check in manual labels 214 | found=False 215 | for label in manual_labels: 216 | label_split = label.rstrip().split(' ') 217 | if label_split[1] == vg['cve'] and label_split[2] in tg['path']: 218 | if label_split[0] == 'TP': 219 | TP += 1 220 | else: 221 | FP += 1 222 | if PRINT_FP: 223 | print("FP: %s %s" % (vg['cve'], tg['path'])) 224 | found=True 225 | break 226 | if not found: 227 | UNK += 1 228 | if(PRINT_UNK): 229 | print("UNK: %s %s/%s/%s/%s/%s)" % (tg['path'], vg['repo'], vg['cve'], vg['hsh'],vg['file'], vg['func'])) 230 | 231 | #if not accounted_for: 232 | # if '/patch/' in tg['path'] or '/after/' in tg['path']: 233 | # TN +=1 234 | # else: # something should have hit 235 | # FN += 1 236 | 237 | P = TP/(TP+FP) 238 | R = TP/(TP+FN) 239 | F1 = 2*(P*R)/(P+R) 240 | print("Train Score:") 241 | print("TP\tFP\tTN\tFN\tUNK\tP\tR\tF1") 242 | print("%d\t%d\t%d\t%d\t%d\t%02f\t%02f\t%02f"%(TP,FP,TN,FN,UNK,P,R,F1)) 243 | print("Worst Case:") 244 | P = TP/(TP+FP + UNK) 245 | R = TP/(TP+FN) 246 | F1 = 2*(P*R)/(P+R) 247 | print("%02f\t%02f\t%02f"%(P,R,F1)) 248 | print("Best Case:") 249 | P = (TP+UNK)/(TP+FP+UNK) 250 | R = (TP+UNK)/(TP+UNK+FN) 251 | F1 = 2*(P*R)/(P+R) 252 | print("%02f\t%02f\t%02f"%(P,R,F1)) 253 | 254 | # score test data: 255 | TP=0. 256 | FP=0. 257 | TN=0. 258 | FN=0. 
259 | UNK=0 260 | for tg in target_db: 261 | if not ('/before/' in tg['path'] or '/after/' in tg['path']): 262 | continue # only before/after 263 | if len(tg['hits']) == 0: # nothing hit on this target 264 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 265 | TN +=1 266 | else: # something should have hit 267 | if(PRINT_FN): 268 | print("FN: %s" % (tg['path'])) 269 | FN += 1 270 | else: 271 | # something hit 272 | accounted_for = False 273 | for vg in tg['hits']: 274 | if vg['cve'] in tg['path'] and ('/vuln/' in tg['path'] or '/before/' in tg['path']): 275 | TP += 1 276 | #accounted_for = True 277 | #break 278 | elif vg['cve'] in tg['path'] and ('/patch/' in tg['path'] or '/after/' in tg['path']): 279 | FP += 1 280 | if PRINT_FP: 281 | print("FP: %s %s" % (vg['cve'], tg['path'])) 282 | #accounted_for = True 283 | #break 284 | else: 285 | #continue 286 | # Need to check 287 | tg_name = tg['path'].split('/')[-1][:-len('.gpickle')] 288 | vg_name = vg['func'] 289 | tg_time = int(tg['path'].split('/')[-4].split('_')[1]) 290 | vg_time = int(vg['hsh'].split('_')[1]) 291 | if tg_name == vg_name and tg_time < vg_time: 292 | TP += 1 293 | else: 294 | # check in manual labels 295 | found=False 296 | for label in manual_labels: 297 | label_split = label.rstrip().split(' ') 298 | if label_split[1] == vg['cve'] and label_split[2] in tg['path']: 299 | if label_split[0] == 'TP': 300 | TP += 1 301 | else: 302 | FP += 1 303 | if PRINT_FP: 304 | print("FP: %s %s" % (vg['cve'], tg['path'])) 305 | found=True 306 | break 307 | if not found: 308 | UNK += 1 309 | if(PRINT_UNK): 310 | print("UNK: %s %s/%s/%s/%s/%s)" % (tg['path'], vg['repo'], vg['cve'], vg['hsh'],vg['file'], vg['func'])) 311 | 312 | #if not accounted_for: 313 | # if '/patch/' in tg['path'] or '/after/' in tg['path']: 314 | # TN +=1 315 | # else: # something should have hit 316 | # FN += 1 317 | 318 | 319 | P = TP/(TP+FP) 320 | R = TP/(TP+FN) 321 | F1 = 2*(P*R)/(P+R) 322 | print("Test Score:") 323 | print("TP\tFP\tTN\tFN\tUNK\tP\tR\tF1") 324 | print("%d\t%d\t%d\t%d\t%d\t%02f\t%02f\t%02f"%(TP,FP,TN,FN,UNK,P,R,F1)) 325 | print("Worst Case:") 326 | P = TP/(TP+FP + UNK) 327 | R = TP/(TP+FN) 328 | F1 = 2*(P*R)/(P+R) 329 | print("%02f\t%02f\t%02f"%(P,R,F1)) 330 | print("Best Case:") 331 | P = (TP+UNK)/(TP+FP+UNK) 332 | R = (TP+UNK)/(TP+UNK+FN) 333 | F1 = 2*(P*R)/(P+R) 334 | print("%02f\t%02f\t%02f"%(P,R,F1)) 335 | 336 | # score test modified only: 337 | test_mod_tps = [] 338 | score_by_line_mods = [] 339 | TP=0. 340 | FP=0. 341 | TN=0. 342 | FN=0. 343 | UNK=0 344 | for tg in target_db: 345 | if not ('/before/' in tg['path'] or '/after/' in tg['path']): 346 | continue # only before/after 347 | 348 | d = '/'.join(tg['path'].split('/')[0:4]) # root/repo/CVE 349 | func = tg['path'].split('/')[-1] # function.gpickle 350 | 351 | if '/before/' in tg['path']: # should be vuln 352 | is_same = False 353 | for (root,dirs,files) in os.walk(d): 354 | if func in files and '/vuln/' in root: 355 | # This is orig vuln file. 
lets check for differences 356 | before_src = tg['path'].replace('/graph/', '/code/') 357 | before_src = before_src.replace('.gpickle','.c') 358 | vuln_src = (root + '/' + func).replace('/graph/','/code/') 359 | vuln_src = vuln_src.replace('.gpickle','.c') 360 | if filecmp.cmp(before_src, vuln_src): 361 | # found match 362 | is_same = True 363 | break 364 | if is_same: 365 | continue # skip this one 366 | 367 | # Count line diffs of files 368 | res = subprocess.check_output('diff %s %s | grep "^>" | wc -l' % (vuln_src, before_src), shell=True) 369 | num_right_mods = int(res.decode('utf-8').rstrip()) 370 | res = subprocess.check_output('diff %s %s | grep "^<" | wc -l' % (vuln_src, before_src), shell=True) 371 | num_left_mods = int(res.decode('utf-8').rstrip()) 372 | res = subprocess.check_output("wc -l %s | awk '{print $1}'" % (vuln_src), shell=True) 373 | num_lines_orig = int(res.decode('utf-8').rstrip()) 374 | #os.system('diff %s %s | grep "^>" | wc -l > scratch' % (vuln_src, before_src)) 375 | 376 | else: # after patch so should be patched 377 | is_same = False 378 | for (root,dirs,files) in os.walk(d): 379 | if func in files and '/patch/' in root: 380 | # This is orig vuln file. lets check for differences 381 | after_src = tg['path'].replace('/graph/', '/code/') 382 | after_src = after_src.replace('.gpickle','.c') 383 | patch_src = (root + '/' + func).replace('/graph/','/code/') 384 | patch_src = patch_src.replace('.gpickle','.c') 385 | if filecmp.cmp(after_src, patch_src): 386 | is_same = True 387 | break 388 | if is_same: 389 | continue # skip this 390 | 391 | # Count line diffs of files 392 | res = subprocess.check_output('diff %s %s | grep "^>" | wc -l' % (patch_src, after_src), shell=True) 393 | num_right_mods = int(res.decode('utf-8').rstrip()) 394 | res = subprocess.check_output('diff %s %s | grep "^<" | wc -l' % (patch_src, after_src), shell=True) 395 | num_left_mods = int(res.decode('utf-8').rstrip()) 396 | res = subprocess.check_output("wc -l %s | awk '{print $1}'" % (patch_src), shell=True) 397 | num_lines_orig = int(res.decode('utf-8').rstrip()) 398 | #os.system('diff %s %s | grep "^>" | wc -l > scratch' % (patch_src, after_src)) 399 | 400 | # If we make it here, this is either a before/after target graph which has 401 | # source code thats modified from the vuln/patch file used to generate vGraph 402 | 403 | # count num lines 404 | #with open('scratch', 'r') as fp: 405 | # num_mods = fp.readlines()[0] 406 | if num_right_mods == num_left_mods: 407 | print("Type-2") 408 | clone_type=2 409 | elif num_right_mods+num_left_mods > int(0.5*num_lines_orig): 410 | print("Type-4") 411 | clone_type=4 412 | else: 413 | print("Type-3") 414 | clone_type=3 415 | num_mods = num_right_mods 416 | #print("Num mods: %s" % num_mods) 417 | 418 | # score it 419 | if len(tg['hits']) == 0: # nothing hit on this target 420 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 421 | TN +=1 422 | score_by_line_mods.append([tg['path'],num_mods,clone_type,'TN']) 423 | else: # something should have hit 424 | FN += 1 425 | score_by_line_mods.append([tg['path'],num_mods,clone_type,'FN']) 426 | else: 427 | # something hit 428 | accounted_for = False 429 | for vg in tg['hits']: 430 | if vg['cve'] in tg['path'] and ('/vuln/' in tg['path'] or '/before/' in tg['path']): 431 | TP += 1 432 | score_by_line_mods.append([tg['path'],num_mods,clone_type,'TP']) 433 | accounted_for = True 434 | break 435 | #test_mod_tps.append((tg['path'],vg['cve'],vg['file'],vg['func'])) 436 | elif vg['cve'] in tg['path'] and 
('/patch/' in tg['path'] or '/after/' in tg['path']): 437 | FP += 1 438 | score_by_line_mods.append([tg['path'],num_mods,clone_type,'FP']) 439 | accounted_for = True 440 | break 441 | else: 442 | continue # Not considering cross-cve clones for per-line mode experiment 443 | # Need to check 444 | tg_name = tg['path'].split('/')[-1][:-len('.gpickle')] 445 | vg_name = vg['func'] 446 | tg_time = int(tg['path'].split('/')[-4].split('_')[1]) 447 | vg_time = int(vg['hsh'].split('_')[1]) 448 | if tg_name == vg_name and tg_time < vg_time: 449 | test_mod_tps.append((tg['path'],vg['cve'],vg['file'],vg['func'])) 450 | TP += 1 451 | else: 452 | # check in manual labels 453 | found=False 454 | for label in manual_labels: 455 | label_split = label.rstrip().split(' ') 456 | if label_split[1] == vg['cve'] and label_split[2] in tg['path']: 457 | if label_split[0] == 'TP': 458 | TP += 1 459 | else: 460 | FP += 1 461 | found=True 462 | break 463 | if not found: 464 | UNK += 1 465 | if(PRINT_UNK): 466 | print("UNK: %s %s/%s/%s/%s/%s)" % (tg['path'], vg['repo'], vg['cve'], vg['hsh'],vg['file'], vg['func'])) 467 | 468 | if not accounted_for: 469 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 470 | TN +=1 471 | score_by_line_mods.append([tg['path'],num_mods,clone_type,'TN']) 472 | else: # something should have hit 473 | FN += 1 474 | score_by_line_mods.append([tg['path'],num_mods,clone_type,'FN']) 475 | 476 | 477 | with open('vgraph_score_by_line_mods.txt','w') as fp: 478 | for s in score_by_line_mods: 479 | fp.write("%s %s %s %s\n" % (s[0], s[1], s[2], s[3])) 480 | 481 | 482 | 483 | 484 | P = TP/(TP+FP) 485 | R = TP/(TP+FN) 486 | F1 = 2*(P*R)/(P+R) 487 | print("Test Modified") 488 | print("TP\tFP\tTN\tFN\tUNK\tP\tR\tF1") 489 | print("%d\t%d\t%d\t%d\t%d\t%02f\t%02f\t%02f"%(TP,FP,TN,FN,UNK,P,R,F1)) 490 | print("Worst Case:") 491 | P = TP/(TP+FP + UNK) 492 | R = TP/(TP+FN) 493 | F1 = 2*(P*R)/(P+R) 494 | print("%02f\t%02f\t%02f"%(P,R,F1)) 495 | print("Best Case:") 496 | P = (TP+UNK)/(TP+FP+UNK) 497 | R = (TP+UNK)/(TP+UNK+FN) 498 | F1 = 2*(P*R)/(P+R) 499 | print("%02f\t%02f\t%02f"%(P,R,F1)) 500 | 501 | 502 | def eval_vgraph_mods_only(vgraph_db, target_db, gt, manual_labels): 503 | # score test modified 504 | test_mod_tps = [] 505 | score_by_line_mods = [] 506 | TP=0. 507 | FP=0. 508 | TN=0. 509 | FN=0. 510 | UNK=0 511 | for tg in target_db: 512 | if not ('/before/' in tg['path'] or '/after/' in tg['path']): 513 | continue # only before/after 514 | 515 | d = '/'.join(tg['path'].split('/')[0:4]) # root/repo/CVE 516 | func = tg['path'].split('/')[-1] # function.gpickle 517 | 518 | if '/before/' in tg['path']: # should be vuln 519 | is_same = False 520 | for (root,dirs,files) in os.walk(d): 521 | if func in files and '/vuln/' in root: 522 | # This is orig vuln file. lets check for differences 523 | before_src = tg['path'].replace('/graph/', '/code/') 524 | before_src = before_src.replace('.gpickle','.c') 525 | vuln_src = (root + '/' + func).replace('/graph/','/code/') 526 | vuln_src = vuln_src.replace('.gpickle','.c') 527 | if filecmp.cmp(before_src, vuln_src): 528 | # found match 529 | is_same = True 530 | break 531 | if is_same: 532 | continue # skip this one 533 | print('diff %s %s | grep "^>" | wc -l > scratch' % (vuln_src, before_src)) 534 | #os.system('diff %s %s | grep "^>" | wc -l > scratch' % (vuln_src, before_src)) 535 | else: # after patch so should be patched 536 | is_same = False 537 | for (root,dirs,files) in os.walk(d): 538 | if func in files and '/patch/' in root: 539 | # This is orig vuln file. 
lets check for differences 540 | after_src = tg['path'].replace('/graph/', '/code/') 541 | after_src = after_src.replace('.gpickle','.c') 542 | patch_src = (root + '/' + func).replace('/graph/','/code/') 543 | patch_src = patch_src.replace('.gpickle','.c') 544 | if filecmp.cmp(after_src, patch_src): 545 | is_same = True 546 | break 547 | if is_same: 548 | continue # skip this 549 | #os.system('diff %s %s | grep "^>" | wc -l > scratch' % (patch_src, after_src)) 550 | # If we make it here, this is either a before/after target graph which has 551 | # source code thats modified from the vuln/patch file used to generate vGraph 552 | 553 | # count num lines 554 | #with open('scratch', 'r') as fp: 555 | # num_mods = fp.readlines()[0] 556 | #print("Num mods: %s" % num_mods) 557 | 558 | # score it 559 | if len(tg['hits']) == 0: # nothing hit on this target 560 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 561 | TN +=1 562 | #score_by_line_mods.append([tg['path'],num_mods,'TN']) 563 | else: # something should have hit 564 | FN += 1 565 | #score_by_line_mods.append([tg['path'],num_mods,'FN']) 566 | else: 567 | # something hit 568 | accounted_for = False 569 | for vg in tg['hits']: 570 | if vg['cve'] in tg['path'] and ('/vuln/' in tg['path'] or '/before/' in tg['path']): 571 | TP += 1 572 | #score_by_line_mods.append([tg['path'],num_mods,'TP']) 573 | accounted_for = True 574 | break 575 | #test_mod_tps.append((tg['path'],vg['cve'],vg['file'],vg['func'])) 576 | elif vg['cve'] in tg['path'] and ('/patch/' in tg['path'] or '/after/' in tg['path']): 577 | FP += 1 578 | #score_by_line_mods.append([tg['path'],num_mods,'FP']) 579 | accounted_for = True 580 | break 581 | else: 582 | continue 583 | # Need to check 584 | tg_name = tg['path'].split('/')[-1][:-len('.gpickle')] 585 | vg_name = vg['func'] 586 | tg_time = int(tg['path'].split('/')[-4].split('_')[1]) 587 | vg_time = int(vg['hsh'].split('_')[1]) 588 | if tg_name == vg_name and tg_time < vg_time: 589 | # test_mod_tps.append((tg['path'],vg['cve'],vg['file'],vg['func'])) 590 | TP += 1 591 | else: 592 | # check in manual labels 593 | found=False 594 | for label in manual_labels: 595 | label_split = label.rstrip().split(' ') 596 | if label_split[1] == vg['cve'] and label_split[2] in tg['path']: 597 | if label_split[0] == 'TP': 598 | TP += 1 599 | else: 600 | FP += 1 601 | found=True 602 | break 603 | if not found: 604 | UNK += 1 605 | if(PRINT_UNK): 606 | print("UNK: %s %s/%s/%s/%s/%s)" % (tg['path'], vg['repo'], vg['cve'], vg['hsh'],vg['file'], vg['func'])) 607 | 608 | if not accounted_for: 609 | if '/patch/' in tg['path'] or '/after/' in tg['path']: 610 | TN +=1 611 | #score_by_line_mods.append([tg['path'],num_mods,'TN']) 612 | else: # something should have hit 613 | FN += 1 614 | #score_by_line_mods.append([tg['path'],num_mods,'FN']) 615 | #with open('vgraph_score_by_line_mods.txt','w') as fp: 616 | # for s in score_by_line_mods: 617 | # fp.write("%s %s %s\n" % (s[0], s[1], s[2])) 618 | 619 | 620 | 621 | try: 622 | P = TP/(TP+FP) 623 | except: 624 | P=0. 625 | try: 626 | R = TP/(TP+FN) 627 | except: 628 | R = 0. 629 | try: 630 | F1 = 2*(P*R)/(P+R) 631 | except: 632 | F1 = 0. 
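# Scoring note: P = TP/(TP+FP), R = TP/(TP+FN), F1 = 2*P*R/(P+R). Hits that cannot be
# confirmed automatically or via manual_labels.txt are tallied as UNK; in the tables printed
# elsewhere in this script, the "Worst Case" numbers treat every UNK as a false positive and
# the "Best Case" numbers treat every UNK as a true positive.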
633 | print("Test Modified") 634 | print("TP\tFP\tTN\tFN\tUNK\tP\tR\tF1") 635 | print("%d\t%d\t%d\t%d\t%d\t%02f\t%02f\t%02f"%(TP,FP,TN,FN,UNK,P,R,F1)) 636 | 637 | ############################################################################################## 638 | # main 639 | ############################################################################################# 640 | PRINT_FN=True 641 | PRINT_FP=True 642 | PRINT_UNK=False 643 | CVG_THRESH=25 644 | PVG_THRESH=60 645 | NUM_PROCS=20 646 | 647 | print("Loading VGraph DB...") 648 | vgraph_db = load_vgraph_db('data/vgraph_db') 649 | func_list = [] 650 | for vg in vgraph_db: 651 | if vg['func'] not in func_list: 652 | func_list.append(vg['func']) 653 | 654 | # load manual labels 655 | with open('./manual_labels.txt', 'r') as fp: 656 | manual_labels = fp.readlines() 657 | 658 | print("Loading target graphs..") 659 | target_db = load_target_db('data/vuln_patch_graph_db', func_list) 660 | target_db_clean = [] 661 | for tg in target_db: 662 | cve = tg['path'].split('/')[3] 663 | func = tg['base_name'] 664 | for vg in vgraph_db: 665 | if vg['cve'] == cve and vg['func'] == func: 666 | # function has a vgraph, so we will compare 667 | target_db_clean.append(tg) 668 | break 669 | 670 | print("Calculating ground truth...") 671 | gt = generate_ground_truth(target_db_clean) 672 | #start_time = time.time() 673 | #for thresh_c in [ 0, 20, 40, 60, 80, 100 ]: 674 | # for thresh_p in [ 0, 20, 40, 60, 80, 100]: 675 | # print("thresh_c: %d" % thresh_c) 676 | # print("thresh_p: %d" % thresh_p) 677 | # PVG_THRESH = thresh_p 678 | # CVG_THRESH = thresh_c 679 | # scores = get_hits(vgraph_db, target_db_clean) 680 | # eval_vgraph_mods_only(vgraph_db, target_db_clean, gt, manual_labels) 681 | 682 | if os.path.exists('evaluate_vgraph_scores.pkl'): 683 | print("Loading saved results...") 684 | scores = pkl.load(open('evaluate_vgraph_scores.pkl', 'rb')) 685 | target_db_clean = pkl.load(open('evaluate_vgraph_target_db.pkl', 'rb')) 686 | else: 687 | start_time = time.time() 688 | scores = get_hits(vgraph_db, target_db_clean) 689 | pkl.dump(scores, open('evaluate_vgraph_scores.pkl', 'wb')) 690 | pkl.dump(target_db_clean, open('evaluate_vgraph_target_db.pkl', 'wb')) 691 | end_time = time.time() 692 | print("Time to generate results: %d" % (end_time - start_time)) 693 | #eval_vgraph_mods_only(vgraph_db, target_db_clean, gt, manual_labels) 694 | eval_vgraph(vgraph_db, target_db_clean, gt, manual_labels) 695 | 696 | 697 | #with open('eval_all_scores.txt','w') as fp: 698 | # for score in scores: 699 | # fp.write("%s/%s/%s %s %d %d %d\n" % (score[0], score[1], score[2], score[3], score[4], score[5], score[6])) 700 | -------------------------------------------------------------------------------- /find_vulns.py: -------------------------------------------------------------------------------- 1 | # Script for running expermints 2 | 3 | import os 4 | import sys 5 | import filecmp 6 | import pickle as pkl 7 | import networkx as nx 8 | import numpy as np 9 | import pickle as pkl 10 | from tqdm import tqdm 11 | 12 | from src.graph.utils import tripleize, load_vgraph_db, load_target_db 13 | from src.matching.triplet_match import * 14 | 15 | # Thresholds for when a function is flagged 16 | CVG_THRESH=50 17 | PVG_THRESH=50 18 | NVG_THRESH=50 19 | 20 | def decision_function(cvg_score, pvg_score, nvg_score): 21 | return cvg_score > CVG_THRESH and pvg_score > PVG_THRESH and nvg_score < NVG_THRESH 22 | 23 | def log(log_p, line): 24 | log_p.write(line) 25 | 26 | def print_usage(): 27 | 
print("Usage: python find_vulns.py <target_path> <score_file> <hit_file> <config>") 28 | print("\ttarget_path : Location of target code property graphs") 29 | print("\tscore_file : file to write all scores for each vGraph and target graph. Used for Evaluations") 30 | print("\thit_file : file to write all hits to (funcs that pass threshold)") 31 | print("\tconfig : [e]xact matching or [a]pproximate matching") 32 | 33 | 34 | if __name__ == "__main__": 35 | if len(sys.argv) != 5: 36 | print_usage() 37 | exit() 38 | 39 | # Process args 40 | target_path = sys.argv[1] 41 | stats_file = sys.argv[2] 42 | hit_file = sys.argv[3] 43 | config = sys.argv[4] 44 | 45 | print("Target Path: ", target_path) 46 | print("Stats File: ", stats_file) 47 | print("Hit File: ", hit_file) 48 | print("Config: ", config) 49 | 50 | stats_fp = open(stats_file, 'w') 51 | hit_fp = open(hit_file, 'w') 52 | 53 | print("Loading target db...") 54 | target_db = load_target_db(target_path) 55 | print("Found %d target graphs" % len(target_db)) 56 | 57 | print("Loading vgraph db...") 58 | vgraph_db = load_vgraph_db('./data/vgraph_db') 59 | print("Found %d vgraphs" % len(vgraph_db)) 60 | 61 | print("Finding vulns...") 62 | num_hits = 0 63 | pbar = tqdm(total=len(target_db)*len(vgraph_db)) 64 | for vg in vgraph_db: 65 | for tg in target_db: 66 | t_trips = tg['triples'] 67 | 68 | if config == "e": # exact matching 69 | cvg_score, pvg_score, nvg_score = triplet_match_exact(vg, t_trips) 70 | elif config == "a": # approximate matching 71 | cvg_score, pvg_score, nvg_score = triplet_match_approx(vg, t_trips) 72 | 73 | pbar.update(1) 74 | 75 | # Log all results to stats file 76 | log(stats_fp, "%s/%s/%s/%s/%s %s %d %d %d\n" % (vg['repo'],vg['cve'],vg['hsh'],vg['file'],vg['func'],tg['path'], cvg_score, pvg_score, nvg_score)) 77 | 78 | if decision_function(cvg_score,pvg_score,nvg_score): 79 | # only log hits to the hits file 80 | log(hit_fp, "%s/%s/%s/%s/%s %s\n" % (vg['repo'],vg['cve'],vg['hsh'],vg['file'],vg['func'],tg['path'])) 81 | num_hits += 1 82 | pbar.close() 83 | print("Done! Found %d hits!" % num_hits) 84 | -------------------------------------------------------------------------------- /gen_target_graph_db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | JOERN='/mnt/raid0_huge/bbowman/joern_testing/joern/joern-parse' 3 | MAX_NUM_PROCS=20 4 | TARGET_GRAPH_DB=`pwd`"/data/target_graph_db" 5 | TARGET_CODE_DIR=$1 6 | 7 | mkdir -p $TARGET_GRAPH_DB 8 | 9 | echo "Generating CPGs for target directory: $TARGET_CODE_DIR" 10 | $JOERN $TARGET_CODE_DIR 11 | 12 | # Rename joern output dir 13 | mv parsed parsed_target 14 | 15 | echo "Generating vectors and triples..."
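# The loop below is the conversion step: every nodes.csv emitted by joern-parse under
# parsed_target is handed to parsed_to_networkx.py, which writes the per-function NetworkX
# graph (.gpickle), triples (.triples), and vector (.vec) pickles under $TARGET_GRAPH_DB.
# Conversions run as background jobs, throttled to MAX_NUM_PROCS by polling `jobs | wc -l`.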
16 | for line in `find parsed_target -name 'nodes.csv'`; do 17 | name=`echo $line | awk '{print $1}'` 18 | id=`echo $line | awk '{print $2}'` 19 | path=`echo $line | awk '{print $3}'`1 20 | echo "Generating CPG for $name" 21 | mkdir -p $TARGET_GRAPH_DB/$path 22 | python parsed_to_networkx.py $line $TARGET_GRAPH_DB & 23 | 24 | while [ "`jobs | wc -l`" -gt "$MAX_NUM_PROCS" ]; do 25 | sleep 0.5 26 | done 27 | 28 | done 29 | 30 | -------------------------------------------------------------------------------- /gen_vgraph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import networkx as nx 4 | import pickle as pkl 5 | 6 | from src.graph.utils import tripleize, vectorize 7 | 8 | 9 | def gen_triplets(V,P): 10 | ''' 11 | Generate sets of triplets from V and P, each in the form CODE,RELATIONSHIP,CODE (or TYPE,RELATIONSHIP,TYPE if there is no code) 12 | CVG= { set shared by both } 13 | PVG = { set contained by V but not P } 14 | NVG = { set contained by P but not V } 15 | ''' 16 | V_trips = tripleize(V) 17 | P_trips = tripleize(P) 18 | 19 | print("Num V triplets: %d" % len(V_trips)) 20 | print("Num P triplets: %d" % len(P_trips)) 21 | 22 | cvg=V_trips.intersection(P_trips) 23 | pvg=V_trips.difference(P_trips) 24 | nvg=P_trips.difference(V_trips) 25 | 26 | return cvg, pvg, nvg, V_trips, P_trips 27 | 28 | 29 | 30 | def print_statistics(file_path, v_size, p_size, cvg_size, pvg_size, nvg_size): 31 | print("%s\t%d\t%d\t%d\t%d\t%d" % (file_path, v_size, p_size,cvg_size, pvg_size, nvg_size)) 32 | 33 | 34 | 35 | 36 | 37 | 38 | if __name__ == "__main__": 39 | 40 | if len(sys.argv) != 5: 41 | print("Usage: python gen_vgraph.py <vuln_graph> <patch_graph> <output_path> <output_name>") 42 | exit() 43 | 44 | # Read inputs 45 | vuln_graph = sys.argv[1] 46 | patch_graph = sys.argv[2] 47 | output_path = sys.argv[3] 48 | output_name= sys.argv[4] 49 | 50 | # vgraph ID 51 | vuln_function = output_path + '/' + output_name 52 | 53 | # Graph Outputs 54 | pvg_output_file = output_path + '/' + output_name + "_pvg.pkl" 55 | nvg_output_file = output_path + '/' + output_name + "_nvg.pkl" 56 | cvg_output_file = output_path + '/' + output_name + "_cvg.pkl" 57 | v_output_file = output_path + '/' + output_name + "_v.pkl" 58 | p_output_file = output_path + '/' + output_name + "_p.pkl" 59 | # Vector Output 60 | vec_output_file = output_path + '/' + output_name + "_vec.pkl" 61 | 62 | # Read in the vulnerable and patched graphs 63 | V = nx.read_gpickle(vuln_graph) 64 | P = nx.read_gpickle(patch_graph) 65 | print("V size: %d" % len(V.nodes)) 66 | print("P size: %d" % len(P.nodes)) 67 | 68 | cvg, pvg, nvg, V_trips, P_trips = gen_triplets(V,P) 69 | vec = vectorize(V) 70 | # Check here to make sure we generated some meaningful information 71 | if len(cvg) == 0 or len(pvg) == 0 or len(nvg) == 0: 72 | print("Error: vGraph critical component empty.
Skipping") 73 | print_statistics(vuln_function, len(V.nodes), len(P.nodes), len(cvg), len(pvg), len(nvg)) 74 | exit() 75 | 76 | # if we get here were good 77 | if not os.path.exists(output_path): 78 | os.makedirs(output_path) 79 | 80 | pkl.dump(cvg, open(cvg_output_file, 'wb')) 81 | pkl.dump(pvg, open(pvg_output_file, 'wb')) 82 | pkl.dump(nvg, open(nvg_output_file, 'wb')) 83 | pkl.dump(V_trips, open(v_output_file, 'wb')) 84 | pkl.dump(P_trips, open(p_output_file, 'wb')) 85 | pkl.dump(vec, open(vec_output_file, 'wb')) 86 | 87 | # Print final statistics 88 | print_statistics(vuln_function, len(V.nodes), len(P.nodes), len(cvg), len(pvg), len(nvg)) 89 | -------------------------------------------------------------------------------- /gen_vgraph_db.sh: -------------------------------------------------------------------------------- 1 | # This function will find a CPG of patch and vuln function and generate the core graphs and nodes 2 | MAX_NUM_PROCS=4 3 | LOG_FILE='gen_vgraph_db.log' 4 | VULN_PATCH_DB='data/vuln_patch_graph_db' 5 | VGRAPH_DB='data/vgraph_db' 6 | 7 | echo "Logging to $LOG_FILE..." 8 | 9 | for repo in `ls $VULN_PATCH_DB`; do 10 | for cve in `ls $VULN_PATCH_DB/$repo`; do 11 | for hsh in `ls $VULN_PATCH_DB/$repo/$cve/vuln/`; do 12 | for src_file in `ls $VULN_PATCH_DB/$repo/$cve/vuln/$hsh`; do 13 | for g in `ls $VULN_PATCH_DB/$repo/$cve/vuln/$hsh/$src_file/graph | grep 'gpickle'`; do 14 | func=`echo $g | sed 's/.gpickle//'` 15 | if [ ! -f $VULN_PATCH_DB/$repo/$cve/vuln/$hsh/$src_file/graph/${func}.gpickle ]; then 16 | echo "Missing vulnerable graph for ${repo} ${cve} ${func}...Skipping" >> $LOG_FILE 17 | elif [ ! -f $VULN_PATCH_DB/$repo/$cve/patch/$hsh/$src_file/graph/${func}.gpickle ]; then 18 | echo "Missing patched graph for ${repo} ${cve} ${func}...Skipping" >> $LOG_FILE 19 | else 20 | # Should have everything we need 21 | python gen_vgraph.py $VULN_PATCH_DB/$repo/$cve/vuln/$hsh/$src_file/graph/${func}.gpickle $VULN_PATCH_DB/$repo/$cve/patch/$hsh/$src_file/graph/${func}.gpickle $VGRAPH_DB/$repo/$cve/$hsh/$src_file $func >> $LOG_FILE & 22 | fi 23 | while [ "`jobs | wc -l`" -gt "$MAX_NUM_PROCS" ]; do 24 | sleep 0.5 25 | done 26 | done 27 | done 28 | done 29 | done 30 | done 31 | 32 | -------------------------------------------------------------------------------- /mine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REPO_DIR=`pwd`"/data/repos" 3 | COMMIT_DIR=`pwd`"/data/commits" 4 | COMMIT_GREP_STRING='CVE-20' 5 | VULN_PATCH_DIR=`pwd`"/data/vuln_patch_src_db" 6 | VULN_PATCH_GRAPH_DIR=`pwd`"/data/vuln_patch_graph_db" 7 | SCRATCH_FILE=`pwd`"/mine.scratch" 8 | JOERN="path-to-joern-parse" 9 | 10 | if [ "${JOERN}" == "path-to-joern-parse" ]; then 11 | echo "You need to edit this script to tell it where the joern-parse executable is." 12 | echo "Edit the script and enter this as the JOERN variable at top of the file." 
13 | exit 14 | fi 15 | 16 | 17 | 18 | 19 | source src/code/gen_src_files.sh 20 | 21 | # Checkout Repositories 22 | echo "Checking out all repositories in repos.config" 23 | mkdir -p $REPO_DIR 24 | START_TIME=$(date +%s) 25 | while read line; do 26 | name=`echo $line | awk '{print $1}'` 27 | url=`echo $line | awk '{print $2}'` 28 | if [ -d "$REPO_DIR/$name" ]; then 29 | echo "Repository Exists: $name" 30 | continue 31 | fi 32 | mkdir $REPO_DIR/$name 33 | echo "Checking out Repository: $name" 34 | git clone $url $REPO_DIR/$name 35 | done < ./repos.config 36 | END_TIME=$(date +%s) 37 | let ELAPSED_TIME=$END_TIME-$START_TIME 38 | echo "Elapsed time: $ELAPSED_TIME" 39 | 40 | 41 | # Generate Relevant Commits and download associated src files 42 | echo "Searching for interesting commits" 43 | mkdir -p $COMMIT_DIR 44 | START_TIME=$(date +%s) 45 | for d in `ls $REPO_DIR`; do 46 | if [ -f "$COMMIT_DIR/$d.commits" ]; then 47 | echo "Commits exist: $d" 48 | continue 49 | fi 50 | cd $REPO_DIR/$d # need to change directories to use the git commands 51 | git log --no-merges --grep=$COMMIT_GREP_STRING | grep "^commit" | awk '{print $2}' > $COMMIT_DIR/$d.commits 52 | cd - > /dev/null 53 | 54 | echo "Downloading src code for commits" 55 | mkdir -p $VULN_PATCH_DIR 56 | # Download code associated with commit 57 | if [ -f "$COMMIT_DIR/$d.commits" ]; then 58 | # Found some commits to process 59 | process_commit_file "$COMMIT_DIR/$d.commits" $REPO_DIR/$d $d 60 | fi 61 | done 62 | END_TIME=$(date +%s) 63 | let ELAPSED_TIME=$END_TIME-$START_TIME 64 | echo "Elapsed time: $ELAPSED_TIME" 65 | 66 | # Use Joern to parse the directory 67 | echo "Parsing $VULN_PATCH_DIR with Joern" 68 | START_TIME=$(date +%s) 69 | $JOERN $VULN_PATCH_DIR # generates a parsed directory containing our parsed data 70 | END_TIME=$(date +%s) 71 | let ELAPSED_TIME=$END_TIME-$START_TIME 72 | echo "Elapsed time: $ELAPSED_TIME" 73 | 74 | # Rename parsed dir so it doesn't get confused with other parsed output 75 | mv parsed parsed_mine 76 | 77 | # Generating graphs and extracting code 78 | echo "Generating vuln patch graph database..."
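# convert_parsed.py (run below) walks the Joern output in parsed_mine and, for every function named
# in a commit's funcnames file, writes a NetworkX graph plus .triples/.vec pickles and the extracted
# function source. The resulting layout is roughly:
#   data/vuln_patch_graph_db/<repo>/<CVE>/<vuln|patch|before|after>/<commit>_<timestamp>/<file>/graph/<func>.gpickle
#   data/vuln_patch_graph_db/<repo>/<CVE>/<vuln|patch|before|after>/<commit>_<timestamp>/<file>/code/<func>.c
# which is what gen_vgraph_db.sh expects to find.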
79 | START_TIME=$(date +%s) 80 | python convert_parsed.py $VULN_PATCH_DIR parsed_mine $VULN_PATCH_GRAPH_DIR 81 | END_TIME=$(date +%s) 82 | let ELAPSED_TIME=$END_TIME-$START_TIME 83 | echo "Elapsed time: $ELAPSED_TIME" 84 | -------------------------------------------------------------------------------- /parsed_to_networkx.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import networkx as nx 5 | import pickle as pkl 6 | from src.graph.utils import * 7 | 8 | 9 | def write_graph(graph, output_dir, func_name): 10 | graph_path = output_dir + '/' + func_name + '.gpickle' 11 | triple_path = output_dir + '/' + func_name + '.triples' 12 | vector_path = output_dir + '/' + func_name + '.vec' 13 | 14 | if not os.path.exists(output_dir): 15 | os.makedirs(output_dir) 16 | 17 | print("Writing graph: %s" % graph_path) 18 | nx.write_gpickle(graph, graph_path) 19 | 20 | trips = tripleize(graph) 21 | print("Writing triples: %s" % triple_path) 22 | pkl.dump(trips, open(triple_path, 'wb')) 23 | 24 | vec = vectorize(graph) 25 | print("Writing vector: %s" % vector_path) 26 | pkl.dump(vec, open(vector_path, 'wb')) 27 | 28 | 29 | def print_usage(): 30 | print("Usage: python parsed_to_networkx.py <parsed_nodes_file> <output_dir>") 31 | 32 | if __name__ == "__main__": 33 | if len(sys.argv) != 3: 34 | print_usage() 35 | exit() 36 | parsed_nodes_file = sys.argv[1] 37 | output_dir = sys.argv[2] 38 | 39 | base_dir = parsed_nodes_file[:-len('nodes.csv')] 40 | 41 | parsed_edges_file = base_dir + 'edges.csv' 42 | 43 | print("Nodes: %s" % parsed_nodes_file) 44 | print("Edges: %s" % parsed_edges_file) 45 | print("Output: %s" % output_dir) 46 | 47 | graphs, num_funcs = joern_to_networkx(parsed_nodes_file, parsed_edges_file) # joern_to_networkx returns (graphs, total_funcs_parsed) 48 | for g in graphs: 49 | write_graph(g['graph'], output_dir + '/' + base_dir, g['name']) 50 | -------------------------------------------------------------------------------- /repos.config: -------------------------------------------------------------------------------- 1 | ffmpeg https://github.com/FFmpeg/FFmpeg 2 | libav https://github.com/libav/libav 3 | openssl https://github.com/openssl/openssl.git 4 | libtiff https://gitlab.com/libtiff/libtiff.git 5 | linux https://github.com/torvalds/linux.git 6 | tcpdump https://github.com/the-tcpdump-group/tcpdump.git 7 | qemu https://github.com/qemu/qemu.git 8 | xen https://github.com/xen-project/xen.git 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | decorator==4.4.0 2 | networkx==2.3 3 | numpy==1.17.2 4 | tqdm==4.36.1 5 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/__init__.py -------------------------------------------------------------------------------- /src/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/__init__.pyc -------------------------------------------------------------------------------- /src/code/gen_src_files.sh: -------------------------------------------------------------------------------- 1 | # This is the next step in building the vulnerability source code database 2 | # This will go through all
relevant commit files for each repository 3 | # and inspect further the commit 4 | 5 | # We will skip this commit (and in effect this CVE) if: 6 | # - there are more than 5 files covered in the commit 7 | # - The file name modified is not a C/C++ fle 8 | 9 | # If we make it past the previous two checks, we will extract 10 | # the ID of the original vulnerable file, and the patched file. 11 | # Then we perform a 'git show ' and direct that to a file 12 | # in the directory structure ./src_files///(vuln|patch)/ 13 | # Additionally, we will write all modified functions to a file 14 | # named 'funcnames' in the directory as well. This way 15 | # we know which functions are modified from vuln -> patch. This helps with 16 | # processing later. 17 | 18 | 19 | function download_data { 20 | # Read arguments 21 | local commit_hash=$1 22 | local codebase=$2 23 | local cve=$3 24 | local path=$4 25 | local funcs=$5 26 | local vuln_hash=$6 27 | local patch_hash=$7 28 | 29 | # before and after commit vars 30 | local before_commit="" 31 | local before_commit_1mo="" 32 | local before_commit_6mo="" 33 | local after_commit="" 34 | local after_commit_1mo="" 35 | local after_commit_6mo="" 36 | 37 | # Get linux epoch time of commit 38 | local timestamp=`git log -1 --pretty=format:"%at" $commit_hash` 39 | 40 | # Compute cutoff times for harvesting more before/after commits 41 | let local timestamp_1mo_before=$timestamp-2592000 42 | let local timestamp_6mo_before=$timestamp-15552000 43 | let local timestamp_1mo_after=$timestamp+2592000 44 | let local timestamp_6mo_after=$timestamp+15552000 45 | 46 | log "Downloading source code: $cve $codebase $path $funcs $commit_hash" 47 | 48 | # Make any necessary directories 49 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/vuln/${commit_hash}_${timestamp} 50 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/patch/${commit_hash}_${timestamp} 51 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/before 52 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/after 53 | 54 | # Download vulnerable file and patched file. 55 | git show $vuln_hash > $VULN_PATCH_DIR/$codebase/$cve/vuln/${commit_hash}_${timestamp}/$curr_file 56 | git show $patch_hash > $VULN_PATCH_DIR/$codebase/$cve/patch/${commit_hash}_${timestamp}/$curr_file 57 | 58 | # Save function names so we know which functions to parse out later 59 | echo "$funcs" | tr '|' '\n' | sed '/^$/d' | sort | uniq > $VULN_PATCH_DIR/$codebase/$cve/funcnames 60 | 61 | # Immediately before the patching commit 62 | local before_commit=`git rev-list $commit_hash -- $path | head -2 | grep -v $commit_hash` 63 | 64 | if [ "$before_commit" != "" ]; then 65 | # If we found a before commit, let's look for 1 month before 66 | local before_commit_1mo=`git rev-list --min-age=$timestamp_1mo_before $commit_hash -- $path | grep -v $before_commit | head -1` 67 | fi 68 | 69 | if [ "$before_commit_1mo" != "" ]; then 70 | # If we found a 1 month before commit, lets look for a 6 month before 71 | local before_commit_6mo=`git rev-list --min-age=$timestamp_6mo_before $commit_hash -- $path | grep -v $before_commit | grep -v $before_commit_1mo | head -1` 72 | fi 73 | 74 | # Immediately after the patching commit 75 | local after_commit=`git rev-list --ancestry-path ${commit_hash}..HEAD -- $path | tail -1` 76 | 77 | # There is no easy way to get after commits after some time (annoying) 78 | # Instead, we get all commits between the patching commit, and our cutoff time 79 | # Then we choose the one directly after the last one we found in that range. 
80 | 81 | # Only makes sense to conitnue if we found an initial after commit 82 | if [ "$after_commit" != "" ]; then 83 | # this gets the NEWEST commit that is still OLDER than time cutoff. 84 | local tmp_hash=`git rev-list --ancestry-path --before=$timestamp_1mo_after ${commit_hash}..HEAD -- $path | head -1` 85 | if [ "$tmp_hash" == "" ]; then 86 | # if tmp_hash is empty, then all commits are after the cutoff time... so just grab the next one 87 | # don't forget to remove to he $after_commit we just harvested (provided its not empty) 88 | local after_commit_1mo=`git rev-list --ancestry-path ${commit_hash}..HEAD -- $path | grep -v $after_commit | tail -1` 89 | else 90 | # otherwise use the tmp hash 91 | local after_commit_1mo=`git rev-list --ancestry-path ${tmp_hash}..HEAD -- $path | tail -1` 92 | fi 93 | fi 94 | 95 | # only makes sense to continue if we found a 1month commit 96 | if [ "$after_commit_1mo" != "" ]; then 97 | local tmp_hash=`git rev-list --ancestry-path --before=$timestamp_6mo_after ${commit_hash}..HEAD -- $path | head -1` 98 | if [ "$tmp_hash" == "" ]; then 99 | local after_commit_6mo=`git rev-list --ancestry-path ${commit_hash}..HEAD -- $path | grep -v $after_commit | grep -v $after_commit_1mo | tail -1` 100 | else 101 | # otherwise use the tmp hash 102 | local after_commit_6mo=`git rev-list --ancestry-path ${tmp_hash}..HEAD -- $path | tail -1` 103 | fi 104 | fi 105 | 106 | # If we identified any before or after commits, we download them 107 | # Before commits 108 | if [ "$before_commit" != "" ]; then 109 | local before_timestamp=`git log -1 --pretty=format:"%at" $before_commit` 110 | log "Downloading commit immediately before patch: $cve $codebase $path $funcs $before_commit" 111 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/before/${before_commit}_${before_timestamp} 112 | git show $before_commit:$path > $VULN_PATCH_DIR/$codebase/$cve/before/${before_commit}_${before_timestamp}/${curr_file} 113 | fi 114 | 115 | if [ "$before_commit_1mo" != "" ]; then 116 | local before_timestamp_1mo=`git log -1 --pretty=format:"%at" $before_commit_1mo` 117 | log "Downloading commit 1 month before patch: $cve $codebase $path $funcs $before_commit_1mo" 118 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/before/${before_commit_1mo}_${before_timestamp_1mo} 119 | git show $before_commit_1mo:$path > $VULN_PATCH_DIR/$codebase/$cve/before/${before_commit_1mo}_${before_timestamp_1mo}/${curr_file} 120 | fi 121 | 122 | if [ "$before_commit_6mo" != "" ]; then 123 | local before_timestamp_6mo=`git log -1 --pretty=format:"%at" $before_commit_6mo` 124 | log "Downloading commit 6 month before patch: $cve $codebase $path $funcs $before_commit_6mo" 125 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/before/${before_commit_6mo}_${before_timestamp_6mo} 126 | git show $before_commit_6mo:$path > $VULN_PATCH_DIR/$codebase/$cve/before/${before_commit_6mo}_${before_timestamp_6mo}/${curr_file} 127 | fi 128 | # After Commits 129 | if [ "$after_commit" != "" ]; then 130 | local after_timestamp=`git log -1 --pretty=format:"%at" $after_commit` 131 | log "Downloading commit immediately after patch: $cve $codebase $path $funcs $after_commit" 132 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/after/${after_commit}_${after_timestamp} 133 | git show $after_commit:$path > $VULN_PATCH_DIR/$codebase/$cve/after/${after_commit}_${after_timestamp}/${curr_file} 134 | fi 135 | 136 | if [ "$after_commit_1mo" != "" ]; then 137 | local after_timestamp_1mo=`git log -1 --pretty=format:"%at" $after_commit_1mo` 138 | log "Downloading commit 1 month after 
patch: $cve $codebase $path $funcs $after_commit_1mo" 139 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/after/${after_commit_1mo}_${after_timestamp_1mo} 140 | git show $after_commit_1mo:$path > $VULN_PATCH_DIR/$codebase/$cve/after/${after_commit_1mo}_${after_timestamp_1mo}/${curr_file} 141 | fi 142 | 143 | if [ "$after_commit_6mo" != "" ]; then 144 | local after_timestamp_6mo=`git log -1 --pretty=format:"%at" $after_commit_6mo` 145 | log "Downloading commit 6 month after patch: $cve $codebase $path $funcs $after_commit_6mo" 146 | mkdir -p $VULN_PATCH_DIR/$codebase/$cve/after/${after_commit_6mo}_${after_timestamp_6mo} 147 | git show $after_commit_6mo:$path > $VULN_PATCH_DIR/$codebase/$cve/after/${after_commit_6mo}_${after_timestamp_6mo}/${curr_file} 148 | fi 149 | } 150 | 151 | # This function is called after the git show command is called 152 | # and the output is written to SCRATCH_FILE 153 | # This function is responsible for actually parsing the file lines 154 | # and downloading the relevant source code 155 | function process_commit_lines { 156 | local commit_hash=$1 157 | local codebase=$2 158 | local cve=$3 159 | 160 | local curr_file="" 161 | local curr_funcs="" 162 | local curr_vuln_id="" 163 | local curr_patch_id="" 164 | local path="" 165 | 166 | while read line; do 167 | if [ "`echo $line | grep "^diff --git"`" != "" ]; then 168 | log "Found diff line: $line" 169 | if [ "$curr_file" != "" ] && [ "$curr_funcs" != "" ]; then 170 | download_data $commit_hash $codebase $cve $path $curr_funcs $curr_vuln_id $curr_patch_id 171 | fi 172 | # Set new filename. If its not C/C++ file then curr_filew ill be blank 173 | local curr_file=`echo $line | grep -o -m 1 "[a-zA-Z0-9_]*\.c[pp]*" | uniq` 174 | local path=`echo $line | grep -o -m 1 "a\/[\/a-zA-Z0-9_-]*\.c[pp]*" | uniq` 175 | local path=${path:2} # remove the 'a/' from front 176 | local curr_funcs="" 177 | local curr_vuln_id="" 178 | local curr_patch_id="" 179 | if [ "$curr_file" != "" ]; then 180 | log "Parsing modification to file: $curr_file" 181 | fi 182 | elif [ "$curr_file" != "" ] && [ "`echo $line | grep -o "^index [a-z0-9]*\.\.[a-z0-9]*"`" != "" ]; then 183 | log "Found index line: $line" 184 | local curr_vuln_id=`echo $line | grep -o "^index [a-z0-9]*\.\.[a-z0-9]*" | awk '{print $2}' | tr '..' ' ' | awk '{print $1}'` 185 | local curr_patch_id=`echo $line | grep -o "^index [a-z0-9]*\.\.[a-z0-9]*" | awk '{print $2}' | tr '..' ' ' | awk '{print $2}'` 186 | elif [ "$curr_file" != "" ] && [ "$curr_vuln_id" != "" ] && [ "$curr_patch_id" != "" ] && [ "`echo $line | grep "@@" | grep -o "[a-zA-Z0-9_]*(" | sed "s/(//g" | uniq`" != "" ]; then 187 | log "Found function line: $line" 188 | local func_name="`echo $line | grep "@@" | grep -o "[a-zA-Z0-9_]*(" | sed "s/(//g" | tail -1`" # only get the last occurence in case there are some weird return types 189 | local curr_funcs="${curr_funcs}|${func_name}" # keep track of all 190 | else 191 | # Nothing important on this line.. 
skip it 192 | continue 193 | fi 194 | done < $SCRATCH_FILE 195 | 196 | # We still may need to write out a file 197 | if [ "$curr_file" != "" ] && [ "$curr_funcs" != "" ] && [ "$curr_vuln_id" != "" ] && [ "$curr_patch_id" != "" ]; then 198 | download_data $commit_hash $codebase $cve $path $curr_funcs $curr_vuln_id $curr_patch_id 199 | fi 200 | } 201 | 202 | function process_commit { 203 | local commit=$1 204 | local codebase=$2 205 | log "Processing commit: $codebase $commit" 206 | git show $commit > $SCRATCH_FILE 207 | local cve=`cat $SCRATCH_FILE | grep -o "CVE-[0-9]*-[0-9]*" | head -1` # Just get first reference to a CVE 208 | local num_files_covered=`grep "^diff --git" $SCRATCH_FILE | wc -l` 209 | if [ $num_files_covered -gt 5 ] || [ $num_files_covered -eq 0 ]; then 210 | log "ERROR: This commit covers $num_files_covered files. Skipping it." # Probably a merge 211 | let SKIPPED=$SKIPPED+1 212 | elif [ -d $VULN_PATCH_DIR/$codebase/$cve ]; then 213 | log "ERROR: This CVE already covered by previous commit. Skipping." # Probaly should change this... 214 | let SKIPPED=$SKIPPED+1 215 | else 216 | process_commit_lines $commit $codebase $cve 217 | let PROCESSED=$PROCESSED+1 218 | fi 219 | rm $SCRATCH_FILE 220 | } 221 | 222 | function process_commit_file { 223 | local commit_file=$1 224 | local repo_dir=$2 225 | local codebase=$3 226 | local commits=`cat $commit_file` 227 | 228 | echo "Processing commit file: $commit_file" 229 | 230 | cd $repo_dir # We have to change directory to use git commands 231 | for c in $commits; do 232 | process_commit $c $codebase $output_dir 233 | done 234 | cd - >/dev/null 235 | } 236 | 237 | function log { 238 | echo "$1" 239 | } 240 | 241 | function main { 242 | mkdir -p src_files 243 | cat /dev/null > $LOG_FILE # Reset log 244 | 245 | echo "Logging to $LOG_FILE..." 246 | 247 | for commit_file in `ls commits`; do 248 | process_commit_file $commit_file 249 | done 250 | log "Finished. 
Processed $PROCESSED, Skipped $SKIPPED" 251 | } 252 | -------------------------------------------------------------------------------- /src/graph/utils.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import os 3 | import pickle as pkl 4 | import csv 5 | 6 | 7 | def joern_edge_to_edgelist(edge_file): 8 | ''' converts an edges.csv file generated by Joern into a simple edgelist ''' 9 | edge_list = {} 10 | with open(edge_file, 'r') as csv_file: 11 | csv_reader = csv.reader(csv_file, delimiter='\t') 12 | first_line = True 13 | for row in csv_reader: 14 | # Skip first line 15 | if first_line: 16 | first_line = False 17 | continue 18 | if row[0] not in edge_list: 19 | edge_list[row[0]] = [ (row[1], row[2]) ] 20 | else: 21 | edge_list[row[0]].append((row[1], row[2])) 22 | 23 | return edge_list 24 | 25 | 26 | def joern_to_networkx(nodes_file, edge_file, func_names=None): 27 | ''' Converts a joern nodes.csv and edges.csv into a list of NetworkX graphs ''' 28 | 29 | edge_list = joern_edge_to_edgelist(edge_file) 30 | 31 | graphs = [] 32 | total_funcs_parsed = 0 33 | with open(nodes_file, 'r') as csv_file: 34 | csv_reader = csv.reader(csv_file, delimiter='\t') 35 | first_line = True 36 | processing_func = False 37 | curr_meta = {} 38 | for row in csv_reader: 39 | # Skip first line 40 | if first_line: 41 | first_line = False 42 | continue 43 | if row[2] == "Function": 44 | total_funcs_parsed += 1 45 | if processing_func: # New function so stop previous function processing 46 | # add edges 47 | for src_n in curr_meta['graph'].nodes(): 48 | if src_n in edge_list: 49 | for (dst_n, e_type) in edge_list[src_n]: 50 | curr_meta['graph'].add_edge(src_n, dst_n, type=e_type) 51 | graphs.append(curr_meta) 52 | processing_func = False 53 | curr_meta = {} 54 | 55 | # Found a new function 56 | # row[4] is function name 57 | # row[5] is function location in line_num:x:x:x 58 | if not func_names or row[3] in func_names: 59 | curr_meta['location'] = row[4] 60 | curr_meta['graph'] = nx.MultiDiGraph() 61 | curr_meta['name'] = row[3] 62 | processing_func = True 63 | else: 64 | # not a function start. so just see if we processing or not 65 | if processing_func: 66 | curr_meta['graph'].add_node(row[1]) # add node to graph 67 | curr_meta['graph'].node[row[1]]['type'] = row[2] 68 | curr_meta['graph'].node[row[1]]['code'] = row[3] 69 | curr_meta['graph'].node[row[1]]['functionId'] = row[5] 70 | # end of csv file 71 | # lets check to make sure we didnt end on a function we were processing 72 | if processing_func: 73 | # need to finish off this function 74 | # add edges 75 | for src_n in curr_meta['graph'].nodes(): 76 | if src_n in edge_list: 77 | for (dst_n, e_type) in edge_list[src_n]: 78 | curr_meta['graph'].add_edge(src_n, dst_n, type=e_type) 79 | graphs.append(curr_meta) 80 | processing_func = False 81 | # now we have processed both the nodes.csv and edges.csv for this source code file 82 | return graphs, total_funcs_parsed 83 | 84 | def tripleize(G): 85 | ''' Turns a graph into a set of code -> Relationship -> Code triples ''' 86 | G_trips=set([]) 87 | 88 | for n1, n2, k in G.edges(keys=True): 89 | if G.node[n1]['type'] in ['CFGEntryNode','CFGExitNode','ENTRY','EXIT']: 90 | continue 91 | if G.node[n2]['type'] in ['CFGEntryNode','CFGExitNode','ENTRY','EXIT']: 92 | continue 93 | relationship=G[n1][n2][k]['type'] 94 | # first add pure type relationships. 
This is most abstract form 95 | G_trips.add((G.node[n1]['type'],relationship,G.node[n2]['type'])) 96 | # One node set to concrete src code 97 | if G.node[n1]['code'] != '': 98 | G_trips.add((G.node[n1]['code'],relationship,G.node[n2]['type'])) 99 | # Other node set to concrete src code 100 | if G.node[n2]['code'] != '': 101 | G_trips.add((G.node[n1]['type'],relationship,G.node[n2]['code'])) 102 | # Both nodes set to concrete src code 103 | if G.node[n1]['code'] != '' and G.node[n2]['code'] != '': 104 | G_trips.add((G.node[n1]['code'],relationship,G.node[n2]['code'])) 105 | 106 | return G_trips 107 | 108 | 109 | def vectorize(G): 110 | ''' Converts a graph to a vector based on node and edge types. Can be used for quick filtering ''' 111 | vector_dims = [ 'FLOWS_TO','DECLARES','IS_CLASS_OF','REACHES','CONTROLS','DOM','POST_DOM','USE','DEF','IS_AST_PARENT','CallExpression','Callee','Function','ArgumentList','AssignmentExpr','File','IdentifierDeclStatement','Parameter','Symbol', 'PostIncDecOperationExpression', 'Identifier', 'IncDec', 'ExpressionStatement', 'AssignmentExpression', 'ArrayIndexing','IfStatement', 'Condition', 'AdditiveExpression', 'Argument' , 'PrimaryExpression', 'CastExpression', 'CastTarget', 'PtrMemberAccess','Statement', 'ReturnStatement', 'EqualityExpression', 'ElseStatement', 'ParameterType', 'ParameterList', 'SizeofExpression', 'IdentifierDeclType', 'UnaryOperator', 'MultiplicativeExpression', 'MemberAccess', 'FunctionDef', 'AndExpression', 'CFGEntryNode', 'UnaryOperationExpression', 'ForStatement', 'ForInit', 'ShiftExpression', 'ReturnType', 'Sizeof', 'BreakStatement', 'OrExpression', 'WhileStatement', 'SizeofOperand', 'IdentifierDecl', 'CompoundStatement', 'CFGExitNode', 'RelationalExpression', 'BitAndExpression','CFGErrorNode','ClassDef','ClassDefStatement','ConditionalExpression','ContinueStatement','Decl','DeclStmt','DoStatement','ExclusiveOrExpression','Expression','GotoStatement','InclusiveOrExpression','InitializerList','Label','SwitchStatement','UnaryExpression','InfiniteForNode'] 112 | vec = [0] * len(vector_dims) 113 | for n in G.nodes(): 114 | t = G.node[n]['type'] 115 | if t in vector_dims: 116 | vec[vector_dims.index(t)] += 1 117 | else: 118 | print("Missing node type: ", t) 119 | 120 | for (n1,n2) in G.edges(): 121 | for e in G[n1][n2]: # multi edges 122 | t = G[n1][n2][e]['type'] 123 | if t in vector_dims: 124 | vec[vector_dims.index(t)] += 1 125 | else: 126 | print("Missing edge type: ", t) 127 | return vec 128 | 129 | 130 | 131 | 132 | def load_vgraph_db(root): 133 | vgraph_db=[] 134 | for repo in os.listdir(root): 135 | for cve in os.listdir('/'.join([root,repo])): 136 | for hsh in os.listdir('/'.join([root,repo,cve])): 137 | for f in os.listdir('/'.join([root,repo,cve,hsh])): 138 | for func in os.listdir('/'.join([root,repo,cve,hsh,f])): 139 | if func.endswith('_pvg.pkl'): 140 | # Found vGraph 141 | func_root = str(func[:-len('_pvg.pkl')]) 142 | cvg=pkl.load(open(root + '/%s/%s/%s/%s/%s_%s'%(repo,cve,hsh,f,func_root,'cvg.pkl'),'rb')) 143 | pvg=pkl.load(open(root + '/%s/%s/%s/%s/%s_%s'%(repo,cve,hsh,f,func_root,'pvg.pkl'),'rb')) 144 | nvg=pkl.load(open(root + '/%s/%s/%s/%s/%s_%s'%(repo,cve,hsh,f,func_root,'nvg.pkl'),'rb')) 145 | v=pkl.load(open(root + '/%s/%s/%s/%s/%s_%s'%(repo,cve,hsh,f,func_root,'v.pkl'),'rb')) 146 | p=pkl.load(open(root + '/%s/%s/%s/%s/%s_%s'%(repo,cve,hsh,f,func_root,'p.pkl'),'rb')) 147 | vec=pkl.load(open(root + '/%s/%s/%s/%s/%s_%s'%(repo,cve,hsh,f,func_root,'vec.pkl'),'rb')) 148 | vgraph_db.append({ 149 | 'repo':repo, 150 | 
'cve':cve, 151 | 'hsh':hsh, 152 | 'file':f, 153 | 'func':func_root, 154 | 'cvg':cvg, 155 | 'pvg':pvg, 156 | 'nvg':nvg, 157 | 'v':v, 158 | 'p':p, 159 | 'vec':vec 160 | }) 161 | 162 | return vgraph_db 163 | 164 | def load_target_db(root, func_list=None): 165 | target_graph_db = [] 166 | for root, dirs, files in os.walk(root): 167 | for f in files: 168 | if f.endswith(".gpickle"): # this is a target graph 169 | base_name = f[:-len('.gpickle')] 170 | if func_list and base_name not in func_list: 171 | continue 172 | try: 173 | target_graph_db.append({ 174 | 'dir': root, 175 | 'base_name': base_name, 176 | 'path':"%s/%s" % (root, f), 177 | #'graph': nx.read_gpickle("%s/%s" % (root, f)), 178 | 'triples': pkl.load(open("%s/%s" % (root, base_name + '.triples'), 'rb')), 179 | 'vec': pkl.load(open("%s/%s" % (root, base_name + '.vec'), 'rb')) 180 | }) 181 | except: 182 | # error loading target. skip. 183 | continue 184 | return target_graph_db 185 | 186 | 187 | -------------------------------------------------------------------------------- /src/graph/vgraph.py: -------------------------------------------------------------------------------- 1 | # vGraph: 2 | # positive 3 | # negative 4 | # context 5 | 6 | # constructor should take a vulnerable graph and a patch graph and generate all internal representations 7 | import networkx as nx 8 | class VGraph: 9 | MIN_NODES=50 10 | 11 | def __init__(self, vuln_graph_file, patch_graph_file): 12 | self.v = nx.read_gpickle(vuln_graph_file) 13 | self.p = nx.read_gpickle(patch_graph_file) 14 | self.v_to_p, self.p_to_v = self.__align_graphs() 15 | 16 | self.positive = self.__gen_positive_vg() 17 | self.negative = self.__gen_negative_vg() 18 | self.context = self.__gen_context_vg() 19 | 20 | self.positive_index = self.__gen_index(self.positive) 21 | self.negative_index = self.__gen_index(self.negative) 22 | self.context_index = self.__gen_index(self.context) 23 | 24 | self.positive_imp_nodes = self.__gen_imp_nodes(self.positive) 25 | self.negative_imp_nodes = self.__gen_imp_nodes(self.negative) 26 | self.context_imp_nodes = self.__gen_imp_nodes(self.context) 27 | 28 | self.positive_bfs_trees = self.__gen_bfs_trees(self.positive_imp_nodes, self.positive) 29 | self.negative_bfs_trees = self.__gen_bfs_trees(self.negative_imp_nodes, self.negative) 30 | self.context_bfs_trees = self.__gen_bfs_trees(self.context_imp_nodes, self.context) 31 | 32 | 33 | def match(self, q, t, q_prepared, t_prepared): 34 | raise NotImplementedError 35 | 36 | def prepare_query(self, q): 37 | raise NotImplementedError 38 | 39 | def prepare_target(self, t): 40 | raise NotImplementedError 41 | 42 | def __align_graphs(self): 43 | src_graph = self.v 44 | dst_graph = self.p 45 | 46 | src_to_dst_mapping = {} 47 | dst_to_src_mapping = {} 48 | 49 | # First lets match CFGEntryNode, CFGExitNode, and FunctionDef to get our skeleton 50 | for src_node in src_graph.nodes: 51 | if src_graph.node[src_node]['type'] == 'CFGEntryNode': 52 | for dst_node in dst_graph.nodes: 53 | if dst_graph.node[dst_node]['type'] == 'CFGEntryNode': 54 | src_to_dst_mapping[src_node] = dst_node 55 | dst_to_src_mapping[dst_node] = src_node 56 | break 57 | elif src_graph.node[src_node]['type'] == 'CFGExitNode': 58 | for dst_node in dst_graph.nodes: 59 | if dst_graph.node[dst_node]['type'] == 'CFGExitNode': 60 | src_to_dst_mapping[src_node] = dst_node 61 | dst_to_src_mapping[dst_node] = src_node 62 | break 63 | elif src_graph.node[src_node]['type'] == 'FunctionDef': 64 | for dst_node in dst_graph.nodes: 65 | if 
dst_graph.node[dst_node]['type'] == 'FunctionDef': 66 | src_to_dst_mapping[src_node] = dst_node 67 | dst_to_src_mapping[dst_node] = src_node 68 | break 69 | 70 | # Now match all other nodes as best we can 71 | for src_node in src_graph.nodes: 72 | if src_node in src_to_dst_mapping.keys(): 73 | continue 74 | for dst_node in dst_graph.nodes: 75 | if dst_node in dst_to_src_mapping.keys(): 76 | continue 77 | 78 | if src_graph.node[src_node]['code'] == dst_graph.node[dst_node]['code'] and src_graph.node[src_node]['type'] == dst_graph.node[dst_node]['type'] and src_graph.in_degree(src_node) == dst_graph.in_degree(dst_node) and src_graph.out_degree(src_node) == dst_graph.out_degree(dst_node): 79 | src_to_dst_mapping[src_node] = dst_node 80 | dst_to_src_mapping[dst_node] = src_node 81 | break # to next src node 82 | 83 | return src_to_dst_mapping, dst_to_src_mapping 84 | 85 | def __gen_positive_vg(self): 86 | pvg = nx.DiGraph() 87 | for v_node in set(self.v.nodes).difference(set(self.v_to_p.keys())): 88 | pvg.add_node(v_node) 89 | pvg.node[v_node]['type'] = self.v.node[v_node]['type'] 90 | pvg.node[v_node]['code'] = self.v.node[v_node]['code'] 91 | pvg.node[v_node]['style'] = 'o' 92 | 93 | self.__add_edges(pvg, self.v) 94 | self.__connect_graph(pvg, self.v) 95 | while len(pvg.nodes) < self.MIN_NODES and len(pvg.nodes) < len(self.v.nodes): 96 | self.__expand_graph(pvg, self.v) 97 | 98 | return pvg 99 | 100 | 101 | def __gen_negative_vg(self): 102 | nvg = nx.DiGraph() 103 | # Add all nodes in P that were missing from V (i.e. added during patch) 104 | for p_node in set(self.p.nodes).difference(set(self.p_to_v.keys())): 105 | nvg.add_node(p_node) 106 | nvg.node[p_node]['type'] = self.p.node[p_node]['type'] 107 | nvg.node[p_node]['code'] = self.p.node[p_node]['code'] 108 | nvg.node[p_node]['style'] = 'o' 109 | 110 | self.__add_edges(nvg, self.p) 111 | self.__connect_graph(nvg, self.p) 112 | while len(nvg.nodes) < self.MIN_NODES and len(nvg.nodes) < len(self.p.nodes): 113 | self.__expand_graph(nvg, self.p) 114 | 115 | return nvg 116 | 117 | def __gen_context_vg(self): 118 | cvg = nx.DiGraph() 119 | for n in self.v_to_p: # These are all shared nodes 120 | if n in self.positive.nodes or self.v_to_p[n] in self.negative.nodes: 121 | # these nodes were added during expand_graph 122 | # skip them so we dont overlap (or should we overlap??) 
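# (these nodes already belong to the positive/negative graphs, so re-adding them here would make the three subgraphs overlap)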
123 | continue 124 | 125 | #context_graph.add_node(n) 126 | #context_graph.node[n]['type'] = V.node[n]['type'] 127 | #context_graph.node[n]['code'] = V.node[n]['code'] 128 | 129 | added=False 130 | for n2 in list(self.v.predecessors(n)) + list(self.v.successors(n)): 131 | if n2 in self.positive.nodes: 132 | # Found context node because it has edge into positive vGraph 133 | cvg.add_node(n) 134 | cvg.node[n]['type'] = self.v.node[n]['type'] 135 | cvg.node[n]['code'] = self.v.node[n]['code'] 136 | added=True 137 | break 138 | if added: 139 | continue # already added so just move on 140 | # otherwise lets check patch nodes 141 | 142 | for n2 in list(self.p.predecessors(self.v_to_p[n])) + list(self.p.successors(self.v_to_p[n])): 143 | if n2 in self.negative.nodes: 144 | # Found context node because it has edge into negative vGraph 145 | cvg.add_node(n) 146 | cvg.node[n]['type'] = self.v.node[n]['type'] 147 | cvg.node[n]['code'] = self.v.node[n]['code'] 148 | break 149 | 150 | self.__add_edges(cvg, self.v) 151 | self.__connect_graph(cvg, self.v) 152 | 153 | # Now we added some nodes, lets keep going until 154 | while len(cvg.nodes) < self.MIN_NODES: 155 | self.__expand_graph(cvg, self.v) 156 | 157 | return cvg 158 | 159 | def __gen_index(self, g): 160 | pass 161 | 162 | def __gen_imp_nodes(self, g): 163 | pass 164 | 165 | def __gen_bfs_trees(self, g, imp_nodes): 166 | pass 167 | 168 | # We want our graphs to remain connected, so we do that 169 | def __connect_graph(self,small_graph, big_graph): 170 | small_graph_undirected = small_graph.to_undirected() 171 | big_graph_undirected = big_graph.to_undirected() 172 | 173 | while not nx.is_connected(small_graph_undirected): 174 | 175 | # Get list of ccs 176 | ccs = list(nx.connected_components(small_graph_undirected)) 177 | 178 | # sort ccs based on size (first element is biggest cc) 179 | ccs.sort(key=len, reverse=True) 180 | 181 | # merge largest two ccs by shortest path between them 182 | sp = None 183 | for n1 in ccs[0]: 184 | for n2 in ccs[1]: 185 | # Find shortest path in big graph 186 | sp_n1_n2 = nx.shortest_path(big_graph_undirected, n1, n2) 187 | if sp is None or len(sp_n1_n2) < len(sp): 188 | sp = sp_n1_n2 189 | # now we know the shortest path from biggest_cc to other cc 190 | for n in sp: 191 | small_graph.add_node(n) 192 | small_graph.node[n]['type'] = big_graph.node[n]['type'] 193 | small_graph.node[n]['code'] = big_graph.node[n]['code'] 194 | 195 | # Need to add edges now so we know when to stop 196 | for n in sp: 197 | for neighbor in big_graph[n]: 198 | if neighbor in sp: 199 | small_graph.add_edge(n,neighbor) 200 | small_graph[n][neighbor]['type'] = big_graph[n][neighbor]['type'] 201 | 202 | # Update undirected version 203 | small_graph_undirected = small_graph.to_undirected() 204 | 205 | # Grow a graph by expanding to neighbors num_hops away 206 | def __expand_graph(self, small_graph, big_graph, num_hops=1): 207 | for _ in range(num_hops): 208 | # loop through all nodes in small graph and add neighbors 209 | 210 | small_nodes = list(small_graph.nodes)[:] 211 | 212 | for n in small_nodes: 213 | for neighbor in big_graph.successors(n): 214 | small_graph.add_node(neighbor) 215 | small_graph.node[neighbor]['type'] = big_graph.node[neighbor]['type'] 216 | small_graph.node[neighbor]['code'] = big_graph.node[neighbor]['code'] 217 | small_graph.add_edge(n, neighbor) 218 | small_graph[n][neighbor]['type'] = big_graph[n][neighbor]['type'] 219 | for neighbor in big_graph.predecessors(n): 220 | small_graph.add_node(neighbor) 221 | 
small_graph.node[neighbor]['type'] = big_graph.node[neighbor]['type'] 222 | small_graph.node[neighbor]['code'] = big_graph.node[neighbor]['code'] 223 | small_graph.add_edge(neighbor, n) 224 | small_graph[neighbor][n]['type'] = big_graph[neighbor][n]['type'] 225 | 226 | def __add_edges(self, graph_nodes_only, full_graph): 227 | # finish graph by adding relevant edges 228 | for (src, dst) in full_graph.edges(): 229 | if src in graph_nodes_only.nodes and dst in graph_nodes_only.nodes: 230 | graph_nodes_only.add_edge(src, dst) 231 | graph_nodes_only[src][dst]['type'] = full_graph[src][dst]['type'] 232 | 233 | -------------------------------------------------------------------------------- /src/graph/vgraph.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/graph/vgraph.pyc -------------------------------------------------------------------------------- /src/matching/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/__init__.py -------------------------------------------------------------------------------- /src/matching/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/__init__.pyc -------------------------------------------------------------------------------- /src/matching/__pycache__/exact_matcher.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/__pycache__/exact_matcher.cpython-37.pyc -------------------------------------------------------------------------------- /src/matching/exact_matcher.py: -------------------------------------------------------------------------------- 1 | from matcher import Matcher 2 | from networkx.algorithms import isomorphism 3 | import networkx as nx 4 | 5 | class ExactMatcher(Matcher): 6 | 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def match(self, q, t, q_prepared, t_prepared): 12 | GM = isomorphism.DiGraphMatcher(t,q, node_match=self.custom_node_match,edge_match=self.custom_edge_match) 13 | res = GM.subgraph_is_isomorphic() 14 | if res: 15 | return {}, 100 16 | else: 17 | return {}, 0 18 | 19 | 20 | def prepare_query(self, q): 21 | return q 22 | 23 | def prepare_target(self, t): 24 | return t 25 | 26 | 27 | def custom_node_match(self,n1, n2): 28 | if n1['type'] == n2['type']: 29 | return True 30 | else: 31 | return False 32 | 33 | def custom_edge_match(self,e1, e2): 34 | if e1['type'] == e2['type']: 35 | return True 36 | else: 37 | return False 38 | -------------------------------------------------------------------------------- /src/matching/exact_matcher.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/exact_matcher.pyc -------------------------------------------------------------------------------- /src/matching/matcher.py: -------------------------------------------------------------------------------- 1 | # Base class for vGraph matching algs 2 | 3 | # Typically there is some type of preprocessing that must be applied to 4 | # all 
target and query graphs. This is captured in the prepare function. 5 | 6 | # Then the match function takes a query and target graph and actually 7 | # performs matching. In some cases it may assume that prepare has already 8 | # been called on the data. 9 | class Matcher: 10 | 11 | def __init__(self): 12 | pass 13 | 14 | def match(self, q, t, q_prepared, t_prepared): 15 | raise NotImplementedError 16 | 17 | def prepare_query(self, q): 18 | raise NotImplementedError 19 | 20 | def prepare_target(self, t): 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /src/matching/matcher.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/matcher.pyc -------------------------------------------------------------------------------- /src/matching/path_matcher.py: -------------------------------------------------------------------------------- 1 | from matcher import Matcher 2 | 3 | import networkx as nx 4 | import copy 5 | 6 | DEBUG=False 7 | 8 | class PathMatcher(Matcher): 9 | 10 | def match(self, q, t, prepared_q_data, prepared_t_data): 11 | q_index, q_imp_nodes, q_imp_bfs = prepared_q_data 12 | t_index = prepared_t_data 13 | #for n in q_imp_nodes: 14 | # for q in q_index: 15 | # if q not in q_imp_bfs_f[n] and q not in q_imp_bfs_r[n]: 16 | # print "WTF somehow node not in any bfs...%s" % q 17 | 18 | if DEBUG: 19 | print "=============Important nodes=============" 20 | for node in q_imp_nodes: 21 | print node 22 | print q.node[node] 23 | #imp_node_matches = self.__match_imp_nodes_2(q_index, t_index, q_imp_nodes) 24 | imp_node_matches = self.__match_imp_nodes_3(q, t, q_index, t_index, q_imp_nodes) 25 | matches_sorted = imp_node_matches 26 | #for blah in reversed(sorted(imp_node_matches.iteritems(), key=lambda (k, v):q_index[k]['degree'])): 27 | # matches_sorted.append((blah[0], blah[1][0], blah[1][1])) 28 | 29 | if DEBUG: 30 | print "Important node matching results:" 31 | print matches_sorted[:3] 32 | for (a,b,c,d) in matches_sorted[:3]: 33 | print a 34 | print q.node[a] 35 | print b 36 | print t.node[b] 37 | # Let's try each one and choose the best! 38 | best_mapping = {} 39 | best_score = 0. 40 | for (imp_q, imp_t, _, imp_score) in matches_sorted[:1]:# cap it at 3 attempts to get an importat node right. Otherwise it can take just too long depending on size of graph 41 | # Forward BFS tree for this important node (precomputed) 42 | q_bfs_tree = q_imp_bfs[imp_q] 43 | 44 | q_depth_map = {} 45 | max_depth = self.__get_depth(q_bfs_tree, imp_q, q_depth_map) 46 | 47 | for n in q_index: 48 | q_index[n]['height'] = q_depth_map[n] 49 | 50 | # do same for target graph i guess? 51 | # This seems too expensive... 
52 | #T_bfs_tree = nx.bfs_tree(T.to_undirected(), imp_t) 53 | #if DEBUG: 54 | # print "bfs tree: %d" % len(T_bfs_tree) 55 | #T_weight_map = {} 56 | #max_weight = get_depth(T_bfs_tree, imp_t, T_weight_map) 57 | #for n in T.nodes: 58 | # T_index[n]['height'] = T_weight_map[n] 59 | #if DEBUG: 60 | # print "length of weight map: %d" % len(T_weight_map.keys()) 61 | # print "Max weight: %d" % max_weight 62 | 63 | 64 | # Step 5: Set important node match, and start growing 65 | current_match = {} 66 | current_match[imp_q] = (imp_t, imp_score) 67 | score = self.__grow_match(q, t, q_bfs_tree, q_index, t_index, current_match, (imp_q,imp_t)) 68 | score += imp_score# for our important node 69 | if score > best_score: 70 | best_score = score 71 | best_mapping = current_match 72 | 73 | final_score = (best_score / (1.0*len(q_index))) 74 | return best_mapping, final_score 75 | 76 | def prepare_query(self, q): 77 | # Step 1: Generate index 78 | q_index = {} 79 | for n in q.nodes: 80 | q_index[n] = self.__gen_node_index(q, n) 81 | 82 | # step 2: Identify important nodes 83 | q_imp_nodes = self.__get_important_nodes(q) 84 | 85 | # Step 3: for each important node, build forward and reverse BFS trees 86 | # No just do 1 bfs tree. This is the guide for the match. the directionality of the edges 87 | # will be considered during matching. i dont care if bfs tree node consists of forward/rev 88 | # edges. all i care is that below it there is a bunch of nodes so i want to make sure 89 | # to give that node priority over others. will 90 | q_imp_bfs = {} 91 | for n in q_imp_nodes: 92 | q_imp_bfs[n] = nx.bfs_tree(q.to_undirected(), n) 93 | 94 | 95 | 96 | return (q_index, q_imp_nodes, q_imp_bfs) 97 | 98 | def prepare_target(self, t): 99 | t_index = {} 100 | for n in t.nodes: 101 | t_index[n] = self.__gen_node_index(t, n) 102 | 103 | return t_index 104 | 105 | def __gen_node_index(self, g, n): 106 | d = {} 107 | d['label'] = g.node[n]['type'] 108 | d['degree'] = g.degree(n) 109 | d['in_degree'] = g.in_degree(n) 110 | d['out_degree'] = g.out_degree(n) 111 | if 'code' in g.node[n]: 112 | d['code'] = g.node[n]['code'] 113 | else: 114 | d['code'] = '' 115 | 116 | # Look at neighborhood 117 | d['in_nbArray'] = set([]) 118 | d['in_edgeArray'] = set([]) 119 | d['out_nbArray'] = set([]) 120 | d['out_edgeArray'] = set([]) 121 | for succ in g.successors(n): 122 | d['out_nbArray'].add(g.node[succ]['type']) 123 | d['out_edgeArray'].add(g[n][succ]['type']) 124 | 125 | for pred in g.predecessors(n): 126 | d['in_nbArray'].add(g.node[pred]['type']) 127 | d['in_edgeArray'].add(g[pred][n]['type']) 128 | 129 | return d 130 | 131 | def __get_important_nodes(self, G, p=0.1): 132 | # import based on degree 133 | node_degree_dict = {} 134 | nodes_to_return = int(len(G.nodes) * p) 135 | for n in G.nodes: 136 | node_degree_dict[n] = G.degree(n) 137 | 138 | important_nodes = [] 139 | 140 | for node_id, degree in reversed(sorted(node_degree_dict.iteritems(), key=lambda (k,v): (v,k))): 141 | important_nodes.append(node_id) 142 | nodes_to_return -= 1 143 | 144 | if nodes_to_return <= 0: 145 | break 146 | 147 | return important_nodes 148 | 149 | 150 | def __match_imp_nodes(self, q_index, t_index, imp_nodes): 151 | all_mappings = nx.Graph() 152 | for n in imp_nodes: 153 | for t in t_index.keys(): 154 | score = self.__match_node(q_index[n], t_index[t]) 155 | if score > 0.0: 156 | all_mappings.add_edge('Q_%s'% n, 'T_%s'%t) 157 | all_mappings['Q_%s'%n]['T_%s'%t]['weight'] = score 158 | 159 | if DEBUG: 160 | print "All mappings before max_weight_matching" 
161 | print all_mappings.nodes() 162 | for (a,b) in all_mappings.edges(): 163 | print "%s %s" % (a, b) 164 | print all_mappings[a][b] 165 | 166 | max_weight_mapping = nx.max_weight_matching(all_mappings) 167 | 168 | final_mapping = {} 169 | for n in imp_nodes: 170 | if 'Q_%s'%n in max_weight_mapping.keys(): 171 | target = max_weight_mapping['Q_%s'%n] 172 | final_mapping[n] = (target[2:], all_mappings['Q_%s'%n][target]['weight']) 173 | 174 | return final_mapping 175 | 176 | # Ok new try. This time were going to not matching important nodes. were matching important 177 | # NEIGHBORHOOD. This is defined as the 1-hop neighborhood with nighest combined degree 178 | def __match_imp_nodes_3(self, q_graph, t_graph, q_index, t_index, imp_nodes): 179 | # For each important node: 180 | # Find match in target 181 | # loop through query and target neighbors 182 | # compute matches 183 | # score of this important node is based on important node and neighbhood matches 184 | all_mappings = [] 185 | for n in imp_nodes: 186 | for t in t_index.keys(): 187 | score = self.__match_node(q_index[n], t_index[t]) 188 | score_only_one = score 189 | if score > 0.0: # possible match 190 | q_neibs = set(list(q_graph.predecessors(n))).union(set(list(q_graph.successors(n)))) 191 | t_neibs = set(list(t_graph.predecessors(t))).union(set(list(t_graph.successors(t)))) 192 | matching_n = nx.Graph() 193 | for qn in q_neibs: 194 | for tn in t_neibs: 195 | score_nb = self.__match_node(q_index[qn], t_index[tn]) 196 | if score_nb > 0.0: 197 | matching_n.add_edge('Q_%s'%qn, 'T_%s'%tn) 198 | matching_n['Q_%s'%qn]['T_%s'%tn]['weight'] = score_nb 199 | # now we have neighborhood matching 200 | max_weight_mapping = nx.max_weight_matching(matching_n) 201 | for qn in max_weight_mapping: 202 | tn = max_weight_mapping[qn] 203 | score += matching_n[qn][tn]['weight'] # get the original weight 204 | all_mappings.append((n,t,score, score_only_one)) # total neighborhood match score 205 | 206 | # Now we sort and return... 207 | # I think i want to sort this on score now... 208 | sorted_mappings = [ (a, b, c,d) for (a, b, c,d) in reversed(sorted(all_mappings, key=lambda (q,t,s,soo):s))] 209 | if DEBUG: 210 | print "==================Sorted mappings==============" 211 | for sm in sorted_mappings: 212 | print sm 213 | 214 | return sorted_mappings 215 | 216 | # This function is broken. It needs to be more robust when matching important nodes 217 | # If thanything, this should take the MOST time becuase an error here can be very bad for rest of 218 | # matching algorithm 219 | def __match_imp_nodes_2(self, q_index, t_index, imp_nodes): 220 | all_mappings = [] 221 | for n in imp_nodes: 222 | for t in t_index.keys(): 223 | score = self.__match_node(q_index[n], t_index[t]) 224 | if score > 0.0: # found a possible match 225 | all_mappings.append((n, t, score)) 226 | # now lets sort 227 | 228 | sorted_mappings = [ (a, b, c) for (a, b, c) in reversed(sorted(all_mappings, key=lambda (q,t,s):(q_index[q]['degree'],s)))] 229 | if DEBUG: 230 | print "==================Sorted mappings==============" 231 | for sm in sorted_mappings: 232 | print sm 233 | 234 | return sorted_mappings 235 | 236 | def __match_node(self, q_index, t_index): 237 | # Returns match score [0.0,1.0] 238 | # 0.0 = no match 239 | # >0.0 means partial match 240 | 241 | r = 1. # threshold below which we don't consider it a partial match 242 | #r = 0.0 # percentage of allowed difference from query node 243 | 244 | if q_index['label'] != t_index['label']: 245 | return 0. 
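# Scoring idea for the rest of this function: total up the query node's in/out degree and the
# sizes of its neighbor/edge label sets, subtract the mismatches observed against the target
# node, and normalize to [0,1]; anything below the threshold r is treated as no match.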
246 | 247 | # Minimum of , or absolute value of difference. At most different by 248 | if q_index['in_degree'] < t_index['in_degree']: 249 | in_degree_delta = 0. 250 | else: 251 | # cap difference at most q_index['in_degree'] 252 | in_degree_delta = min(q_index['in_degree'], abs(q_index['in_degree'] - t_index['in_degree'])) 253 | 254 | if q_index['out_degree'] < t_index['out_degree']: 255 | out_degree_delta = 0. 256 | else: 257 | out_degree_delta = min(q_index['out_degree'], abs(q_index['out_degree'] - t_index['out_degree'])) 258 | 259 | # These will contain neighbors/edges that are not covered in target 260 | in_nbArray_delta = q_index['in_nbArray'].difference(q_index['in_nbArray'].intersection(t_index['in_nbArray'])) 261 | out_nbArray_delta = q_index['out_nbArray'].difference(q_index['out_nbArray'].intersection(t_index['out_nbArray'])) 262 | in_edgeArray_delta = q_index['in_edgeArray'].difference(q_index['in_edgeArray'].intersection(t_index['in_edgeArray'])) 263 | out_edgeArray_delta = q_index['out_edgeArray'].difference(q_index['out_edgeArray'].intersection(t_index['out_edgeArray'])) 264 | 265 | #if q_index['in_degree'] > t_index['in_degree'] + int(r*q_index['in_degree']): 266 | # return 0. 267 | #if in_degree_delta > int(r*q_index['in_degree']): 268 | # return 0. 269 | 270 | #if q_index['out_degree'] > t_index['out_degree'] + int(r*q_index['out_degree']): 271 | # return 0. 272 | #if out_degree_delta > int(r*q_index['out_degree']): 273 | # return 0. 274 | 275 | #if len(in_nbArray_delta) > int(r*len(q_index['in_nbArray'])): 276 | # return 0. 277 | 278 | #if len(out_nbArray_delta) > int(r*len(q_index['out_nbArray'])): 279 | # return 0. 280 | 281 | #if len(in_edgeArray_delta) > int(r*len(q_index['in_edgeArray'])): 282 | # return 0. 283 | 284 | #if len(out_edgeArray_delta) > int(r*len(q_index['out_edgeArray'])): 285 | # return 0. 286 | 287 | #if DEBUG: 288 | # print "Matched nodes:" 289 | # print q_index 290 | # print t_index 291 | #if r == 0.0: 292 | # return 1.0 293 | 294 | # if we get here, then were good 295 | # If total mismatch, this would be our score 296 | total_possible_score = 0. 297 | total_possible_score += q_index['in_degree'] 298 | total_possible_score += q_index['out_degree'] 299 | total_possible_score += len(q_index['in_nbArray']) 300 | total_possible_score += len(q_index['out_nbArray']) 301 | total_possible_score += len(q_index['in_edgeArray']) 302 | total_possible_score += len(q_index['out_edgeArray']) 303 | 304 | # Compute final score as total mismatch score - actual score. If actual score is total mismatch, we get 0. If actual score is 0 (i.e. 
perfect match), then we get 1 305 | final_score = (total_possible_score - float(in_degree_delta + out_degree_delta + len(in_nbArray_delta)+len(out_nbArray_delta)+len(in_edgeArray_delta)+len(out_edgeArray_delta))) / total_possible_score 306 | if final_score < r: 307 | return 0.0 308 | else: 309 | return final_score 310 | 311 | def __get_depth(self, bfs_tree, node_id, depth_map): 312 | children = bfs_tree[node_id] 313 | 314 | if len(children) == 0: 315 | depth_map[node_id] = 1 316 | return 1 317 | else: 318 | weight = 1 # for current node 319 | for c in children: 320 | c_subtree = self.__get_depth(bfs_tree, c, depth_map) 321 | weight += c_subtree 322 | depth_map[node_id] = weight 323 | return weight 324 | 325 | def __grow_match(self, q, t, q_bfs_tree, q_index, t_index, current_match, match_root, recursion_id = ''): 326 | 327 | # Step 1: Get neighbors of the match root in both query and target graphs 328 | q_root = match_root[0] 329 | t_root = match_root[1] 330 | if DEBUG: 331 | print "%sGrowing match from root: (%s = > %s)" % (recursion_id, q_root, t_root) 332 | 333 | # BFS tree will direct what nodes we use next 334 | q_root_nbors_f = set(q_bfs_tree.neighbors(q_root)).intersection(set(q.successors(q_root))).difference(current_match.keys()) 335 | q_root_nbors_r = set(q_bfs_tree.neighbors(q_root)).intersection(set(q.predecessors(q_root))).difference(current_match.keys()) 336 | t_root_nbors_f = set(t.successors(t_root)).difference(set([a for (a,b) in current_match.values()])) 337 | t_root_nbors_r = set(t.predecessors(t_root)).difference(set([a for (a,b) in current_match.values()])) 338 | 339 | if (len(q_root_nbors_f) == 0 and len(q_root_nbors_r) == 0) or (len(t_root_nbors_f) == 0 and len(t_root_nbors_r) == 0): 340 | if DEBUG: 341 | print "%sNo neighbors. End of match path" % (recursion_id) 342 | return 0.0 343 | 344 | # Step 2: Find ALL potential matchs of Query => Target. A potential match is any match with a score > 0.0 345 | potential_matches_found = 0 346 | potential_matches = {} 347 | for q_nb in q_root_nbors_f.union(q_root_nbors_r): 348 | potential_matches[q_nb] = {} 349 | 350 | # Find potential matches for forward neighbors 351 | for q_nb_f in q_root_nbors_f: 352 | for t_nb_f in t_root_nbors_f: 353 | score = self.__match_node(q_index[q_nb_f], t_index[t_nb_f]) 354 | if score > 0.0: 355 | potential_matches[q_nb_f][t_nb_f] = score 356 | potential_matches_found +=1 357 | 358 | # Find potential matches for backward neighbors 359 | for q_nb_r in q_root_nbors_r: 360 | for t_nb_r in t_root_nbors_r: 361 | score = self.__match_node(q_index[q_nb_r], t_index[t_nb_r]) 362 | if score > 0.0: 363 | potential_matches[q_nb_r][t_nb_r] = score 364 | potential_matches_found +=1 365 | 366 | # Another base-case condition checked here 367 | if potential_matches_found == 0: 368 | if DEBUG: 369 | print "%sNo potential matches found. End of match path" % (recursion_id) 370 | return 0.0 371 | 372 | if DEBUG: 373 | print "%sPotential Matches:" % (recursion_id) 374 | print potential_matches 375 | 376 | # Step 3: At this point we know we have some potential matches to score 377 | # - Sort query matches based on height parameter (i.e., nodes with most children go first) 378 | # - For every target node that matched with the query node 379 | # - set match root as Q=>T and grow match from that root 380 | 381 | total_score = 0.0 # This is the total score of all matches below the current match root. 
This value gets popped up the call stack to previous caller 382 | for q_nb in reversed(sorted(list(potential_matches), key=lambda x: q_index[x]['height'])): # priority by height of node 383 | if DEBUG: 384 | print "%sMatching query node: %s, weight: %d" % (recursion_id, q_nb, q_index[q_nb]['height']) 385 | print q.node[q_nb] 386 | best_match = None 387 | for t_nb in set(potential_matches[q_nb].keys()).difference(set([a for (a,b) in current_match.values()])): # only first one cuz why not?? 388 | # Found a potential match. Lets follow the path and see where it leads 389 | if DEBUG: 390 | print "%sFollowing match path: %s ==> %s" % (recursion_id,q_nb, t_nb) 391 | # Make deep copy of current_match dict 392 | current_match_copy = copy.deepcopy(current_match) 393 | # Set the new match root with potential match score 394 | current_match_copy[q_nb] = (t_nb, potential_matches[q_nb][t_nb]) 395 | # Recursive call 396 | score = self.__grow_match(q, t,q_bfs_tree, q_index, t_index, current_match_copy, (q_nb, t_nb), recursion_id=recursion_id+'++') 397 | if DEBUG: 398 | print "%sPotential %s ==> %s resulted in path score of %f" % (recursion_id, q_nb, t_nb, score) 399 | if best_match is None or score > best_match[2]: 400 | # TODO Problem here: if there are multiple best_matches with same score... 401 | # this algo just takes first one 402 | 403 | # need to keep track of all matches with same score 404 | best_match = (q_nb, t_nb, score, copy.deepcopy(current_match_copy)) # deep copy again? not sure if thats necesary 405 | 406 | if score >= q_index[q_nb]['height']: 407 | # we matched the full path for this query node. we can move onto next query node 408 | if DEBUG: 409 | print "%sFull path match. Breaking" % (recursion_id) 410 | break 411 | 412 | 413 | # No matching node found for this query node. Move on to next q_nb 414 | if not best_match: 415 | if DEBUG: 416 | print "Could not find a match for query node: %s" % q_nb 417 | continue 418 | 419 | # Step 4: We have a match for our q_nb. 
Now we need to update our current_match dict with all matches 420 | # that occured while matching that neighbor (could be an entire match path) 421 | if DEBUG: 422 | print "%sBest match: %s ==> %s (%f)" % (recursion_id, q_nb, best_match[1], best_match[2]) 423 | print q.node[q_nb] 424 | print t.node[best_match[1]] 425 | # Update current match based on best path match 426 | for n in best_match[3]: 427 | if n not in current_match: 428 | current_match[n] = best_match[3][n] 429 | # Set the best potential match as a real match 430 | current_match[q_nb] = (best_match[1], potential_matches[q_nb][best_match[1]]) 431 | 432 | # Update total score for this match root and potential matched node 433 | total_score += (best_match[2] + potential_matches[q_nb][best_match[1]]) 434 | 435 | # Onto the next q_nb 436 | 437 | # At this point we have evaluated each q_nb of match root for a t_nb of match root 438 | # We have chosen the best scoring match for each q_nb, ordered by how selective 439 | # that q_nb is in the total Q graph 440 | 441 | # Sanity check: We know that the total_score should not be more than the height of 442 | 443 | # The total score below represents the summation of all q_nb match paths 444 | if DEBUG: 445 | print "%sTotal score: %f" % (recursion_id, total_score) 446 | 447 | 448 | return total_score 449 | 450 | 451 | 452 | 453 | 454 | if __name__ == "__main__": 455 | # Simple graph test 456 | G = nx.DiGraph() 457 | G.add_edge('1','2') 458 | G.add_edge('2','3') 459 | G.add_edge('3','4') 460 | G.add_edge('1','5') 461 | G.add_edge('5','6') 462 | G.add_edge('1','7') 463 | G.node['1']['type'] = 'one' 464 | G.node['2']['type'] = 'two' 465 | G.node['3']['type'] = 'three' 466 | G.node['4']['type'] = 'four' 467 | G.node['5']['type'] = 'two' 468 | G.node['6']['type'] = 'three' 469 | G.node['7']['type'] = 'two' 470 | G['1']['2']['type'] = 'edge_1_2' 471 | G['2']['3']['type'] = 'edge_2_3' 472 | G['3']['4']['type'] = 'edge_3_4' 473 | G['1']['5']['type'] = 'edge_1_2' 474 | G['5']['6']['type'] = 'edge_2_3' 475 | G['1']['7']['type'] = 'edge_1_2' 476 | 477 | H = G.copy() 478 | H.node['4']['type'] = '2' 479 | 480 | # Create Pathmatcher 481 | pm = PathMatcher() 482 | # Prepare data 483 | prepared_q_data = pm.prepare_query(G) 484 | prepared_t_data = pm.prepare_target(H) 485 | # Perform matching 486 | results = pm.match(G, H, prepared_q_data, prepared_t_data) 487 | print results 488 | exit() 489 | 490 | #match(G, G) 491 | 492 | #exit() 493 | #FN vgraph_db/linux/CVE-2008-5033/tvaudio.c/chip_command vuln_src_db/vuln_patch_graph_db/linux/CVE-2008-5033/vuln/tvaudio.c/graph/chip_command.gpickle 0 76 90 494 | #FN vgraph_db/ffmpeg/CVE-2014-8547/gifdec.c/gif_read_image vuln_src_db/vuln_patch_graph_db/ffmpeg/CVE-2014-8547/vuln/gifdec.c/graph/gif_read_image.gpickle 0 0 50 495 | #FN vgraph_db/openssl/CVE-2015-1793/x509_vfy.c/X509_verify_cert vuln_src_db/vuln_patch_graph_db/openssl/CVE-2015-1793/vuln/x509_vfy.c/graph/X509_verify_cert.gpickle 0 0 0 496 | #FN vgraph_db/ffmpeg/CVE-2012-2775/alsdec.c/read_var_block_data vuln_src_db/vuln_patch_graph_db/libav/CVE-2012-2775/vuln/alsdec.c/graph/read_var_block_data.gpickle 2 0 0 497 | 498 | 499 | 500 | 501 | 502 | vGraph_context = nx.read_gpickle('vgraph_db/ffmpeg/CVE-2012-2775/alsdec.c/read_var_block_data_context.gpickle') 503 | vGraph_pos = nx.read_gpickle('vgraph_db/ffmpeg/CVE-2012-2775/alsdec.c/read_var_block_data_pvg.gpickle') 504 | vGraph_neg = nx.read_gpickle('vgraph_db/ffmpeg/CVE-2012-2775/alsdec.c/read_var_block_data_nvg.gpickle') 505 | 506 | V = 
nx.read_gpickle('vuln_src_db/vuln_patch_graph_db/ffmpeg/CVE-2012-2775/vuln/alsdec.c/graph/read_var_block_data.gpickle') 507 | P = nx.read_gpickle('vuln_src_db/vuln_patch_graph_db/ffmpeg/CVE-2012-2775/patch/alsdec.c/graph/read_var_block_data.gpickle') 508 | 509 | mapping, score = match(vGraph_neg, V) 510 | 511 | print "Lenght of mapping:" 512 | print len(mapping) 513 | print "Score:" 514 | print score * 100 515 | 516 | exit() 517 | 518 | v_to_p_mapping, _ = heuristic_match(V, P) 519 | pos_imp_nodes = set(V.nodes).difference(set(v_to_p_mapping.keys())) 520 | neg_imp_nodes = set(P.nodes).difference(set(v_to_p_mapping.values())) 521 | expanded = set([]) 522 | for n in neg_imp_nodes: 523 | expanded.add(n) 524 | for n in P.neighbors(n): 525 | expanded.add(n) 526 | 527 | neg_imp_nodes = expanded 528 | print "Num + nodes: %d" % len(pos_imp_nodes) 529 | print "Num - nodes: %d" % len(neg_imp_nodes) 530 | 531 | 532 | # perform match V against V 533 | mapping = match(P, V) 534 | 535 | # lets see how many pos imp nodes matched 536 | matches = 0 537 | for n in neg_imp_nodes: 538 | if n in mapping: 539 | matches += 1 540 | print "Neg Imp Nodes: %d / %d" % (matches, len(neg_imp_nodes)) 541 | -------------------------------------------------------------------------------- /src/matching/path_matcher.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/path_matcher.pyc -------------------------------------------------------------------------------- /src/matching/tale_matcher.py: -------------------------------------------------------------------------------- 1 | from matcher import Matcher 2 | 3 | import networkx as nx 4 | 5 | 6 | class TaleMatcher(Matcher): 7 | 8 | def __init__(self, p=0.2, r=0.1): 9 | self.p=p 10 | self.r=r 11 | 12 | def match(self, q, t, prepared_q_data, prepared_t_data): 13 | q_nh_idx, important_nodes = prepared_q_data 14 | t_nh_idx = prepared_t_data 15 | 16 | # Match the important nodes to our target graph 17 | weight_mappings = self.__match_imp_nodes(q_nh_idx, t_nh_idx, important_nodes) 18 | 19 | # Grow the match 20 | res = self.__grow_match(q, t, q_nh_idx, t_nh_idx, weight_mappings) 21 | 22 | # Calculate overall match score 23 | # Each match can have a weight (aka score) of at most 2 24 | total_possible_score = 2 * len(q.nodes) 25 | achieved_score = 0. 
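# res maps each matched query node to a (target node, weight) pair produced during __grow_match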
26 | for n in res: 27 | achieved_score += res[n][1] 28 | 29 | overall_score = float(achieved_score) / float(total_possible_score) 30 | 31 | return res, overall_score 32 | 33 | def prepare_target(self, t): 34 | t_nh_idx = self.__generate_nh_index(t) 35 | return t_nh_idx 36 | 37 | def prepare_query(self, q): 38 | q_nh_idx = self.__generate_nh_index(q) 39 | important_nodes = self.__find_important_nodes(q) 40 | return (q_nh_idx, important_nodes) 41 | 42 | 43 | def __match_imp_nodes(self, q_index, t_index, imp_nodes): 44 | all_mappings = nx.Graph() 45 | for n in imp_nodes: 46 | for t in t_index.keys(): 47 | score = self.__match_node(q_index[n], t_index[t]) 48 | if score > 0.0: 49 | all_mappings.add_edge('Q_%s'% n, 'T_%s'%t) 50 | all_mappings['Q_%s'%n]['T_%s'%t]['weight'] = score 51 | 52 | max_weight_mapping = nx.max_weight_matching(all_mappings) 53 | 54 | final_mapping = {} 55 | for n in imp_nodes: 56 | if 'Q_%s'%n in max_weight_mapping.keys(): 57 | target = max_weight_mapping['Q_%s'%n] 58 | final_mapping[n] = (target[2:], all_mappings['Q_%s'%n][target]['weight']) 59 | 60 | return final_mapping 61 | 62 | def __grow_match(self, q, t, q_nh_idx, t_nh_idx, weight_mapping): 63 | processing_queue = [] 64 | final_results = {} 65 | # Add our important nodes to the processing queue 66 | # we build weight_mapping_helper which will make it easier to sort 67 | weight_mapping_helper = {} 68 | for n in weight_mapping: 69 | weight_mapping_helper[n] = weight_mapping[n][1] # This is the score 70 | 71 | # Add to processing_queue with highest scores first (FIFO) 72 | for n, v in reversed(sorted(weight_mapping_helper.iteritems(), key=lambda(k,v):(v,k))): 73 | # This is adding our 1-1 mapping of important nodes to processing queue 74 | processing_queue.append((n, weight_mapping[n][0])) 75 | 76 | while len(processing_queue) > 0: 77 | # adding single match to final result 78 | (n_query, n_target) = processing_queue.pop(0) 79 | final_results[n_query] = (n_target, weight_mapping[n_query][1]) 80 | 81 | # Need to check neighbors of n_query not yet matched 82 | nb_query = set(q.neighbors(n_query)).difference(set(final_results.keys())) 83 | 84 | # Get all nodes 2 hops away, not in our final results 85 | nb_query_2_hops = [] 86 | for x in q.neighbors(n_query): 87 | nb_query_2_hops = set(nb_query_2_hops).union(set(q.neighbors(x))) 88 | nb_query_2_hops = set(nb_query_2_hops).difference(set(nb_query)).difference(set(final_results.keys())) 89 | 90 | # Get all target nodes that are neighbors of last matching db node which are not 91 | # in final result or processing queue 92 | nb_target = set(t.neighbors(n_target)).difference(set([a for (a,b) in final_results.values()])).difference(set([b for (a,b) in processing_queue])) 93 | 94 | # target nodes that are 2 hops away 95 | nb_target_2_hops = [] 96 | for x in t.neighbors(n_target): 97 | nb_target_2_hops = set(nb_target_2_hops).union(set(t.neighbors(x))) 98 | nb_target_2_hops = set(nb_target_2_hops).difference(set(nb_target)).difference(set([a for (a,b) in final_results.values()])).difference(set([b for (a,b) in processing_queue])) 99 | 100 | self.__match_nodes(q_nh_idx, t_nh_idx, nb_query, nb_target, processing_queue, weight_mapping) 101 | self.__match_nodes(q_nh_idx, t_nh_idx, nb_query, nb_target_2_hops, processing_queue, weight_mapping) 102 | self.__match_nodes(q_nh_idx, t_nh_idx, nb_query_2_hops, nb_target, processing_queue, weight_mapping) 103 | 104 | return final_results 105 | 106 | 107 | def __match_node(self, query_nhi, target_nhi): 108 | '''This function will simultaneously 
match and score two neighborhood indices''' 109 | # IV.1 from paper 110 | if query_nhi['label'] != target_nhi['label']: 111 | return 0. 112 | 113 | # Compute allowed neighbor mismatch (nb_miss in paper) 114 | nb_allowed_misses = int(self.r * query_nhi['degree']) 115 | 116 | 117 | # Compute allowed neighbor connection missmatch (nbc_miss in paper) 118 | nbc_allowed_misses = nb_allowed_misses * ((nb_allowed_misses-1)/2) + (query_nhi['degree'] - nb_allowed_misses) * nb_allowed_misses 119 | 120 | # IV.2 from paper 121 | if target_nhi['degree'] < query_nhi['degree'] - nb_allowed_misses: 122 | return 0. 123 | 124 | # IV.3 test. Also compute nb_miss for later 125 | nb_miss = abs(len(query_nhi['nbArray']) - len(set(query_nhi['nbArray']).intersection(set(target_nhi['nbArray'])))) 126 | 127 | if nb_miss > nb_allowed_misses: 128 | return 0. 129 | 130 | # IV.4 test 131 | if target_nhi['nbConnection'] < query_nhi['nbConnection'] - nbc_allowed_misses: 132 | return 0. 133 | 134 | # Compute actual nbc_miss 135 | if target_nhi['nbConnection'] >= query_nhi['nbConnection']: 136 | nbc_miss = 0. 137 | else: 138 | nbc_miss = float(query_nhi['nbConnection'] - target_nhi['nbConnection']) 139 | 140 | #if nbc_miss > num_allowed_misses: 141 | # return 0. 142 | 143 | #Now score match 144 | if query_nhi['degree'] == 0: 145 | f_nb = 0. 146 | else: 147 | f_nb = float(nb_miss) / float(query_nhi['degree']) 148 | 149 | if query_nhi['nbConnection'] == 0: 150 | f_nbc = 0. 151 | else: 152 | f_nbc = float(nbc_miss) / float(query_nhi['nbConnection']) 153 | 154 | 155 | if nb_miss == 0: 156 | w = 2. - f_nbc 157 | else: 158 | w = 2. - (f_nb + (f_nbc / nb_miss)) 159 | 160 | return w 161 | 162 | 163 | def __match_nodes(self, q_nh_idx, t_nh_idx, query_nodes, target_nodes, processing_queue, weight_mapping): 164 | for q in query_nodes: 165 | best_match = None 166 | for target in target_nodes: 167 | score = self.__match_node(q_nh_idx[q], t_nh_idx[target]) 168 | if score > 0.: 169 | if best_match is None: 170 | best_match = (target, score) 171 | else: 172 | if score > best_match[1]: 173 | best_match = (target, score) 174 | if best_match is None: 175 | # Unable to match this node...just skip it... 
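# (a query node that cannot be matched here is simply not queued; it may still be matched
# against a different candidate set in a later call, otherwise it just lowers the overall score)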
176 | continue 177 | 178 | if q not in [a for (a,b) in processing_queue]: 179 | if best_match[0] in [b for (a,b) in processing_queue]: 180 | print "somehow we are tryping to add a DB node thats already in processing queu" 181 | exit() 182 | processing_queue.append((q, best_match[0])) 183 | weight_mapping[q] = best_match 184 | target_nodes.remove(best_match[0]) 185 | else: 186 | # This node already in processing q 187 | # need to check score and see if we should replace 188 | if best_match[1] > weight_mapping[q][1]: 189 | processing_queue[processing_queue.index((q, weight_mapping[q][0]))] = (q, best_match[0]) 190 | weight_mapping[q] = best_match 191 | target_nodes.remove(best_match[0]) 192 | 193 | 194 | # Generate neighborhood index for all nodes in graph g 195 | def __generate_nh_index(self, g): 196 | nh_idx = {} 197 | for n in g.nodes: 198 | nh_idx[n] = self.__node_nh_idx(g,n) 199 | return nh_idx 200 | 201 | # Generate neighhood index for an individual node 202 | def __node_nh_idx(self, g, n): 203 | d = {} 204 | d['label'] = g.node[n]['type'] 205 | if 'code' in g.node[n]: 206 | d['code'] = g.node[n]['code'] 207 | d['degree'] = g.degree(n) 208 | d['nbConnection'] = 0 209 | d['nbArray'] = [] 210 | d['edgeArray'] = [] 211 | for nb in g.neighbors(n): 212 | # Building list of labels 213 | if g.node[nb]['type'] not in d['nbArray']: 214 | d['nbArray'].append(g.node[nb]['type']) 215 | 216 | # Keeping track of neighbor connectedness 217 | d['nbConnection'] = d['nbConnection'] + len(set(g.neighbors(nb)).intersection(set(g.neighbors(n)))) 218 | 219 | # Double counted...so divide by two 220 | d['nbConnection'] = d['nbConnection'] / 2 221 | return d 222 | 223 | def __find_important_nodes(self, graph): 224 | """ Returns a list of important nodes (based on degree, top p percent) """ 225 | 226 | # import based on degree centrality 227 | node_degree_dict = {} 228 | nodes_to_return = int(len(graph.nodes) * self.p) 229 | for n in graph.nodes: 230 | node_degree_dict[n] = graph.degree(n) 231 | 232 | important_nodes = [] 233 | for node_id, degree in reversed(sorted(node_degree_dict.iteritems(), key=lambda (k,v): (v,k))): 234 | important_nodes.append(node_id) 235 | nodes_to_return = nodes_to_return - 1 236 | if nodes_to_return <= 0: 237 | break 238 | 239 | return important_nodes 240 | 241 | 242 | 243 | if __name__ == "__main__": 244 | # Simple graph test 245 | G = nx.DiGraph() 246 | G.add_edge('1','2') 247 | G.add_edge('2','3') 248 | G.add_edge('3','4') 249 | G.add_edge('1','5') 250 | G.add_edge('5','6') 251 | G.add_edge('1','7') 252 | G.node['1']['type'] = 'one' 253 | G.node['2']['type'] = 'two' 254 | G.node['3']['type'] = 'three' 255 | G.node['4']['type'] = 'four' 256 | G.node['5']['type'] = 'two' 257 | G.node['6']['type'] = 'three' 258 | G.node['7']['type'] = 'two' 259 | G['1']['2']['type'] = 'edge_1_2' 260 | G['2']['3']['type'] = 'edge_2_3' 261 | G['3']['4']['type'] = 'edge_3_4' 262 | G['1']['5']['type'] = 'edge_1_2' 263 | G['5']['6']['type'] = 'edge_2_3' 264 | G['1']['7']['type'] = 'edge_1_2' 265 | 266 | H = G.copy() 267 | H.node['4']['type'] = '2' 268 | 269 | # Create Pathmatcher 270 | tm = TaleMatcher() 271 | # Prepare data 272 | prepared_data = tm.prepare(G, H) 273 | # Perform matching 274 | results = tm.match(G, H, prepared_data) 275 | print results 276 | -------------------------------------------------------------------------------- /src/matching/tale_matcher.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bbowman410/VGraph/ddeaf07e4f5864a1788d62b8a7893d415f44a525/src/matching/tale_matcher.pyc -------------------------------------------------------------------------------- /src/matching/test.py: -------------------------------------------------------------------------------- 1 | from exact_matcher import ExactMatcher 2 | import networkx as nx 3 | EM=ExactMatcher() 4 | G = nx.DiGraph() 5 | H = nx.DiGraph() 6 | G.add_edge('0','1') 7 | H.add_edge('2','3') 8 | H.add_edge('3','4') 9 | G.node['0']['type'] = 'type_1' 10 | G.node['0']['code'] = 'something else' 11 | G.node['1']['type'] = 'type_1' 12 | G.node['0']['type'] = 'type_0' 13 | H.node['2']['type'] = 'type_0' 14 | H.node['3']['type'] = 'type_1' 15 | H.node['4']['type'] = 'type_2' 16 | G['0']['1']['type'] = 'edge_type_0' 17 | H['2']['3']['type'] = 'edge_type_0' 18 | H['3']['4']['type'] = 'edge_type_2' 19 | res = EM.match(G,H,None,None) 20 | print(res) 21 | res = EM.match(G,H,None,None) 22 | print(res) 23 | 24 | 25 | vgraph = nx.read_gpickle('../../vgraph_db/ffmpeg/CVE-2014-8547/0b39ac6f54505a538c21fe49a626de94c518c903/gifdec.c/gif_read_image_pvg.gpickle') 26 | 27 | vuln_func = nx.read_gpickle('../../vuln_src_db/vuln_patch_graph_db/ffmpeg/CVE-2014-8547/vuln/0b39ac6f54505a538c21fe49a626de94c518c903/gifdec.c/graph/gif_read_image.gpickle') 28 | 29 | test = nx.DiGraph() 30 | test.add_node('2128756') 31 | test.node['2128756']['type'] = 'Symbol' 32 | test.node['2128756']['code'] = 'pass' 33 | 34 | test.add_node('2128631') 35 | test.node['2128631']['type'] = 'ExpressionStatement' 36 | test.node['2128631']['code'] = 'y1 = pass ? 2 : 4' 37 | 38 | test.add_edge('2128631','2128756') 39 | test['2128631']['2128756']['type'] = 'USE' 40 | 41 | print("vgraph num nodes: %d" % len(vgraph.nodes())) 42 | print("vuln func num nodes: %d" % len(vuln_func.nodes())) 43 | 44 | print("Testing big match...") 45 | res = EM.match(test,vuln_func,None,None) 46 | 47 | print(res) 48 | -------------------------------------------------------------------------------- /src/matching/triplet_match.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | import sys,os 3 | import pickle as pkl 4 | 5 | def triplet_match_exact(vg, target_trips): 6 | 7 | cvg_overlap = vg['cvg'].intersection(target_trips) 8 | pvg_overlap = vg['pvg'].intersection(target_trips) 9 | nvg_overlap = vg['nvg'].intersection(target_trips) 10 | 11 | cvg_score = (len(cvg_overlap)*100)/len(vg['cvg']) 12 | pvg_score=(len(pvg_overlap)*100/len(vg['pvg'])) 13 | nvg_score=(len(nvg_overlap)*100/len(vg['nvg'])) 14 | 15 | return cvg_score, pvg_score, nvg_score 16 | 17 | 18 | def approx_overlap(src_trips, target_trips): 19 | APPROX_THRESH = .7 20 | match_score = 0. 21 | already_matched = [] 22 | completed=0 23 | for (first, rela, second) in src_trips: 24 | local_max = 0 25 | local_match = None 26 | for (tg_first, tg_rela, tg_second) in target_trips: 27 | 28 | #if (tg_first, tg_rela, tg_second) in already_matched: 29 | # continue # bring down to nlogn complexity 30 | 31 | if rela == tg_rela: # same edge type required 32 | if first == tg_first: # if equal don't do expensive sequence matching 33 | score_first = 1. 34 | else: 35 | #score_first = SequenceMatcher(first, tg_first).ratio() 36 | score_first=set(first).intersection(set(tg_first)) 37 | score_first = float(len(score_first))/float((len(set(first).union(set(tg_first))))) 38 | if second == tg_second: 39 | score_second = 1. 
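# otherwise fall back to a cheap character-set Jaccard similarity instead of SequenceMatcher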
40 | else: 41 | #score_second = SequenceMatcher(second, tg_second).ratio() 42 | score_second=set(second).intersection(set(tg_second)) 43 | score_second=float(len(score_second))/float((len(set(second).union(set(tg_second))))) 44 | 45 | # check if they are both over match threshold 46 | #if score_first > APPROX_THRESH and score_second > APPROX_THRESH: 47 | score_avg = (score_first + score_second) / 2. 48 | if score_avg > APPROX_THRESH and score_avg > local_max: 49 | local_max = score_avg 50 | local_match = (tg_first, tg_rela, tg_second) 51 | 52 | if local_match: # Found a match for this src node 53 | match_score += local_max 54 | already_matched.append(local_match) 55 | completed += 1 56 | if (1.*(len(src_trips)-completed) + match_score)/len(src_trips) < .50: 57 | # Even if rest of triples found a perfect match, no way to get abouve 50% 58 | # so we break 59 | break 60 | 61 | # at most match_score would be +1 for each trip in src_trips 62 | return match_score 63 | 64 | 65 | def triplet_match_approx(vg, target_trips): 66 | ''' Approximate overlap function using string matching on code ''' 67 | cvg_overlap = approx_overlap(vg['cvg'],target_trips) 68 | cvg_score = (cvg_overlap*100)/len(vg['cvg']) 69 | if(cvg_score > 50): 70 | pvg_overlap = approx_overlap(vg['pvg'],target_trips) 71 | pvg_score = (pvg_overlap*100)/len(vg['pvg']) 72 | nvg_overlap = approx_overlap(vg['nvg'],target_trips) 73 | nvg_score = (nvg_overlap*100)/len(vg['nvg']) 74 | else: # no need to do pvg, nvg 75 | pvg_score = 0 76 | nvg_score = 0 77 | 78 | return cvg_score, pvg_score, nvg_score 79 | 80 | if __name__ == "__main__": 81 | src_dir=sys.argv[1] 82 | target_dir=sys.argv[2] 83 | for f in os.listdir(src_dir): 84 | if f.endswith("_cvg.pkl"): 85 | print("Loading cvg: ", f) 86 | cvg=pkl.load(open(src_dir + '/' + f, 'rb')) 87 | elif f.endswith("_pvg.pkl"): 88 | print("Loading pvg: ", f) 89 | pvg=pkl.load(open(src_dir + '/' + f,'rb')) 90 | elif f.endswith("_nvg.pkl"): 91 | print("Loading nvg: ", f) 92 | nvg=pkl.load(open(src_dir + '/' + f,'rb')) 93 | 94 | for f in os.listdir(target_dir): 95 | if f.endswith("triples"): 96 | print("Loading target triples: ", f) 97 | target_trips = pkl.load(open(target_dir + '/' + f,'rb')) 98 | 99 | vg={'cvg':cvg,'pvg':pvg,'nvg':nvg} 100 | 101 | print("Performing exact matching...") 102 | res = triplet_match_exact(vg, target_trips) 103 | print(res) 104 | print("Performing approximate matching...") 105 | res = triplet_match_approx(vg, target_trips) 106 | print(res) 107 | 108 | 109 | --------------------------------------------------------------------------------