├── README.md ├── raw-feature-extractor ├── cfg_constructor.py ├── discovRe.py ├── func.py ├── graph_analysis_ida.py ├── graph_property.py ├── preprocessing_ida.py └── raw_graphs.py └── search-engine └── db.py /README.md: -------------------------------------------------------------------------------- 1 | This project provides two components of Genius, a graph-based bug search framework. The first component is the raw feature extraction. The second is the online bug search engine. 2 | 3 | 1. The raw feature extraction is designed to achieve following two goals: 4 | 5 | -> Extract the control flow graph for each binary function 6 | 7 | -> Extract the attributes for each node in the grap 8 | 9 | The feature extraction is built on top of IDA-pro. We wrote the scripts based on ida-python and extract the attributed control flow graph. ``preprocessing_ida.py'' is the main program to extract the ACFG. 10 | 11 | 2. The online bug search engine is used for real-time search: 12 | 13 | -> It utilized localality sensitive hashing for indexing 14 | 15 | -> Nearest-neighbor search algorithm for search 16 | 17 | The online search is based on nearpy (https://github.com/pixelogik/NearPy). 18 | 19 | -------------------------------------------------------------------------------- /raw-feature-extractor/cfg_constructor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import networkx as nx 3 | from idautils import * 4 | from idaapi import * 5 | from idc import * 6 | 7 | import copy 8 | import networkx as nx 9 | from idautils import * 10 | from idaapi import * 11 | from idc import * 12 | from graph_analysis_ida import * 13 | 14 | 15 | def getCfg(func, externs_eas, ea_externs): 16 | func_start = func.startEA 17 | func_end = func.endEA 18 | cfg = nx.DiGraph() 19 | control_blocks, main_blocks = obtain_block_sequence(func) 20 | i = 0 21 | visited = {} 22 | start_node = None 23 | for bl in control_blocks: 24 | start = control_blocks[bl][0] 25 | end = control_blocks[bl][1] 26 | src_node = (start, end) 27 | if src_node not in visited: 28 | src_id = len(cfg) 29 | visited[src_node] = src_id 30 | cfg.add_node(src_id) 31 | cfg.node[src_id]['label'] = src_node 32 | else: 33 | src_id = visited[src_node] 34 | 35 | #if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp': 36 | if start == func_start: 37 | cfg.node[src_id]['c'] = "start" 38 | start_node = src_node 39 | if end == func_end: 40 | cfg.node[src_id]['c'] = "end" 41 | #print control_ea, 1 42 | refs = CodeRefsTo(start, 0) 43 | for ref in refs: 44 | if ref in control_blocks: 45 | dst_node = control_blocks[ref] 46 | if dst_node not in visited: 47 | visited[dst_node] = len(cfg) 48 | dst_id = visited[dst_node] 49 | cfg.add_edge(dst_id, src_id) 50 | cfg.node[dst_id]['label'] = dst_node 51 | #print control_ea, 1 52 | refs = CodeRefsTo(start, 1) 53 | for ref in refs: 54 | if ref in control_blocks: 55 | dst_node = control_blocks[ref] 56 | if dst_node not in visited: 57 | visited[dst_node] = len(cfg) 58 | dst_id = visited[dst_node] 59 | cfg.add_edge(dst_id, src_id) 60 | cfg.node[dst_id]['label'] = dst_node 61 | #print "attributing" 62 | attributingRe(cfg, externs_eas, ea_externs) 63 | # removing deadnodes 64 | #old_cfg = copy.deepcopy(cfg) 65 | #transform(cfg) 66 | return cfg, 0 67 | 68 | def transform(cfg): 69 | merging(cfg) 70 | filtering(cfg) 71 | 72 | def merging(cfg): 73 | bb_ids = cfg.nodes() 74 | for bb_id in bb_ids: 75 | try: 76 | bb = cfg.node[bb_id]['label'] 77 | bb_start = bb[0] 78 | bb_end = bb[1] 79 | succs = cfg.successors(bb_id) 80 | #preds = cfg.predecessors(bb_id) 81 | if len(succs) == 1: 82 | preds = cfg.predecessors(succs[0]) 83 | if len(preds) == 1: 84 | domerge(cfg, bb_id, succs[0]) 85 | except: 86 | pass 87 | 88 | def domerge(cfg, bb_id, suc_node): 89 | suc_nodes = cfg.successors(suc_node) 90 | for node in suc_nodes: 91 | cfg.add_edge(bb_id, node) 92 | cfg.remove_node(suc_node) 93 | 94 | 95 | def filtering(cfg): 96 | rm_sets = [] 97 | for bb_id in cfg: 98 | bb = cfg.node[bb_id]['label'] 99 | bb_start = bb[0] 100 | bb_end = bb[1] 101 | re = remove(bb_start, bb_end) 102 | print bb_id, re, bb_start, bb_end 103 | if re: 104 | print re, bb_id 105 | rm_sets.append(bb_id) 106 | print rm_sets 107 | for bb_id in rm_sets: 108 | cfg.remove_node(bb_id) 109 | 110 | def remove(bb_start, bb_end): 111 | seqs = getSequences(bb_start, bb_end) 112 | if matchseq(seqs): 113 | return True 114 | return False 115 | 116 | def matchseq(seqs): 117 | mips = set(['lw', "jr", "addiu"]) 118 | x86 = set(['add', 'pop', 'retn']) 119 | b_mips = set(['b', ('move','$v0')]) 120 | b_x86 = set(['b', ('mov','$eax')]) 121 | re_mips = set([('move','$v0')]) 122 | re_x86 = set([('mov','$eax')]) 123 | diff_mips = set(seqs).difference(set(mips)) 124 | if len(diff_mips) == 0: 125 | return True 126 | diff_x86 = set(seqs).difference(set(x86)) 127 | if len(diff_x86) == 0: 128 | return True 129 | if set(seqs) == b_mips: 130 | return True 131 | if set(seqs) == b_x86: 132 | return True 133 | if set(seqs) == re_mips: 134 | return True 135 | if set(seqs) == re_x86: 136 | return True 137 | return False 138 | 139 | def attributingRe(cfg, externs_eas, ea_externs): 140 | for node_id in cfg: 141 | bl = cfg.node[node_id]['label'] 142 | numIns = calInsts(bl) 143 | cfg.node[node_id]['numIns'] = numIns 144 | numCalls = calCalls(bl) 145 | cfg.node[node_id]['numCalls'] = numCalls 146 | numLIs = calLogicInstructions(bl) 147 | cfg.node[node_id]['numLIs'] = numLIs 148 | numAs = calArithmeticIns(bl) 149 | cfg.node[node_id]['numAs'] = numAs 150 | strings, consts = getBBconsts(bl) 151 | cfg.node[node_id]['numNc'] = len(strings) + len(consts) 152 | cfg.node[node_id]['consts'] = consts 153 | cfg.node[node_id]['strings'] = strings 154 | externs = retrieveExterns(bl, ea_externs) 155 | cfg.node[node_id]['externs'] = externs 156 | numTIs = calTransferIns(bl) 157 | cfg.node[node_id]['numTIs'] = numTIs 158 | 159 | 160 | def attributing(cfg): 161 | ga = graph_analysis() 162 | ga.gwithoffspring(cfg) 163 | print "finishing offspring" 164 | for node in cfg: 165 | stmt_num = getStmtNum(node) 166 | binary_value = getBinaryValue(node) 167 | cfg.node[node]['stmt_num'] = stmt_num 168 | cfg.node[node]['binary_value'] = binary_value 169 | ga.domChecking(cfg) 170 | print "finishing domChecking" 171 | ga.loopChecking(cfg) 172 | print "finishing loopChecking" 173 | 174 | 175 | def getStmtNum(node): 176 | start = node[0] 177 | end = node[1] 178 | stmt_num = 0 179 | inst_addr = start 180 | while inst_addr < end: 181 | inst_addr = NextHead(inst_addr) 182 | stmt_num += 1 183 | return stmt_num 184 | 185 | def getBinaryValue(node): 186 | start = node[0] 187 | inst_addr = NextHead(start) 188 | value = 0 189 | addr = 0 190 | for x in xrange((inst_addr - start)-1): 191 | addr = start + x 192 | y = GetOriginalByte(addr) 193 | print value, addr, y 194 | value = value | y 195 | value = value << 8 196 | print value 197 | 198 | addr = inst_addr - 1 199 | y = GetOriginalByte(addr) 200 | print value, addr, y 201 | value = value | y 202 | print node 203 | print bin(value) 204 | return value 205 | 206 | 207 | def cfg_construct(func): 208 | func_start = func.startEA 209 | func_end = func.endEA 210 | cfg = nx.DiGraph() 211 | seq_blocks, main_blocks = obtain_block_sequence(func) 212 | i = 0 213 | visited = {} 214 | for bl in seq_blocks: 215 | start = seq_blocks[bl][0] 216 | end = seq_blocks[bl][1] 217 | src_node = (start, end) 218 | if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp': 219 | next_start = seq_blocks[end][0] 220 | next_end = seq_blocks[end][1] 221 | next_node = (next_start, next_end) 222 | cfg.add_edge(src_node, next_node) 223 | if start == func_start: 224 | cfg.add_node(src_node, c='start') 225 | start_node = src_node 226 | if end == func_end: 227 | cfg.add_node(src_node, c='end') 228 | refs = CodeRefsFrom(PrevHead(end), 0) 229 | 230 | for ref in refs: 231 | #print ref 232 | if ref in seq_blocks: 233 | dst_node = (seq_blocks[ref][0], seq_blocks[ref][1]) 234 | cfg.add_edge(src_node, dst_node) 235 | return cfg, start_node 236 | 237 | 238 | def obtain_allpaths( cfg, node, path, allpaths): 239 | path.append(node) 240 | if 'c' in cfg.node[node] and cfg.node[node]['c'] == 'end': 241 | allpaths.append(path) 242 | return 243 | else: 244 | for suc in cfg.successors(node): 245 | if suc not in path: 246 | path_copy = copy.copy(path) 247 | obtain_allpaths(cfg, suc, path_copy, allpaths) 248 | 249 | 250 | def obtain_block_sequence(func): 251 | control_blocks = {} 252 | main_blocks = {} 253 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 254 | for bl in blocks: 255 | base = bl[0] 256 | end = PrevHead(bl[1]) 257 | control_ea = checkCB(bl) 258 | control_blocks[control_ea] = bl 259 | control_blocks[end] = bl 260 | if func.startEA <= base <= func.endEA: 261 | main_blocks[base] = bl 262 | x = sorted(main_blocks) 263 | return control_blocks, x 264 | 265 | def checkCB(bl): 266 | start = bl[0] 267 | end = bl[1] 268 | ea = start 269 | while ea < end: 270 | if checkCondition(ea): 271 | return ea 272 | ea = NextHead(ea) 273 | 274 | return PrevHead(end) 275 | 276 | def checkCondition(ea): 277 | mips_branch = {"beqz":1, "beq":1, "bne":1, "bgez":1, "b":1, "bnez":1, "bgtz":1, "bltz":1, "blez":1, "bgt":1, "bge":1, "blt":1, "ble":1, "bgtu":1, "bgeu":1, "bltu":1, "bleu":1} 278 | x86_branch = {"jz":1, "jnb":1, "jne":1, "je":1, "jg":1, "jle":1, "jl":1, "jge":1, "ja":1, "jae":1, "jb":1, "jbe":1, "jo":1, "jno":1, "js":1, "jns":1} 279 | arm_branch = {"B":1, "BAL":1, "BNE":1, "BEQ":1, "BPL":1, "BMI":1, "BCC":1, "BLO":1, "BCS":1, "BHS":1, "BVC":1, "BVS":1, "BGT":1, "BGE":1, "BLT":1, "BLE":1, "BHI":1 ,"BLS":1 } 280 | conds = {} 281 | conds.update(mips_branch) 282 | conds.update(x86_branch) 283 | opcode = GetMnem(ea) 284 | if opcode in conds: 285 | return True 286 | return False 287 | -------------------------------------------------------------------------------- /raw-feature-extractor/discovRe.py: -------------------------------------------------------------------------------- 1 | # 2 | # Reference Lister 3 | # 4 | # List all functions and all references to them in the current section. 5 | # 6 | # Implemented with the idautils module 7 | # 8 | import networkx as nx 9 | import cPickle as pickle 10 | import pdb 11 | from graph_analysis_ida import * 12 | from graph_property import * 13 | #import wingdbstub 14 | #wingdbstub.Ensure() 15 | 16 | def get_funcs(ea): 17 | funcs = {} 18 | # Get current ea 19 | # Loop from start to end in the current segment 20 | for funcea in Functions(SegStart(ea)): 21 | funcname = GetFunctionName(funcea) 22 | func = get_func(funcea) 23 | blocks = FlowChart(func) 24 | funcs[funcname] = [] 25 | for bl in blocks: 26 | start = bl.startEA 27 | end = bl.endEA 28 | funcs[funcname].append((start, end)) 29 | return funcs 30 | 31 | def get_funcs_for_discoverRe(ea): 32 | features = {} 33 | for funcea in Functions(SegStart(ea)): 34 | funcname = GetFunctionName(funcea) 35 | print funcname 36 | func = get_func(funcea) 37 | feature = get_discoverRe_feature(func) 38 | features[funcname] = feature 39 | return features 40 | 41 | def get_discoverRe_feature(func, icfg): 42 | start = func.startEA 43 | end = func.endEA 44 | features = [] 45 | FunctionCalls = getFuncCalls(func) 46 | #1 47 | features.append(FunctionCalls) 48 | LogicInstr = getLogicInsts(func) 49 | #2 50 | features.append(LogicInstr) 51 | Transfer = getTransferInsts(func) 52 | #3 53 | features.append(Transfer) 54 | Locals = getLocalVariables(func) 55 | #4 56 | features.append(Locals) 57 | BB = getBasicBlocks(func) 58 | #5 59 | features.append(BB) 60 | Edges = len(icfg.edges()) 61 | #6 62 | features.append(Edges) 63 | Incoming = getIncommingCalls(func) 64 | #7 65 | features.append(Incoming) 66 | #8 67 | Instrs = getIntrs(func) 68 | features.append(Instrs) 69 | between = retrieveGP(icfg) 70 | #9 71 | features.append(between) 72 | 73 | strings, consts = getfunc_consts(func) 74 | features.append(strings) 75 | features.append(consts) 76 | return features 77 | 78 | def get_func_names(ea): 79 | funcs = {} 80 | for funcea in Functions(SegStart(ea)): 81 | funcname = GetFunctionName(funcea) 82 | funcs[funcname] = funcea 83 | return funcs 84 | 85 | def get_func_bases(ea): 86 | funcs = {} 87 | for funcea in Functions(SegStart(ea)): 88 | funcname = GetFunctionName(funcea) 89 | funcs[funcea] = funcname 90 | return funcs 91 | 92 | def get_func_range(ea): 93 | funcs = {} 94 | for funcea in Functions(SegStart(ea)): 95 | funcname = GetFunctionName(funcea) 96 | func = get_func(funcea) 97 | funcs[funcname] = (func.startEA, func.endEA) 98 | return funcs 99 | 100 | def get_func_sequences(ea): 101 | funcs_bodylist = {} 102 | funcs = get_funcs(ea) 103 | for funcname in funcs: 104 | if funcname not in funcs_bodylist: 105 | funcs_bodylist[funcname] = [] 106 | for start, end in funcs[funcname]: 107 | inst_addr = start 108 | while inst_addr <= end: 109 | opcode = GetMnem(inst_addr) 110 | funcs_bodylist[funcname].append(opcode) 111 | inst_addr = NextHead(inst_addr) 112 | return funcs_bodylist 113 | 114 | def get_func_cfgs(ea): 115 | func_cfglist = {} 116 | i = 0 117 | start, end = get_section('LOAD') 118 | #print start, end 119 | for funcea in Functions(SegStart(ea)): 120 | if start <= funcea <= end: 121 | funcname = GetFunctionName(funcea) 122 | func = get_func(funcea) 123 | print i 124 | i += 1 125 | try: 126 | icfg = cfg.cfg_construct(func) 127 | func_cfglist[funcname] = icfg 128 | except: 129 | pass 130 | 131 | return func_cfglist 132 | 133 | def get_section(t): 134 | base = SegByName(t) 135 | start = SegByBase(base) 136 | end = SegEnd(start) 137 | return start, end 138 | 139 | 140 | def get_func_cfg_sequences(func_cfglist): 141 | func_cfg_seqlist = {} 142 | for funcname in func_cfglist: 143 | func_cfg_seqlist[funcname] = {} 144 | cfg = func_cfglist[funcname][0] 145 | for start, end in cfg: 146 | codesq = get_sequences(start, end) 147 | func_cfg_seqlist[funcname][(start,end)] = codesq 148 | 149 | return func_cfg_seqlist 150 | 151 | 152 | def get_sequences(start, end): 153 | seq = [] 154 | inst_addr = start 155 | while inst_addr <= end: 156 | opcode = GetMnem(inst_addr) 157 | seq.append(opcode) 158 | inst_addr = NextHead(inst_addr) 159 | return seq 160 | 161 | def get_stack_arg(func_addr): 162 | print func_addr 163 | args = [] 164 | stack = GetFrame(func_addr) 165 | if not stack: 166 | return [] 167 | firstM = GetFirstMember(stack) 168 | lastM = GetLastMember(stack) 169 | i = firstM 170 | while i <=lastM: 171 | mName = GetMemberName(stack,i) 172 | mSize = GetMemberSize(stack,i) 173 | if mSize: 174 | i = i + mSize 175 | else: 176 | i = i+4 177 | if mName not in args and mName and ' s' not in mName and ' r' not in mName: 178 | args.append(mName) 179 | return args 180 | 181 | #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) 182 | 183 | def processDataSegs(): 184 | funcdata = {} 185 | datafunc = {} 186 | for n in xrange(idaapi.get_segm_qty()): 187 | seg = idaapi.getnseg(n) 188 | ea = seg.startEA 189 | segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) 190 | if segtype in [idc.SEG_DATA, idc.SEG_BSS]: 191 | start = idc.SegStart(ea) 192 | end = idc.SegEnd(ea) 193 | cur = start 194 | while cur <= end: 195 | refs = [v for v in DataRefsTo(cur)] 196 | for fea in refs: 197 | name = GetFunctionName(fea) 198 | if len(name)== 0: 199 | continue 200 | if name not in funcdata: 201 | funcdata[name] = [cur] 202 | else: 203 | funcdata[name].append(cur) 204 | if cur not in datafunc: 205 | datafunc[cur] = [name] 206 | else: 207 | datafunc[cur].append(name) 208 | cur = NextHead(cur) 209 | return funcdata, datafunc 210 | 211 | def obtainDataRefs(callgraph): 212 | datarefs = {} 213 | funcdata, datafunc = processDataSegs() 214 | for node in callgraph: 215 | if node in funcdata: 216 | datas = funcdata[node] 217 | for dd in datas: 218 | refs = datafunc[dd] 219 | refs = list(set(refs)) 220 | if node in datarefs: 221 | print refs 222 | datarefs[node] += refs 223 | datarefs[node] = list(set(datarefs[node])) 224 | else: 225 | datarefs[node] = refs 226 | return datarefs 227 | 228 | 229 | -------------------------------------------------------------------------------- /raw-feature-extractor/func.py: -------------------------------------------------------------------------------- 1 | # 2 | # Reference Lister 3 | # 4 | # List all functions and all references to them in the current section. 5 | # 6 | # Implemented with the idautils module 7 | # 8 | from idautils import * 9 | from idaapi import * 10 | from idc import * 11 | import networkx as nx 12 | import cfg_constructor as cfg 13 | import cPickle as pickle 14 | import pdb 15 | from raw_graphs import * 16 | #from discovRe_feature.discovRe import * 17 | from discovRe import * 18 | #import wingdbstub 19 | #wingdbstub.Ensure() 20 | def gt_funcNames(ea): 21 | funcs = [] 22 | plt_func, plt_data = processpltSegs() 23 | for funcea in Functions(SegStart(ea)): 24 | funcname = get_unified_funcname(funcea) 25 | if funcname in plt_func: 26 | print funcname 27 | continue 28 | funcs.append(funcname) 29 | return funcs 30 | 31 | def get_funcs(ea): 32 | funcs = {} 33 | # Get current ea 34 | # Loop from start to end in the current segment 35 | plt_func, plt_data = processpltSegs() 36 | for funcea in Functions(SegStart(ea)): 37 | funcname = get_unified_funcname(funcea) 38 | if funcname in plt_func: 39 | continue 40 | func = get_func(funcea) 41 | blocks = FlowChart(func) 42 | funcs[funcname] = [] 43 | for bl in blocks: 44 | start = bl.startEA 45 | end = bl.endEA 46 | funcs[funcname].append((start, end)) 47 | return funcs 48 | 49 | # used for the callgraph generation. 50 | def get_func_namesWithoutE(ea): 51 | funcs = {} 52 | plt_func, plt_data = processpltSegs() 53 | for funcea in Functions(SegStart(ea)): 54 | funcname = get_unified_funcname(funcea) 55 | if 'close' in funcname: 56 | print funcea 57 | if funcname in plt_func: 58 | print funcname 59 | continue 60 | funcs[funcname] = funcea 61 | return funcs 62 | 63 | # used for the callgraph generation. 64 | def get_func_names(ea): 65 | funcs = {} 66 | for funcea in Functions(SegStart(ea)): 67 | funcname = get_unified_funcname(funcea) 68 | funcs[funcname] = funcea 69 | return funcs 70 | 71 | def get_func_bases(ea): 72 | funcs = {} 73 | plt_func, plt_data = processpltSegs() 74 | for funcea in Functions(SegStart(ea)): 75 | funcname = get_unified_funcname(funcea) 76 | if funcname in plt_func: 77 | continue 78 | funcs[funcea] = funcname 79 | return funcs 80 | 81 | def get_func_range(ea): 82 | funcs = {} 83 | for funcea in Functions(SegStart(ea)): 84 | funcname = get_unified_funcname(funcea) 85 | func = get_func(funcea) 86 | funcs[funcname] = (func.startEA, func.endEA) 87 | return funcs 88 | 89 | def get_unified_funcname(ea): 90 | funcname = GetFunctionName(ea) 91 | if len(funcname) > 0: 92 | if '.' == funcname[0]: 93 | funcname = funcname[1:] 94 | return funcname 95 | 96 | def get_func_sequences(ea): 97 | funcs_bodylist = {} 98 | funcs = get_funcs(ea) 99 | for funcname in funcs: 100 | if funcname not in funcs_bodylist: 101 | funcs_bodylist[funcname] = [] 102 | for start, end in funcs[funcname]: 103 | inst_addr = start 104 | while inst_addr <= end: 105 | opcode = GetMnem(inst_addr) 106 | funcs_bodylist[funcname].append(opcode) 107 | inst_addr = NextHead(inst_addr) 108 | return funcs_bodylist 109 | 110 | def get_func_cfgs_c(ea): 111 | binary_name = idc.GetInputFile() 112 | raw_cfgs = raw_graphs(binary_name) 113 | externs_eas, ea_externs = processpltSegs() 114 | i = 0 115 | for funcea in Functions(SegStart(ea)): 116 | funcname = get_unified_funcname(funcea) 117 | func = get_func(funcea) 118 | print i 119 | i += 1 120 | icfg = cfg.getCfg(func, externs_eas, ea_externs) 121 | func_f = get_discoverRe_feature(func, icfg[0]) 122 | raw_g = raw_graph(funcname, icfg, func_f) 123 | raw_cfgs.append(raw_g) 124 | 125 | return raw_cfgs 126 | 127 | def get_func_cfgs_ctest(ea): 128 | binary_name = idc.GetInputFile() 129 | raw_cfgs = raw_graphs(binary_name) 130 | externs_eas, ea_externs = processpltSegs() 131 | i = 0 132 | diffs = {} 133 | for funcea in Functions(SegStart(ea)): 134 | funcname = get_unified_funcname(funcea) 135 | func = get_func(funcea) 136 | print i 137 | i += 1 138 | icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs) 139 | diffs[funcname] = (icfg, old_cfg) 140 | #raw_g = raw_graph(funcname, icfg) 141 | #raw_cfgs.append(raw_g) 142 | 143 | return diffs 144 | 145 | def get_func_cfgs(ea): 146 | func_cfglist = {} 147 | i = 0 148 | for funcea in Functions(SegStart(ea)): 149 | funcname = get_unified_funcname(funcea) 150 | func = get_func(funcea) 151 | print i 152 | i += 1 153 | try: 154 | icfg = cfg.getCfg(func) 155 | func_cfglist[funcname] = icfg 156 | except: 157 | pass 158 | 159 | return func_cfglist 160 | 161 | def get_func_cfg_sequences(func_cfglist): 162 | func_cfg_seqlist = {} 163 | for funcname in func_cfglist: 164 | func_cfg_seqlist[funcname] = {} 165 | cfg = func_cfglist[funcname][0] 166 | for start, end in cfg: 167 | codesq = get_sequences(start, end) 168 | func_cfg_seqlist[funcname][(start,end)] = codesq 169 | 170 | return func_cfg_seqlist 171 | 172 | 173 | def get_sequences(start, end): 174 | seq = [] 175 | inst_addr = start 176 | while inst_addr <= end: 177 | opcode = GetMnem(inst_addr) 178 | seq.append(opcode) 179 | inst_addr = NextHead(inst_addr) 180 | return seq 181 | 182 | def get_stack_arg(func_addr): 183 | print func_addr 184 | args = [] 185 | stack = GetFrame(func_addr) 186 | if not stack: 187 | return [] 188 | firstM = GetFirstMember(stack) 189 | lastM = GetLastMember(stack) 190 | i = firstM 191 | while i <=lastM: 192 | mName = GetMemberName(stack,i) 193 | mSize = GetMemberSize(stack,i) 194 | if mSize: 195 | i = i + mSize 196 | else: 197 | i = i+4 198 | if mName not in args and mName and ' s' not in mName and ' r' not in mName: 199 | args.append(mName) 200 | return args 201 | 202 | #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w')) 203 | 204 | def processExternalSegs(): 205 | funcdata = {} 206 | datafunc = {} 207 | for n in xrange(idaapi.get_segm_qty()): 208 | seg = idaapi.getnseg(n) 209 | ea = seg.startEA 210 | segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) 211 | if segtype in [idc.SEG_XTRN]: 212 | start = idc.SegStart(ea) 213 | end = idc.SegEnd(ea) 214 | cur = start 215 | while cur <= end: 216 | name = get_unified_funcname(cur) 217 | funcdata[name] = hex(cur) 218 | cur = NextHead(cur) 219 | return funcdata 220 | 221 | def processpltSegs(): 222 | funcdata = {} 223 | datafunc = {} 224 | for n in xrange(idaapi.get_segm_qty()): 225 | seg = idaapi.getnseg(n) 226 | ea = seg.startEA 227 | segname = SegName(ea) 228 | if segname in ['.plt', 'extern', '.MIPS.stubs']: 229 | start = seg.startEA 230 | end = seg.endEA 231 | cur = start 232 | while cur < end: 233 | name = get_unified_funcname(cur) 234 | funcdata[name] = hex(cur) 235 | datafunc[cur]= name 236 | cur = NextHead(cur) 237 | return funcdata, datafunc 238 | 239 | 240 | def processDataSegs(): 241 | funcdata = {} 242 | datafunc = {} 243 | for n in xrange(idaapi.get_segm_qty()): 244 | seg = idaapi.getnseg(n) 245 | ea = seg.startEA 246 | segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) 247 | if segtype in [idc.SEG_DATA, idc.SEG_BSS]: 248 | start = idc.SegStart(ea) 249 | end = idc.SegEnd(ea) 250 | cur = start 251 | while cur <= end: 252 | refs = [v for v in DataRefsTo(cur)] 253 | for fea in refs: 254 | name = get_unified_funcname(fea) 255 | if len(name)== 0: 256 | continue 257 | if name not in funcdata: 258 | funcdata[name] = [cur] 259 | else: 260 | funcdata[name].append(cur) 261 | if cur not in datafunc: 262 | datafunc[cur] = [name] 263 | else: 264 | datafunc[cur].append(name) 265 | cur = NextHead(cur) 266 | return funcdata, datafunc 267 | 268 | def obtainDataRefs(callgraph): 269 | datarefs = {} 270 | funcdata, datafunc = processDataSegs() 271 | for node in callgraph: 272 | if node in funcdata: 273 | datas = funcdata[node] 274 | for dd in datas: 275 | refs = datafunc[dd] 276 | refs = list(set(refs)) 277 | if node in datarefs: 278 | print refs 279 | datarefs[node] += refs 280 | datarefs[node] = list(set(datarefs[node])) 281 | else: 282 | datarefs[node] = refs 283 | return datarefs 284 | 285 | 286 | -------------------------------------------------------------------------------- /raw-feature-extractor/graph_analysis_ida.py: -------------------------------------------------------------------------------- 1 | from idautils import * 2 | from idaapi import * 3 | from idc import * 4 | 5 | def getfunc_consts(func): 6 | strings = [] 7 | consts = [] 8 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 9 | for bl in blocks: 10 | strs, conts = getBBconsts(bl) 11 | strings += strs 12 | consts += conts 13 | return strings, consts 14 | 15 | def getConst(ea, offset): 16 | strings = [] 17 | consts = [] 18 | optype1 = GetOpType(ea, offset) 19 | if optype1 == idaapi.o_imm: 20 | imm_value = GetOperandValue(ea, offset) 21 | if 0<= imm_value <= 10: 22 | consts.append(imm_value) 23 | else: 24 | if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value): 25 | str_value = GetString(imm_value) 26 | if str_value is None: 27 | str_value = GetString(imm_value+0x40000) 28 | if str_value is None: 29 | consts.append(imm_value) 30 | else: 31 | re = all(40 <= ord(c) < 128 for c in str_value) 32 | if re: 33 | strings.append(str_value) 34 | else: 35 | consts.append(imm_value) 36 | else: 37 | re = all(40 <= ord(c) < 128 for c in str_value) 38 | if re: 39 | strings.append(str_value) 40 | else: 41 | consts.append(imm_value) 42 | else: 43 | consts.append(imm_value) 44 | return strings, consts 45 | 46 | def getBBconsts(bl): 47 | strings = [] 48 | consts = [] 49 | start = bl[0] 50 | end = bl[1] 51 | invoke_num = 0 52 | inst_addr = start 53 | while inst_addr < end: 54 | opcode = GetMnem(inst_addr) 55 | if opcode in ['la','jalr','call', 'jal']: 56 | inst_addr = NextHead(inst_addr) 57 | continue 58 | strings_src, consts_src = getConst(inst_addr, 0) 59 | strings_dst, consts_dst = getConst(inst_addr, 1) 60 | strings += strings_src 61 | strings += strings_dst 62 | consts += consts_src 63 | consts += consts_dst 64 | try: 65 | strings_dst, consts_dst = getConst(inst_addr, 2) 66 | consts += consts_dst 67 | strings += strings_dst 68 | except: 69 | pass 70 | 71 | inst_addr = NextHead(inst_addr) 72 | return strings, consts 73 | 74 | def getFuncCalls(func): 75 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 76 | sumcalls = 0 77 | for bl in blocks: 78 | callnum = calCalls(bl) 79 | sumcalls += callnum 80 | return sumcalls 81 | 82 | def getLogicInsts(func): 83 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 84 | sumcalls = 0 85 | for bl in blocks: 86 | callnum = calLogicInstructions(bl) 87 | sumcalls += callnum 88 | return sumcalls 89 | 90 | def getTransferInsts(func): 91 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 92 | sumcalls = 0 93 | for bl in blocks: 94 | callnum = calTransferIns(bl) 95 | sumcalls += callnum 96 | return sumcalls 97 | 98 | def getIntrs(func): 99 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 100 | sumcalls = 0 101 | for bl in blocks: 102 | callnum = calInsts(bl) 103 | sumcalls += callnum 104 | return sumcalls 105 | 106 | def getLocalVariables(func): 107 | args_num = get_stackVariables(func.startEA) 108 | return args_num 109 | 110 | def getBasicBlocks(func): 111 | blocks = [(v.startEA, v.endEA) for v in FlowChart(func)] 112 | return len(blocks) 113 | 114 | def getIncommingCalls(func): 115 | refs = CodeRefsTo(func.startEA, 0) 116 | re = len([v for v in refs]) 117 | return re 118 | 119 | 120 | def get_stackVariables(func_addr): 121 | #print func_addr 122 | args = [] 123 | stack = GetFrame(func_addr) 124 | if not stack: 125 | return 0 126 | firstM = GetFirstMember(stack) 127 | lastM = GetLastMember(stack) 128 | i = firstM 129 | while i <=lastM: 130 | mName = GetMemberName(stack,i) 131 | mSize = GetMemberSize(stack,i) 132 | if mSize: 133 | i = i + mSize 134 | else: 135 | i = i+4 136 | if mName not in args and mName and 'var_' in mName: 137 | args.append(mName) 138 | return len(args) 139 | 140 | 141 | 142 | def calArithmeticIns(bl): 143 | x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1} 144 | mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1} 145 | calls = {} 146 | calls.update(x86_AI) 147 | calls.update(mips_AI) 148 | start = bl[0] 149 | end = bl[1] 150 | invoke_num = 0 151 | inst_addr = start 152 | while inst_addr < end: 153 | opcode = GetMnem(inst_addr) 154 | if opcode in calls: 155 | invoke_num += 1 156 | inst_addr = NextHead(inst_addr) 157 | return invoke_num 158 | 159 | def calCalls(bl): 160 | calls = {'call':1, 'jal':1, 'jalr':1} 161 | start = bl[0] 162 | end = bl[1] 163 | invoke_num = 0 164 | inst_addr = start 165 | while inst_addr < end: 166 | opcode = GetMnem(inst_addr) 167 | if opcode in calls: 168 | invoke_num += 1 169 | inst_addr = NextHead(inst_addr) 170 | return invoke_num 171 | 172 | def calInsts(bl): 173 | start = bl[0] 174 | end = bl[1] 175 | ea = start 176 | num = 0 177 | while ea < end: 178 | num += 1 179 | ea = NextHead(ea) 180 | return num 181 | 182 | def calLogicInstructions(bl): 183 | x86_LI = {'and':1, 'andn':1, 'andnpd':1, 'andpd':1, 'andps':1, 'andnps':1, 'test':1, 'xor':1, 'xorpd':1, 'pslld':1} 184 | mips_LI = {'and':1, 'andi':1, 'or':1, 'ori':1, 'xor':1, 'nor':1, 'slt':1, 'slti':1, 'sltu':1} 185 | calls = {} 186 | calls.update(x86_LI) 187 | calls.update(mips_LI) 188 | start = bl[0] 189 | end = bl[1] 190 | invoke_num = 0 191 | inst_addr = start 192 | while inst_addr < end: 193 | opcode = GetMnem(inst_addr) 194 | if opcode in calls: 195 | invoke_num += 1 196 | inst_addr = NextHead(inst_addr) 197 | return invoke_num 198 | 199 | def calSconstants(bl): 200 | start = bl[0] 201 | end = bl[1] 202 | invoke_num = 0 203 | inst_addr = start 204 | while inst_addr < end: 205 | opcode = GetMnem(inst_addr) 206 | if opcode in calls: 207 | invoke_num += 1 208 | inst_addr = NextHead(inst_addr) 209 | return invoke_num 210 | 211 | 212 | def calNconstants(bl): 213 | start = bl[0] 214 | end = bl[1] 215 | invoke_num = 0 216 | inst_addr = start 217 | while inst_addr < end: 218 | optype1 = GetOpType(inst_addr, 0) 219 | optype2 = GetOpType(inst_addr, 1) 220 | if optype1 == 5 or optype2 == 5: 221 | invoke_num += 1 222 | inst_addr = NextHead(inst_addr) 223 | return invoke_num 224 | 225 | def retrieveExterns(bl, ea_externs): 226 | externs = [] 227 | start = bl[0] 228 | end = bl[1] 229 | inst_addr = start 230 | while inst_addr < end: 231 | refs = CodeRefsFrom(inst_addr, 1) 232 | try: 233 | ea = [v for v in refs if v in ea_externs][0] 234 | externs.append(ea_externs[ea]) 235 | except: 236 | pass 237 | inst_addr = NextHead(inst_addr) 238 | return externs 239 | 240 | def calTransferIns(bl): 241 | x86_TI = {'jmp':1, 'jz':1, 'jnz':1, 'js':1, 'je':1, 'jne':1, 'jg':1, 'jle':1, 'jge':1, 'ja':1, 'jnc':1, 'call':1} 242 | mips_TI = {'beq':1, 'bne':1, 'bgtz':1, "bltz":1, "bgez":1, "blez":1, 'j':1, 'jal':1, 'jr':1, 'jalr':1} 243 | arm_TI = {'MVN':1, "MOV":1} 244 | calls = {} 245 | calls.update(x86_TI) 246 | calls.update(mips_TI) 247 | start = bl[0] 248 | end = bl[1] 249 | invoke_num = 0 250 | inst_addr = start 251 | while inst_addr < end: 252 | opcode = GetMnem(inst_addr) 253 | re = [v for v in calls if opcode in v] 254 | if len(re) > 0: 255 | invoke_num += 1 256 | inst_addr = NextHead(inst_addr) 257 | return invoke_num -------------------------------------------------------------------------------- /raw-feature-extractor/graph_property.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import pdb 3 | def betweeness(g): 4 | #pdb.set_trace() 5 | betweenness = nx.betweenness_centrality(g) 6 | return betweenness 7 | 8 | def eigenvector(g): 9 | centrality = nx.eigenvector_centrality(g) 10 | return centrality 11 | 12 | def closeness_centrality(g): 13 | closeness = nx.closeness_centrality(g) 14 | return closeness 15 | 16 | def retrieveGP(g): 17 | bf = betweeness(g) 18 | #close = closeness_centrality(g) 19 | #bf_sim = 20 | #close_sim = 21 | x = sorted(bf.values()) 22 | value = sum(x)/len(x) 23 | return round(value,5) 24 | 25 | -------------------------------------------------------------------------------- /raw-feature-extractor/preprocessing_ida.py: -------------------------------------------------------------------------------- 1 | from func import * 2 | from raw_graphs import * 3 | from idc import * 4 | import os 5 | import argparse 6 | 7 | def parse_command(): 8 | parser = argparse.ArgumentParser(description='Process some integers.') 9 | parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file") 10 | args = parser.parse_args() 11 | return args 12 | 13 | if __name__ == '__main__': 14 | 15 | args = parse_command() 16 | path = args.path 17 | analysis_flags = idc.GetShortPrm(idc.INF_START_AF) 18 | analysis_flags &= ~idc.AF_IMMOFF 19 | # turn off "automatically make offset" heuristic 20 | idc.SetShortPrm(idc.INF_START_AF, analysis_flags) 21 | idaapi.autoWait() 22 | cfgs = get_func_cfgs_c(FirstSeg()) 23 | binary_name = idc.GetInputFile() + '.ida' 24 | fullpath = os.path.join(path, binary_name) 25 | pickle.dump(cfgs, open(fullpath,'w')) 26 | print binary_name 27 | idc.Exit(0) -------------------------------------------------------------------------------- /raw-feature-extractor/raw_graphs.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import sys 3 | sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/') 4 | import networkx as nx 5 | #import numpy as np 6 | from subprocess import Popen, PIPE 7 | import pdb 8 | import os 9 | import re,mmap 10 | #from graph_edit_new import * 11 | 12 | class raw_graph: 13 | def __init__(self, funcname, g, func_f): 14 | self.funcname = funcname 15 | self.old_g = g[0] 16 | self.g = nx.DiGraph() 17 | self.entry = g[1] 18 | self.fun_features = func_f 19 | self.attributing() 20 | 21 | def __len__(self): 22 | return len(self.g) 23 | 24 | def attributing(self): 25 | self.obtainOffsprings(self.old_g) 26 | for node in self.old_g: 27 | fvector = self.retrieveVec(node, self.old_g) 28 | self.g.add_node(node) 29 | self.g.node[node]['v'] = fvector 30 | 31 | for edge in self.old_g.edges(): 32 | node1 = edge[0] 33 | node2 = edge[1] 34 | self.g.add_edge(node1, node2) 35 | 36 | def obtainOffsprings(self,g): 37 | nodes = g.nodes() 38 | for node in nodes: 39 | offsprings = {} 40 | self.getOffsprings(g, node, offsprings) 41 | g.node[node]['offs'] = len(offsprings) 42 | return g 43 | 44 | def getOffsprings(self, g, node, offsprings): 45 | node_offs = 0 46 | sucs = g.successors(node) 47 | for suc in sucs: 48 | if suc not in offsprings: 49 | offsprings[suc] = 1 50 | self.getOffsprings(g, suc, offsprings) 51 | 52 | def retrieveVec(self, id_, g): 53 | feature_vec = [] 54 | #numC0 55 | numc = g.node[id_]['consts'] 56 | feature_vec.append(numc) 57 | #nums1 58 | nums = g.node[id_]['strings'] 59 | feature_vec.append(nums) 60 | #offsprings2 61 | offs = g.node[id_]['offs'] 62 | feature_vec.append(offs) 63 | #numAs3 64 | numAs = g.node[id_]['numAs'] 65 | feature_vec.append(numAs) 66 | # of calls4 67 | calls = g.node[id_]['numCalls'] 68 | feature_vec.append(calls) 69 | # of insts5 70 | insts = g.node[id_]['numIns'] 71 | feature_vec.append(insts) 72 | # of LIs6 73 | insts = g.node[id_]['numLIs'] 74 | feature_vec.append(insts) 75 | # of TIs7 76 | insts = g.node[id_]['numTIs'] 77 | feature_vec.append(insts) 78 | return feature_vec 79 | 80 | 81 | def enumerating(self, n): 82 | subgs = [] 83 | #pdb.set_trace() 84 | for sub_nodes in itertools.combinations(self.g.nodes(), n): 85 | subg = self.g.subgraph(sub_nodes) 86 | u_subg = subg.to_undirected() 87 | if nx.is_connected(u_subg): 88 | subgs.append(subg) 89 | return subgs 90 | 91 | 92 | def genMotifs(self, n): 93 | motifs = {} 94 | subgs = enumerating(n) 95 | for subg in subgs: 96 | if len(motifs) == 0: 97 | motifs[subg] = [subg] 98 | else: 99 | nomatch = True 100 | for mt in motifs: 101 | if nx.is_isomorphic(mt, subg): 102 | motifs[mt].append(subg) 103 | nomatch = False 104 | if nomatch: 105 | motifs[subg] = [subg] 106 | return motifs 107 | 108 | def enumerating_efficient(self, n): 109 | #pdb.set_trace() 110 | if len(self.g) >= 200: 111 | return [] 112 | with open('/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt','wb') as f: 113 | nx.write_edgelist(self.g,f,data=False) 114 | #pdb.set_trace() 115 | process = Popen(["/home/qian/workspace/FANMOD-command_line-source/executables/./fanmod_command_line_linux", str(n), "100000", "1", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt", "1", "0", "0", "2", "0", "0", "0", "1000", "3", "3", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt", "0", "1"], stdout=PIPE, stderr=PIPE) 116 | stdout, stderr = process.communicate() 117 | if process.returncode >= 0: 118 | #os.system("/home/qian/software/FANMOD-command_line-source/executables/./fanmod_command_line_linux " +str(n) + " 100000 1 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt 1 0 0 2 0 0 0 1000 3 3 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt 0 1") 119 | #pdb.set_trace() 120 | #pdb.set_trace() 121 | subgs = self.parseOutput("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump", n) 122 | #pdb.set_trace() 123 | os.remove("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump") 124 | return subgs 125 | return [] 126 | 127 | def parseOutput(self, path, n): 128 | pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+') 129 | subgraphs = [] 130 | with open(path,'r') as f: 131 | data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) 132 | mo = re.findall(pattern, data) 133 | if mo: 134 | results = [map(int, v.split(',')[1:]) for v in mo] 135 | subgraphs = self.createGraphDirectly(results) 136 | return subgraphs 137 | 138 | def parseOutputByconditions(self, path, n): 139 | pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+') 140 | subgraphs = [] 141 | with open(path,'r') as f: 142 | data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ) 143 | mo = re.findall(pattern, data) 144 | if mo: 145 | results = [map(int, v.split(',')[1:]) for v in mo] 146 | subgraphs = self.create_Graphbycondition_Directly(results) 147 | return subgraphs 148 | 149 | def create_Graphbycondition_Directly(self, results): 150 | subgs = [] 151 | for indexes in results: 152 | tg = template_graph() 153 | subg = self.g.subgraph(indexes) 154 | tg.updateG(subg) 155 | subgs.append(tg) 156 | del tg 157 | return subgs 158 | 159 | def createGraphDirectly(self, results): 160 | #pdb.set_trace() 161 | #subgs = [self.g.subgraph(indexes) for indexes in results] 162 | subgs = [] 163 | for indexes in results: 164 | tg = template_graph() 165 | subg = self.g.subgraph(indexes) 166 | tg.updateG(subg) 167 | subgs.append(tg) 168 | del tg 169 | return subgs 170 | 171 | def createGraph(self, results, n): 172 | binary_value = int(results[0],2) 173 | indexes = [int(v) for v in results[1:]] 174 | fang = self.createG(results[0], n) 175 | if fang: 176 | tg = template_graph(binary_value) 177 | tg.updateG(fang, indexes, self.g) 178 | return tg 179 | pdb.set_trace() 180 | print "there is g which is none" 181 | 182 | def createG(self, binary_str, n): 183 | g = nx.DiGraph() 184 | l = [int(v) for v in binary_str] 185 | #pdb.set_trace() 186 | shape = (n, n) 187 | data = np.array(l) 188 | ad_matrix = data.reshape(shape) 189 | for i in xrange(n): 190 | for j in xrange(n): 191 | if ad_matrix[i][j] == 1: 192 | g.add_edge(i, j) 193 | return g 194 | 195 | 196 | 197 | class raw_graphs: 198 | def __init__(self, binary_name): 199 | self.binary_name = binary_name 200 | self.raw_graph_list = [] 201 | 202 | def append(self, raw_g): 203 | self.raw_graph_list.append(raw_g) 204 | 205 | def __len__(self): 206 | return len(self.raw_graph_list) 207 | 208 | 209 | class graphlets: 210 | def __init__(self, funcname): 211 | self.funcname = funcname 212 | self.graphlets_list = [] 213 | self.binary_name = None 214 | 215 | def updateBN(self, binary_name): 216 | self.binary_name = binary_name 217 | 218 | def append(self, subg): 219 | self.graphlets_list.append(subg) 220 | 221 | def appendSet(self, subgs): 222 | self.graphlets_list += subgs 223 | 224 | def __len__(self): 225 | return len(self.graphlets_list) 226 | 227 | class template_graph: 228 | def __init__(self, value=None): 229 | self.value = value 230 | self.g = None 231 | 232 | def updateG(self,g): 233 | self.g = g 234 | #def updateIndexes(self, indexes): 235 | # self.indexes = indexes 236 | 237 | #def updateAttributes(self, pg, indexes, maing): 238 | # for id_ in xrange(len(indexes)): 239 | # index = indexes[id_] 240 | # gnode = self.findNode(index, maing) 241 | # self.g.node[gnode] = pg.node[index] 242 | 243 | 244 | class template_graphs: 245 | def __init__(self, size): 246 | self.size = size 247 | self.gs = [] 248 | self.bit_len = None 249 | 250 | def enumeratingAll(self): 251 | subgs = [] 252 | binary_value = self.genBinValue() 253 | for i in xrange(binary_value): 254 | if i == 0 : 255 | continue 256 | g = self.createG(i) 257 | if g: 258 | tg = template_graph(i) 259 | tg.updateG(g) 260 | self.gs.append(tg) 261 | 262 | def genBinValue(self): 263 | n = self.size 264 | self.bit_len = n*n 265 | return 2**(self.bit_len) 266 | 267 | def createG(self, i): 268 | g = nx.DiGraph() 269 | l = self.genArray(i) 270 | #pdb.set_trace() 271 | shape = (self.size, self.size) 272 | data = np.array(l) 273 | ad_matrix = data.reshape(shape) 274 | for i in xrange(self.size): 275 | for j in xrange(self.size): 276 | if ad_matrix[i][j] == 1: 277 | g.add_edge(i, j) 278 | u_g = g.to_undirected() 279 | if len(g) == self.size and nx.is_connected(u_g): 280 | return g 281 | return False 282 | 283 | def genArray(self, i): 284 | l = [int(x) for x in bin(i)[2:]] 285 | x = [0 for v in xrange(self.bit_len - len(l))] 286 | return x + l 287 | -------------------------------------------------------------------------------- /search-engine/db.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | from search import * 3 | from nearpy import Engine 4 | from nearpy.hashes import RandomDiscretizedProjections 5 | from nearpy.filters import NearestFilter, UniqueFilter 6 | from nearpy.distances import EuclideanDistance 7 | from nearpy.distances import CosineDistance 8 | from nearpy.hashes import RandomBinaryProjections 9 | from nearpy.experiments import DistanceRatioExperiment 10 | from redis import Redis 11 | from nearpy.storage import RedisStorage 12 | from feature import * 13 | import numpy as np 14 | import os 15 | import pdb 16 | import argparse 17 | import time 18 | import numpy as np 19 | from refactoring import * 20 | import pymongo 21 | from pymongo import MongoClient 22 | 23 | def initDB(): 24 | client = MongoClient() 25 | client = MongoClient('localhost', 27017) 26 | client = MongoClient('mongodb://localhost:27017/') 27 | db = client.test_database 28 | db = client['iot-encoding'] 29 | return db 30 | 31 | db = initDB() 32 | posts = db.posts 33 | 34 | class db: 35 | 36 | def __init__(self): 37 | self.feature_list = {} 38 | self.engine = None 39 | 40 | def loadHashmap(self, feature_size, result_n): 41 | # Create redis storage adapter 42 | redis_object = Redis(host='localhost', port=6379, db=0) 43 | redis_storage = RedisStorage(redis_object) 44 | pdb.set_trace() 45 | try: 46 | # Get hash config from redis 47 | config = redis_storage.load_hash_configuration('test') 48 | # Config is existing, create hash with None parameters 49 | lshash = RandomBinaryProjections(None, None) 50 | # Apply configuration loaded from redis 51 | lshash.apply_config(config) 52 | 53 | except: 54 | # Config is not existing, create hash from scratch, with 10 projections 55 | lshash = RandomBinaryProjections('test', 0) 56 | 57 | 58 | # Create engine for feature space of 100 dimensions and use our hash. 59 | # This will set the dimension of the lshash only the first time, not when 60 | # using the configuration loaded from redis. Use redis storage to store 61 | # buckets. 62 | nearest = NearestFilter(1000) 63 | #self.engine = Engine(feature_size, lshashes=[], vector_filters=[]) 64 | pdb.set_trace() 65 | self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance()) 66 | 67 | # Do some stuff like indexing or querying with the engine... 68 | 69 | # Finally store hash configuration in redis for later use 70 | redis_storage.store_hash_configuration(lshash) 71 | 72 | def appendToDB(self, binary_name, funcname, fvector, firmware_name=""): 73 | if fvector is None: 74 | return 75 | #ftuple = tuple([fvector]) 76 | self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname))) 77 | 78 | def batch_appendDB(self, binary_name, features, firmware_name=""): 79 | for funcname in features: 80 | feature = features[funcname] 81 | #pdb.set_trace() 82 | self.appendToDB(binary_name, funcname, feature, firmware_name) 83 | 84 | def batch_appendDBbyDir(self, base_dir): 85 | cursor = posts.find({"firmware_name":"ddwrt-r21676_result"}) 86 | i = 0 87 | for v in cursor: 88 | print i 89 | i+=1 90 | binary_name = v['binary_name'] 91 | funcname = v['func_name'] 92 | firmware_name = v['firmware_name'] 93 | feature = v['fvector'] 94 | self.appendToDB(binary_name, funcname, feature, firmware_name) 95 | 96 | def batch_appendDBbyDir1(self, base_dir): 97 | image_dir = os.path.join(base_dir, "image") 98 | firmware_featrues={} 99 | bnum = 0 100 | fnum = 0 101 | i = 0 102 | pdb.set_trace() 103 | for firmware_name in os.listdir(image_dir): 104 | print firmware_name 105 | firmware_featrues[firmware_name] = {} 106 | firmware_dir = os.path.join(image_dir, firmware_name) 107 | for binary_name in os.listdir(firmware_dir): 108 | if binary_name.endswith(".features"): 109 | bnum += 1 110 | featrues_dir = os.path.join(firmware_dir, binary_name) 111 | featrues = pickle.load(open(featrues_dir, "r")) 112 | for funcname in featrues: 113 | fnum +=1 114 | #pdb.set_trace() 115 | feature = featrues[funcname] 116 | self.appendToDB(binary_name, funcname, feature, firmware_name) 117 | del featrues 118 | print("bnum ", bnum) 119 | print("fnum ", fnum) 120 | 121 | def dump(self, base_dir): 122 | db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping") 123 | pickle.dump(self.feature_list, open(db_dir, 'w')) 124 | db_dir = os.path.join(base_dir, "data/db/busybox.hashmap") 125 | pickle.dump(self.engine, open(db_dir, 'w')) 126 | 127 | def loadDB(self, base_dir): 128 | db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping") 129 | self.feature_list = pickle.load(open(db_dir, 'r')) 130 | db_dir = os.path.join(base_dir, "data/db/busybox.hashmap") 131 | self.engine = pickle.load(open(db_dir, 'r')) 132 | 133 | def findF(self, binary_name, funcname): 134 | x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]] 135 | return x[0] 136 | 137 | def retrieveFeaturesByDir(n, base_dir): 138 | firmware_featrues={} 139 | i = 0 140 | for firmware_name in os.listdir(base_dir): 141 | if firmware_name.endWith(".features"): 142 | firmware_featrues[firmware_name] = {} 143 | firmware_dir = os.path.join(base_dir, firmware_name) 144 | if i > 0: 145 | break 146 | i += 1 147 | pdb.set_trace() 148 | for binary_name in os.listdir(firmware_dir): 149 | featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features") 150 | featrues = pickle.load(open(featrues_dir, "r")) 151 | for funcname in featrues: 152 | feature = featrues[funcname] 153 | self.appendToDB(firmware_name, binary_name, funcname, feature) 154 | del featrues 155 | 156 | def retrieveFeatures(n, base_dir, filename, funcs): 157 | feature_dic = {} 158 | featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features") 159 | featrues = pickle.load(open(featrues_dir, "r")) 160 | #featuresx = retrieveFeaturesx(filename) 161 | for name in featrues: 162 | #if name in funcs: 163 | x = featrues[name] 164 | #+ featuresx[name] 165 | feature_dic[name] = np.asarray(x) 166 | return feature_dic 167 | 168 | def retrieveVuldb(base_input_dir): 169 | vul_path = os.path.join(base_input_dir, "vul") 170 | vul_db = pickle.load(open(vul_path, "r")) 171 | return vul_db 172 | 173 | 174 | def retrieveFeaturesx(filename): 175 | ida_input_dir = os.path.join("./data/", filename + ".features") 176 | featuresx = pickle.load(open(ida_input_dir, "r")) 177 | return featuresx 178 | 179 | def retrieveQueries(n, base_dir, filename1, featrues_src): 180 | queries = {} 181 | featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features") 182 | featrues = pickle.load(open(featrues_dir, "r")) 183 | #featuresx = retrieveFeaturesx(filename1) 184 | for name in featrues: 185 | #if name in featrues_src: 186 | x = featrues[name] 187 | #+ featuresx[name] 188 | queries[name] = np.asarray(x) 189 | return queries 190 | 191 | def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1): 192 | queries = {} 193 | featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features") 194 | featrues = pickle.load(open(featrues_dir, "r")) 195 | for name in featrues: 196 | #del featrues[name][5] 197 | queries[name] = np.asarray(featrues[name]) 198 | return queries 199 | 200 | def retrieveQuery(n, base_dir, filename, funcname): 201 | featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features") 202 | featrues = pickle.load(open(featrues_dir, "r")) 203 | f = [featrues[v] for v in featrues if funcname in v ][0] 204 | return np.asarray(f) 205 | 206 | def parse_command(): 207 | parser = argparse.ArgumentParser(description='Process some integers.') 208 | parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training") 209 | parser.add_argument('--output_dir', type=str, help="output dir") 210 | parser.add_argument("--filename1", type=str, help="the size of each graphlet") 211 | parser.add_argument("--filename2", type=str, help="the size of each graphlet") 212 | parser.add_argument("--size", type=int, help="the size of each graphlet") 213 | #parser.add_argument("--size", type=int, help="the size of each graphlet") 214 | args = parser.parse_args() 215 | return args 216 | 217 | def loadFuncs(path): 218 | funcs = {} 219 | x86_dir = os.path.join(path, "func_candid") 220 | #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida") 221 | fp = open(x86_dir,"r") 222 | for line in fp: 223 | items = line.split("\n") 224 | funcname = items[0] 225 | funcs[funcname] = 1 226 | return funcs 227 | 228 | def dump(path, featrues, queries): 229 | fp = open(path + "/" + "matrix", 'w') 230 | for name in featrues: 231 | row = [] 232 | row.append("x86") 233 | row.append(name) 234 | row += featrues[name] 235 | fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row)) 236 | for name in queries: 237 | row = [] 238 | row.append("mips") 239 | row.append(name) 240 | row += queries[name] 241 | fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row)) 242 | fp.close() 243 | 244 | 245 | def queryBytwo(base_input_dir, filename1, filename2, n): 246 | threthold = 50 247 | db_instance = db() 248 | funcs = loadFuncs(base_input_dir) 249 | db_instance.loadHashmap(n, 50000) 250 | #pdb.set_trace() 251 | featrues = retrieveFeatures(n, base_input_dir, filename1, funcs) 252 | queries = retrieveQueries(n, base_input_dir, filename2, funcs) 253 | #queries = refactoring(queries, featrues) 254 | vul_db = retrieveVuldb(base_input_dir) 255 | pdb.set_trace() 256 | #dump(base_input_dir, featrues, queries) 257 | #start = time.time() 258 | #db_instance.batch_appendDBbyDir(base_input_dir) 259 | #end = time.time() 260 | #total = end - start 261 | #print total 262 | db_instance.batch_appendDB(filename1, featrues) 263 | pdb.set_trace() 264 | ranks = [] 265 | times = [] 266 | for threthold in xrange(1, 210, 10): 267 | hit = [] 268 | i = 0 269 | for name in queries: 270 | #print i 271 | i += 1 272 | ''' 273 | if i == 1000: 274 | print (sum(times)/len(times)) 275 | pdb.set_trace() 276 | print "s" 277 | ''' 278 | #if name not in vul_db['openssl']: 279 | # continue 280 | if name not in featrues: 281 | continue 282 | #pdb.set_trace() 283 | query = queries[name] 284 | #start = time.time() 285 | x = db_instance.engine.neighbours(query) 286 | #end = time.time() 287 | #total = end - start 288 | #times.append(total) 289 | #print total 290 | #pdb.set_trace() 291 | try: 292 | rank = [v for v in xrange(len(x)) if name in x[v][1]][0] 293 | ranks.append((name, rank)) 294 | if rank <= threthold: 295 | hit.append(1) 296 | else: 297 | hit.append(0) 298 | except: 299 | #pdb.set_trace() 300 | hit.append(0) 301 | pass 302 | #pdb.set_trace() 303 | acc = sum(hit) * 1.0 / len(hit) 304 | print acc 305 | 306 | def queryAll(base_dir, firmware_name, filename1, n): 307 | threthold = 155 308 | db_instance = db() 309 | db_instance.loadHashmap(n, 50000) 310 | queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1) 311 | start = time.time() 312 | pdb.set_trace() 313 | db_instance.batch_appendDBbyDir(n, base_dir) 314 | end = time.time() 315 | dur = end - start 316 | print dur 317 | pdb.set_trace() 318 | hit = [] 319 | i = 0 320 | times = [] 321 | for name in queries: 322 | print i 323 | i += 1 324 | query = queries[name] 325 | start = time.clock() 326 | x = db_instance.engine.neighbours(query) 327 | end = time.clock() 328 | dur = end - start 329 | times.append(dur) 330 | #pdb.set_trace() 331 | try: 332 | rank = [v for v in xrange(len(x)) if name in x[v][1]] 333 | if len(rank) > 1: 334 | pdb.set_trace() 335 | print "stop" 336 | if rank[0] <= threthold: 337 | hit.append(1) 338 | else: 339 | hit.append(0) 340 | except: 341 | hit.append(0) 342 | 343 | acc = sum(hit) * 1.0 / len(hit) 344 | mean = np.mean(times) 345 | std = np.std(times) 346 | #pdb.set_trace() 347 | print acc 348 | 349 | if __name__ == "__main__": 350 | args = parse_command() 351 | base_dir = args.base_input_dir 352 | filename1 = args.filename1 353 | filename2 = args.filename2 354 | n = args.size 355 | pdb.set_trace() 356 | queryBytwo(base_dir, filename1, filename2, n) 357 | --------------------------------------------------------------------------------