# coding=utf-8
# Repository layout:
#   ├── Analysis_Param_Call.py
#   ├── Feature_Of_Binary.py
#   ├── LICENSE
#   ├── LogRecorder.py
#   ├── PYAPI_Featureofbinary.py
#   ├── README.md
#   ├── get_Call_instr_neighbor.py
#   ├── nbsmtp
#   ├── test.sh
#   └── 特征.png

# --------------------------------------------------------------------------------
# /Analysis_Param_Call.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python

from idc import *
from idaapi import *
import idautils


class AnayBinFil(object):
    """Recover the arguments passed at call sites that reference a given string.

    The register walking below only recognises the ``LDR``/``MOV``/``BL``
    mnemonics, i.e. it targets ARM-style disassembly.
    """

    def __init__(self):
        # The original assigned an unused local named ``list`` here; the class
        # keeps no state.
        pass

    def GetXref_String(self, ea, n):
        """Return the content of the memory that operand *n* at *ea* refers to.

        If the operand is a direct memory reference (operand type 2) its
        address is followed. Addresses outside ``.rodata`` are chased through
        their first data reference; when there is none, the dword stored at
        the address is returned. Addresses inside ``.rodata`` are read as a
        string.
        """
        if GetOpType(ea, n) == 2:
            ea = GetOperandValue(ea, n)
        if SegName(ea) != '.rodata':
            for item in idautils.DataRefsFrom(ea):
                # Only the first data reference is followed.
                return self.GetXref_String(item, n)
            return idc.Dword(ea)
        return GetString(ea)

    def get_content_register(self, ea, i):
        """Search backwards from *ea* for the value loaded into register *i*.

        Returns the resolved value, or ``None`` when the defining instruction
        cannot be interpreted or the start of the database is reached.
        """
        # Iterative on purpose: the original recursed once per instruction and
        # never stopped when PrevHead() ran out of instructions.
        while ea != BADADDR:
            writes_wanted_reg = (GetOpType(ea, 0) == 1 and
                                 GetOperandValue(ea, 0) == i)
            if not writes_wanted_reg:
                ea = PrevHead(ea)
                continue

            mnem = ua_mnem(ea)
            src_type = GetOpType(ea, 1)
            if mnem == 'LDR':
                if src_type == 2:  # memory reference
                    return self.GetXref_String(ea, 1)
                if src_type == 4:  # base + index + displacement
                    if GetOperandValue(ea, 1) != 0:
                        return None
                    # e.g. ``LDR R3, [R3]``: keep looking for the earlier load.
                    ea = PrevHead(ea)
                    continue
                print('unkown Optype: %s %s' % (hex(ea), idc.GetDisasm(ea)))
                return None
            if mnem == 'MOV':
                if src_type == 5:  # immediate
                    return GetOperandValue(ea, 1)
                if src_type == 1:  # register copy: track the source register
                    i = GetOperandValue(ea, 1)
                    ea = PrevHead(ea)
                    continue
                print('unkown OpType: %s %s' % (hex(ea), idc.GetDisasm(ea)))
                return None
            # Any other instruction writing the register is not interpreted.
            return None
        return None

    def BackForward(self, addr, n):
        """Return the resolved contents of registers ``0 .. n-1`` before *addr*."""
        return [self.get_content_register(addr, reg) for reg in range(n)]

    def Anayl_Func_Call(self, func_name, para_num):
        """Map each referencing function's name to the argument values it passes.

        *func_name* is the text pattern to locate; *para_num* is the number of
        argument registers to resolve. An empty pattern yields an empty dict.
        """
        if func_name == "":
            return {}

        # Skip matches that lie inside code segments; the first match outside
        # them is taken as the pattern string itself.
        segkind = ['.text', '.init', '.plt']
        startaddr = MinEA()
        while True:
            fun_addr = FindText(startaddr, SEARCH_DOWN, 0, 0, func_name)
            if SegName(fun_addr) not in segkind:
                break
            startaddr = NextHead(fun_addr)

        print('find pattern string addr %s' % hex(fun_addr))

        dic = {}
        for item in idautils.DataRefsTo(fun_addr):
            if not isCode(GetFlags(item)):
                continue
            # Walk forward to the next BL, then step back one instruction so
            # the backward search starts just before the call.
            call_addr = item
            while call_addr != BADADDR and ua_mnem(call_addr) != 'BL':
                call_addr = NextHead(call_addr)
            if call_addr == BADADDR:
                continue
            call_addr = PrevHead(call_addr)
            dic[GetFunctionName(call_addr)] = self.BackForward(call_addr, para_num)
        return dic


def print_help():
    """Print the expected command line."""
    info = 'use this as : idal64/idal -S"Anaylise_All.py \'print1 %s\'" '
    print(info)


def main():
    """Analyse the pattern given in ``idc.ARGV[1]`` and write ``out.dat``."""
    if len(idc.ARGV) < 2:
        print_help()
        # Built-in test pattern used when no argument is supplied.
        ana_fun_name = '%s version %s protocol version %d%s'
    else:
        ana_fun_name = idc.ARGV[1]  # pattern to analyse

    # One argument per '%' in the pattern, plus one for the pattern itself.
    para_num = ana_fun_name.count('%')

    ana = AnayBinFil()
    dic = ana.Anayl_Func_Call(ana_fun_name, para_num + 1)

    print('在函数中 其调用参数为')
    for item in dic:
        print('%s %s' % (item, dic[item]))

    with open("out.dat", 'w') as sf:
        for item in dic:
            sf.write('In function : ' + item + '\n')
            values = ''.join(str(v) + ' , ' for v in dic[item] if v is not None)
            sf.write(' ' + values + '\n')


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /Feature_Of_Binary.py:
# --------------------------------------------------------------------------------
# encoding:utf-8
# time 2017:8:21 15:37
# author ysg
# function : extract features from executable binary file

# revise in 20210129 for supporting IDAPro 7.x

import sys, os
import time  # used below for the log file name; previously only reachable via a star import

sys.path.append("/usr/local/lib/python2.7/dist-packages")
from idautils import *
from idaapi import *
from idc import *
from idautils import DecodeInstruction

import networkx as nx

OPTYPEOFFSET = 1000
IMM_MASK = 0xffffffff  # mask applied to immediate values
# user defined op type
o_string = o_imm + OPTYPEOFFSET
o_calls = OPTYPEOFFSET + 100
o_trans = OPTYPEOFFSET + 101  # Transfer instructions
o_arith = OPTYPEOFFSET + 102  # arithmetic instructions
# add the current directory to the module search path
sys.path.append(os.getcwd())
#
transfer_instructions = ['MOV', 'PUSH', 'POP', 'XCHG', 'IN', 'OUT', 'XLAT', 'LEA', 'LDS', 'LES', 'LAHF', 'SAHF',
                         'PUSHF', 'POPF']
arithmetic_instructions = ['ADD', 'SUB', 'MUL', 'DIV', 'XOR', 'INC', 'DEC', 'IMUL', 'IDIV', 'OR', 'NOT', 'SLL', 'SRL']
# is_type_arithmetic()
from LogRecorder import CLogRecoder

ymd = time.strftime("%Y-%m-%d", time.localtime())
logger = CLogRecoder(logfile='%s.log' % (ymd))
logger.addStreamHandler()
import json
import traceback
from collections import deque

logger.INFO("\n---------------------\n")
IDA700 = False
logger.INFO("IDA Version {}".format(IDA_SDK_VERSION))
if IDA_SDK_VERSION >= 700:
    # IDAPro 6.x To 7.x (https://www.hex-rays.com/products/ida/support/ida74_idapython_no_bc695_porting_guide.shtml)
    logger.INFO("Using IDA7xx API")
    IDA700 = True
    GetOpType = get_operand_type
    # Was aliased to get_operand_type, which made every "operand value" an
    # operand *type* code.
    GetOperandValue = get_operand_value
    SegName = get_segm_name
    autoWait = auto_wait
    GetFunctionName = get_func_name
    import ida_pro
    Exit = ida_pro.qexit


def _start_ea(item):
    """Return the start address of a function or basic block on either API."""
    return item.start_ea if IDA700 else item.startEA


def _end_ea(item):
    """Return the end address of a function or basic block on either API."""
    return item.end_ea if IDA700 else item.endEA


def wait_for_analysis_to_finish():
    """Block until IDA has finished auto-analysing the binary."""
    autoWait()


wait_for_analysis_to_finish()


def get_ELF_bits(filename):
    """Return 32 or 64 according to the output of ``file -b filename``.

    Falls back to 64 when ``file`` cannot be run or its output has no
    ``32-bit`` marker in the second field.
    """
    import subprocess
    try:
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through unchanged.
        proc = subprocess.Popen(['file', '-b', filename],
                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = proc.communicate()[0]
    except OSError as err:
        print('error %s' % err)
        return 64
    if proc.returncode != 0:
        print('error %s %s' % (proc.returncode, output))
        return 64
    if not isinstance(output, str):
        output = output.decode('utf-8', 'ignore')

    fields = output.strip().split(' ')
    # e.g. "ELF 32-bit LSB executable, ..." -> second field is "32-bit".
    # The original compared int("3") with 32 and therefore never returned 32.
    if len(fields) > 1 and fields[1].startswith('32'):
        return 32
    return 64


if get_ELF_bits(get_input_file_path()) == 64:
    IMM_MASK = 0xffffffffffffffff


class Attributes_BlockLevel(object):
    """Basic-block level attributes of one function (CFG, constants, counts)."""

    def __init__(self, func_t):
        self._Blocks = set()
        self._Blocks_list = []
        self._func = func_t
        self._block_boundary = {}  # block start address -> block end address
        self._addr_func = _start_ea(func_t)  # first address of function
        self._name_func = str(GetFunctionName(self._addr_func))
        self._All_Calls = []
        self._G = nx.DiGraph()
        self._pre_nodes = {}  # block start address -> predecessor start addresses
        self._CFG = {}  # key : Block startEA ; value : Block startEA of successors
        self._init_all_nodes()

        self.callee = set()  # functions called by this function
        self.caller = set()  # functions that call this function

        # compute betweenness
        self._Betweenness = nx.betweenness_centrality(self._G)

        # compute offspring: each block gets a fresh visited set
        self._offspring = {}
        self.visit = set()
        for node in self._Blocks:
            self.visit = set()
            self._offspring[node] = self.dfs(node)

        logger.INFO('offspring computed!')

    # TODO: return the functions called by this function
    def get_callees(self):
        """Return the callee set (never populated by the visible code)."""
        return self.callee

    def get_callers(self):
        """Return the start addresses of the functions that reference this one."""
        addr = _start_ea(self._func)
        for ref in CodeRefsTo(addr, 1):
            # Functions(ref, ref + 1) lists the function(s) covering ``ref``;
            # the first one is taken as the caller.
            func_list = list(Functions(ref, ref + 1))
            if func_list:
                self.caller.add(func_list[0])
        return self.caller

    def dfs(self, node_startEA):
        """Return how many not-yet-visited blocks are reachable from the node.

        ``self.visit`` is updated in place; a node already in it yields 0.
        Implemented with an explicit stack so that very large CFGs do not hit
        Python's recursion limit.
        """
        if node_startEA in self.visit:
            return 0

        self.visit.add(node_startEA)
        offspring = 0
        stack = [node_startEA]
        while stack:
            current = stack.pop()
            for succ_node in self._CFG[current]:
                if succ_node not in self.visit:
                    self.visit.add(succ_node)
                    offspring += 1
                    stack.append(succ_node)
        return offspring

    def _get_OpRands(self, ea):
        """Return the operands of the instruction at *ea*, or None if undecodable."""
        inst_t = DecodeInstruction(ea)
        return inst_t.Operands if inst_t else None

    def _update_betweenness(self, added_node, pre):
        """Add one to the counter of *added_node* and of every ancestor in *pre*."""
        self._Betweenness[added_node] += 1
        queue = deque(pre[added_node])
        while queue:
            node = queue.popleft()
            queue.extend(pre[node])
            self._Betweenness[node] += 1

    def _add_predecessors(self, cbs, preb):
        """Record *preb* as a predecessor of every block in *cbs* (no duplicates)."""
        for cb in cbs:
            preds = self._pre_nodes.setdefault(cb, [])
            if preb not in preds:
                preds.append(preb)

    def _init_all_nodes(self):
        """Build the block set, CFG, graph, predecessor map and block boundaries."""
        flowchart = FlowChart(self._func)
        for i in range(flowchart.size):
            basicblock = flowchart[i]
            start = _start_ea(basicblock)
            succs = [_start_ea(b) for b in basicblock.succs()]

            self._Blocks.add(start)
            self._G.add_node(start)
            self._CFG[start] = succs
            for succ in succs:
                self._G.add_node(succ)
                self._G.add_edge(start, succ)
            # Once per block: the original called this inside the successor
            # loop, appending the same predecessor once per successor.
            self._add_predecessors(succs, start)
            self._block_boundary[start] = _end_ea(basicblock)
        self._Blocks_list = sorted(self._Blocks)

    def get_PreNodes_of_blocks(self, startEA):
        """Return the start addresses of the block's predecessors.

        An unknown block yields an empty list (the original returned None,
        which callers then tried to iterate).
        """
        if startEA not in self._Blocks:
            return []
        return self._pre_nodes.get(startEA, [])

    def get_All_Strings_of_Block(self, block_startEA):
        """Return all strings referenced by the instructions of one block."""
        return self.get_OpValue_Block(block_startEA, my_op_type=o_string)

    def get_reference(self, ea, n):
        """Describe operand *n* of the instruction at *ea* and return its value.

        For a memory operand (type 2) the string at the dword stored at the
        operand address is returned; otherwise the raw operand value.
        """
        op_type = GetOpType(ea, n)
        if op_type == -1:
            return
        if op_type == 1:
            print('General Register')
        if op_type == 2:
            addr = GetOperandValue(ea, n)
            print('addr : %s' % hex(Dword(addr)))
            print(' reference')
            print('segment type : %s' % GetSegmentAttr(addr, SEGATTR_TYPE))
            return GetString(Dword(addr))
        elif op_type == 3:
            print('base + index')
        elif op_type == 4:
            print('B+i+Displacement')
        elif op_type == 5:
            print('immediate')
        elif op_type == 6:
            print('far address')
        return GetOperandValue(ea, n)

    def get_AdjacencyMatrix(self):
        """Return the CFG as a 0/1 matrix ordered like ``_Blocks_list``."""
        return [[1 if dst in self._CFG[src] else 0 for dst in self._Blocks_list]
                for src in self._Blocks_list]

    # offspring means children nodes in CFG
    def get_Offspring_of_Block(self, startEA):
        """Return the offspring count of a block, or None for an unknown block."""
        return self._offspring.get(startEA)

    def get_next_instruction_addr(self, ea):
        """Return the next code address after *ea* inside the same basic block.

        Returns None when *ea* is in no known block or is the block's last
        instruction. (The original called ``next()`` on an integer and always
        raised.)
        """
        for start, end in self._block_boundary.items():
            if start <= ea < end:
                it_code = func_item_iterator_t(self._func, ea)
                if it_code.next_code() and it_code.current() < end:
                    return it_code.current()
                return None
        return None

    def _iter_block_addresses(self, startEA):
        """Yield the code addresses of the block starting at *startEA*."""
        if startEA not in self._block_boundary:
            return
        endEA = self._block_boundary[startEA]
        it_code = func_item_iterator_t(self._func, startEA)
        ea = it_code.current()
        while ea < endEA:
            yield ea
            # stop when the function has no further code item
            if not it_code.next_code():
                break
            ea = it_code.current()

    def get_reference_data_one_block(self, startEA):
        """Yield the disassembly text of every instruction in one block."""
        for ea in self._iter_block_addresses(startEA):
            yield self.get_instruction(ea)

    def get_instruction(self, ea):
        """Return the disassembly text of the instruction at *ea*."""
        return idc.GetDisasm(ea)

    def get_Trans_of_block(self, ea):
        """Return the number of transfer instructions in the block at *ea*."""
        return len(self.get_OpValue_Block(ea, o_trans))

    def get_All_instr_in_one_block(self, startEA):
        """Return the disassembly of every instruction in one block as a list."""
        return list(self.get_reference_data_one_block(startEA))

    def getFuncName(self):
        """Return the function's name."""
        return self._name_func

    def FrameSize(self):
        """Return the full size of the function frame."""
        return GetFrameSize(_start_ea(self._func))

    def getHexAddr(self, addr):
        """Return *addr* formatted with ``hex``."""
        return hex(addr)

    def FrameArgsSize(self):
        """Return the size of the frame arguments that are purged upon return."""
        return GetFrameArgsSize(_start_ea(self._func))

    def FrameRegsSize(self):
        """Return the value of ``GetFrameRegsSize`` for this function."""
        return GetFrameRegsSize(_start_ea(self._func))

    def get_OpValue_Block(self, startEA, my_op_type):
        """Collect ``get_OpValue`` results over all instructions of one block.

        An unknown block yields an empty list, so the ``len(...)`` wrappers
        below never receive None.
        """
        OPs = []
        for ea in self._iter_block_addresses(startEA):
            OPs += self.get_OpValue(ea, my_op_type)
        return OPs

    def get_Arithmetics_Of_Block(self, ea):
        """Return the number of arithmetic instructions in the block at *ea*."""
        return len(self.get_OpValue_Block(ea, o_arith))

    def get_Calls_BLock(self, ea):
        """Return the number of CALL instructions in the block at *ea*."""
        return len(self.get_OpValue_Block(ea, o_calls))

    # this is an abstract interface
    # it can replace functions like get_Numeric_Constant
    def get_OpValue(self, ea, my_op_type=o_void):
        """Return the values of kind *my_op_type* found in the instruction at *ea*.

        * ``o_trans`` / ``o_arith``: ``[mnemonic]`` if the mnemonic is in the
          corresponding table, else ``[]``.
        * ``o_calls``: ``[last token of the disassembly]`` for a ``CALL``.
        * ``o_imm``: hex strings of the non-zero immediates.
        * ``o_string``: strings referenced by the immediates.
        """
        OV = []

        # instruction level features
        if my_op_type in (o_trans, o_arith):
            table = transfer_instructions if my_op_type == o_trans else arithmetic_instructions
            inst = GetDisasm(ea).split(' ')[0].upper()
            if inst in table:
                OV.append(inst)
            return OV

        op = 0
        op_type = GetOpType(ea, op)

        if my_op_type == o_calls:
            # As in the original, an instruction without operands is ignored.
            if op_type != o_void and GetDisasm(ea).split(' ')[0].upper() == "CALL":
                OV.append(GetDisasm(ea).split(' ')[-1])
            return OV

        wanted = my_op_type % OPTYPEOFFSET
        # -1 is treated as "no operand" too, so an undecodable instruction
        # cannot keep this loop spinning. NOTE(review): assumes the operand-type
        # call reports errors as -1 — confirm against the idc reference.
        while op_type != o_void and op_type != -1:
            if op_type == wanted:
                # Mask to the pointer width detected at load time rather than
                # a fixed 32 bits.
                ov = GetOperandValue(ea, op) & IMM_MASK
                if my_op_type == o_imm:
                    logger.INFO(hex(ea) + ' imm : ' + hex(ov))
                    if ov != 0:
                        OV.append(hex(ov))
                elif my_op_type == o_string:
                    if SegName(ov) != '.rodata':
                        addrx = list(DataRefsFrom(ov))
                        if len(addrx) == 0:
                            op += 1
                            op_type = GetOpType(ea, op)
                            continue
                        ov = addrx[0]
                    OV.append(GetString(ov))

            op += 1
            op_type = GetOpType(ea, op)
        return OV

    def get_Numeric_Constants_One_block(self, startEA):
        """Return the immediate values used in one block."""
        return self.get_OpValue_Block(startEA, my_op_type=o_imm)

    def get_Betweenness_of_Block(self, startEA):
        """Return the betweenness centrality of a block (0 if unknown)."""
        return self._Betweenness.get(startEA, 0)

    def get_CFG(self):
        """Return the mapping block start -> list of successor starts."""
        return self._CFG

    def get_All_Nodes_StartAddr(self):
        """Return the sorted list of basic-block start addresses."""
        return self._Blocks_list

    def get_Block_Endaddr(self, startEA):
        """Return the end address of a block, or -1 for an unknown block."""
        return self._block_boundary.get(startEA, -1)


def print_help():
    """Tell the user that arguments are missing."""
    print('args not enough')


def get_att_block(blockEA, Attribute_Block):
    """Return the attribute dictionary of one basic block.

    Keys: startEA, String_Constant, Numberic_Constant, No_Tran, No_Call,
    No_Instru, No_Arith, No_offspring, Betweenness, pre.
    """
    AB = Attribute_Block
    dic = {}
    dic['startEA'] = blockEA
    dic['String_Constant'] = AB.get_All_Strings_of_Block(blockEA)
    dic['Numberic_Constant'] = AB.get_Numeric_Constants_One_block(blockEA)
    dic['No_Tran'] = AB.get_Trans_of_block(blockEA)
    dic['No_Call'] = AB.get_Calls_BLock(blockEA)
    dic['No_Instru'] = len(AB.get_All_instr_in_one_block(blockEA))
    dic['No_Arith'] = AB.get_Arithmetics_Of_Block(blockEA)
    dic['No_offspring'] = AB.get_Offspring_of_Block(blockEA)
    dic['Betweenness'] = round(AB.get_Betweenness_of_Block(blockEA), 3)
    dic['pre'] = [hex(ea) for ea in AB.get_PreNodes_of_blocks(blockEA)]
    return dic


def save_Json(filename, func_name):
    """Append one JSON line per selected function to *filename*.

    *func_name* restricts the output to that function; ``''`` selects all.
    The file is opened once, so it exists (possibly empty) even when no
    function matches.
    """
    with open(filename, 'a') as f:
        for i in range(0, get_func_qty()):
            fun = getn_func(i)
            start = _start_ea(fun)
            segname = get_segm_name(start)
            # Only segments whose name has "OA", "OM" or "te" at positions 1-2
            # are processed (e.g. ".text").
            if not segname or segname[1:3] not in ["OA", "OM", "te"]:
                continue
            if func_name != '' and GetFunctionName(start) != func_name:
                continue

            AB = Attributes_BlockLevel(fun)
            logger.INFO(AB.getFuncName())
            CFG = AB.get_CFG()
            dic = {}
            dic['fun_name'] = AB.getFuncName()
            dic['adjacentmat'] = AB.get_AdjacencyMatrix()
            for ea in CFG:
                block = get_att_block(ea, AB)
                block['succ'] = [hex(succ_ea) for succ_ea in CFG[ea]]
                dic[hex(ea)] = block

            json.dump(dic, f, ensure_ascii=False)
            f.write('\n')


def main():
    """Entry point: ``idc.ARGV[1]`` is the output file, ``ARGV[2]`` an optional function name."""
    if len(idc.ARGV) < 2:
        print_help()
        return
    filename = idc.ARGV[1]
    # '' means: extract the features of every function
    func_name = idc.ARGV[2] if len(idc.ARGV) >= 3 else ''
    save_Json(filename, func_name)


# test for callers
# 2018-12-25 10:19:07: test passed
def test_caller():
    """Print every function together with the functions that reference it."""
    for func in Functions():
        AB = Attributes_BlockLevel(func_t(func))
        func_callers = AB.get_callers()
        print("function {} {}".format(hex(func), Name(func)))
        for caller_ea in func_callers:
            print("callers {} {}".format(hex(caller_ea), Name(caller_ea)))


if __name__ == '__main__':
    # test_caller()
    try:
        main()
    except Exception:
        logger.INFO(traceback.format_exc())
    # NOTE(review): the flattened source does not show whether Exit(0) sat
    # inside the except clause; it is run unconditionally here so a batch IDA
    # session always terminates — confirm.
    Exit(0)
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 shouguoyang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
# --------------------------------------------------------------------------------
# /LogRecorder.py:
# --------------------------------------------------------------------------------

import logging


class CLogRecoder(object):
    """Small fluent helper around the root logger.

    The constructor configures the root logger through
    ``logging.basicConfig``; ``addStreamHandler`` and ``INFO`` return
    ``self`` so calls can be chained.
    """

    def __init__(self, logfile='log.log', format='%(asctime)s : %(message)s', level=logging.DEBUG):
        """Configure root logging.

        Args:
            logfile: passed to ``logging.basicConfig`` as ``filename``.
            format: record format string; also reused for the console handler.
            level: level passed to ``logging.basicConfig``.
        """
        logging.basicConfig(filename=logfile, level=level, format=format)
        self._ft = format
        # Console handler attached by addStreamHandler(); None until then.
        self._console = None

    def addStreamHandler(self):
        """Attach an INFO-level ``StreamHandler`` to the root logger.

        The handler is attached at most once per instance; calling this
        repeatedly used to add a new handler each time, which duplicated
        every console line.

        Returns:
            self, to allow chaining.
        """
        if self._console is None:
            console = logging.StreamHandler()
            console.setLevel(logging.INFO)
            console.setFormatter(logging.Formatter(self._ft))
            logging.getLogger('').addHandler(console)
            self._console = console
        return self

    def INFO(self, message):
        """Log *message* at INFO level on the root logger and return self."""
        logging.info(message)
        return self


if __name__ == '__main__':
    lr = CLogRecoder().addStreamHandler()
    lr.INFO("test")


# --------------------------------------------------------------------------------
# /PYAPI_Featureofbinary.py:
# --------------------------------------------------------------------------------
#encoding=utf-8
# No need to learn IDAPython: call the interfaces of the class below
# directly to obtain the features of functions.

# Path of the IDA executable on this system.
idapath = '/home/ubuntu/disk/hdd_1/ysg/tool/idapro-7.5/idat64'
import argparse
import json
import os
import subprocess
import sys
import time

parse = argparse.ArgumentParser()
pro_path = sys.path[0]


class getFeature(object):
    """Run Feature_Of_Binary.py inside IDA and read back the JSON it writes."""

    def __init__(self, binarypath):
        """Remember the input file and pick a time-stamped temp JSON path.

        Args:
            binarypath: path handed to IDA as the file to open.
        """
        self._bin = binarypath
        self._tmpfile = pro_path + os.sep + binarypath.split('/')[-1] + str(time.time()) + '.json'

    # read json file to get features
    def _ReadFeatures(self):
        """Yield one decoded JSON object per non-blank line of the temp file.

        Bytes that are not ASCII are dropped before parsing, as the
        original ``unicode(line, errors='ignore')`` did on Python 2.
        """
        with open(self._tmpfile, 'rb') as f:
            for line in f:
                if not line.strip():
                    # A blank line would make json.loads raise.
                    continue
                yield json.loads(line.decode('ascii', 'ignore'))

    def _del_tmpfile(self):
        """Remove the temp JSON file if it exists."""
        if os.path.exists(self._tmpfile):
            os.remove(self._tmpfile)

    def get_Feature_all(self):
        """Return the features of every function (empty function name)."""
        return self.get_Feature_Function('')

    def get_Feature_Function(self, func_name):
        """Run IDA on the input file and return the extracted features.

        Args:
            func_name: function name forwarded to Feature_Of_Binary.py;
                an empty string is what ``get_Feature_all`` passes.

        Returns:
            A list with one decoded JSON object per line of the temp file,
            or ``None`` when IDA could not be started or exited non-zero
            (the failure is printed).
        """
        script_args = '%s/Feature_Of_Binary.py %s %s' % (pro_path, self._tmpfile, func_name)
        # Shell-style rendering of the command, kept only for the error message.
        cmd = "TVHEADLESS=1 %s -A -S'%s' %s" % (idapath, script_args, self._bin)
        # Same argument vector the shell used to build, but passed as a list
        # so paths or names containing spaces/quotes cannot break or inject
        # into the command line.
        argv = [idapath, '-A', '-S' + script_args, self._bin]
        env = dict(os.environ, TVHEADLESS='1')

        try:
            try:
                proc = subprocess.Popen(argv, env=env,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
                o = proc.communicate()[0]
                s = proc.returncode
            except OSError as err:
                # IDA executable missing or not runnable.
                s, o = 127, str(err)

            if s != 0:
                if not isinstance(o, str):
                    o = o.decode('utf-8', 'replace')
                print('error occurs when extract Features from ida database file')
                print('cmd is %s' % cmd)
                print('%s %s' % (s, o))
                return None

            return list(self._ReadFeatures())
        finally:
            # Never leave the temp file behind, whatever happened above.
            self._del_tmpfile()


def _function_to_acfg(dic):
    """Convert one per-function feature dict into feature list + adjacency matrix.

    Keys of *dic* that start with ``'0x'`` are treated as basic-block nodes;
    each node's ``'pre'`` entries become edges ``predecessor -> node``.
    """
    nodes_ordered_list = [addr for addr in dic if str(addr).startswith('0x')]
    size = len(nodes_ordered_list)
    feature_list = []  # the feature list for BBs
    adjacent_matrix = [[0] * size for _ in range(size)]  # adjacent matrix for CFG
    for i, node in enumerate(nodes_ordered_list):
        block = dic[node]
        feature_list.append([
            len(block["String_Constant"]),
            len(block["Numberic_Constant"]),
            block["No_Tran"],
            block["No_Call"],
            block["No_Instru"],
            block["No_Arith"],
            block["No_offspring"],
        ])
        for presuccessor in block['pre']:
            p_i = nodes_ordered_list.index(presuccessor)
            adjacent_matrix[p_i][i] = 1
    return {"func_name": dic['fun_name'],
            'feature_list': feature_list,
            'adjacent_matrix': adjacent_matrix}


def test(args):
    """Command-line driver: extract features and print or dump them.

    Args:
        args: parsed arguments providing ``binaryfile``, ``f`` (function
            name or None) and ``o`` (output filename or None).

    With ``-o`` the per-function features are written to that file as
    JSON; otherwise each feature object is printed. When extraction
    fails (already reported by ``get_Feature_Function``) nothing is
    written and the function simply returns.
    """
    binary_path = args.binaryfile
    func_name = args.f if args.f else ''
    out_file = args.o if args.o else ''

    gf = getFeature(binary_path)
    feature = gf.get_Feature_Function(func_name)
    if feature is None:
        # Iterating None below used to raise TypeError.
        return

    if len(out_file) > 0:
        func_dics = [_function_to_acfg(dic) for dic in feature]
        with open(out_file, 'w') as f:
            json.dump(func_dics, f, indent=4)
    else:
        for x in feature:
            print(x)


# Not a unit test: keep pytest/nose from collecting this driver by its name.
test.__test__ = False


if __name__ == '__main__':
    parse.add_argument('binaryfile', help='file to be analysed')
if __name__ == '__main__':
    # Remaining command-line setup of PYAPI_Featureofbinary.py (the guard
    # is re-opened here so these statements stay inside it).
    parse.add_argument('-f', help='function name to be handled ')
    # NOTE(review): '-b' is parsed, but test() never reads args.b in the visible code.
    parse.add_argument('-b', help='file to be analysed is binary file , default is ida database file', action = 'store_true' , default=False)
    parse.add_argument('-o', help='output filename')
    args = parse.parse_args()
    test(args)

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# Extract features for the paper 'Scalable Graph-based Bug Search for Firmware Images'
#
# # Before use !!!
# You need to change the variable idapath in file PYAPI_Featureofbinary.py (line 5)
# to the IDA text interface on your system,
# like : `idapath = 'path/to/idal64'`
#
# # Usage
# ` python PYAPI_Featureofbinary.py -h ` to see the help of the script
#
# or
#
# ` idat64 -S "Feature_Of_Binary.py saved_file [func_name]" nbsmtp `
# --------------------------------------------------------------------------------
# /get_Call_instr_neighbor.py:
# --------------------------------------------------------------------------------
#get the instructions nearby call


from idautils import *
from idc import *
from idaapi import *


#get all functions of their address
# return generator
def get_funcs():
    """Return whatever ``Functions()`` yields (function start addresses)."""
    return Functions()


#get the address of the instructions which call the functions
def get_call_instr(addr):
    """Return the code references to *addr* that are call instructions."""
    # A distinct loop name: the original reused ``addr`` and shadowed the parameter.
    return [ref for ref in CodeRefsTo(addr, 0) if is_call_insn(ref)]


#get the disasm from start of basic block to given inst_ea
def get_bb_to_ea(inst_ea):
    """Collect disassembly walking backwards from *inst_ea*.

    The instruction at *inst_ea* is always recorded. The walk then moves
    to previous heads while the current address has no incoming code
    reference, has at least one outgoing cross reference and lies above
    the function start; it also stops once the previous head is a call.

    Returns:
        Disassembly lines in address order, or ``[]`` (after printing a
        message) when *inst_ea* is not inside a function.
    """
    func = get_func(inst_ea)
    if not func:
        print('get wrong address  %s' % hex(inst_ea))
        return []

    #jmp over the call instruction
    inst = [GetDisasm(inst_ea)]
    ea = PrevHead(inst_ea)

    while (not is_xref_to(ea)
           and any(True for _ in XrefsFrom(ea))
           and ea > func.startEA):
        inst.append(GetDisasm(ea))
        ea = PrevHead(ea)
        if is_call_insn(ea):
            break

    return list(reversed(inst))


#get the disasm from start of basic block to end of the block
def get_disasm_block(ea):
    """Collect disassembly walking forwards from *ea*.

    Lines are recorded while the current address has no outgoing code
    reference (``CodeRefsFrom(ea, 0)``) and lies below the function end;
    the walk also stops once the next head is a call.

    Returns:
        Disassembly lines in address order, or ``[]`` (after printing a
        message) when *ea* is not inside a function.
    """
    func = get_func(ea)
    if not func:
        print('get wrong address  %s' % hex(ea))
        return []

    inst = []
    while not any(True for _ in CodeRefsFrom(ea, 0)) and ea < func.endEA:
        inst.append(GetDisasm(ea))
        ea = NextHead(ea)
        if is_call_insn(ea):
            break

    return inst


def get_Function_name(ea):
    """Return ``GetFunctionName(ea)``."""
    return GetFunctionName(ea)


# if the instruction in ea is refered
def is_xref_to(ea):
    """Return True when at least one code reference targets *ea*."""
    return any(True for _ in CodeRefsTo(ea, 0))


def main():
    """Write caller/callee instruction context for every direct call.

    Output goes to ``<input file name>_directcall.txt``. A pair is
    written only when both the caller side and the callee side have at
    least three disassembly lines.
    """
    filename = GetInputFile()# file name of binary
    with open(filename+'_directcall.txt','w') as f:
        for func_addr in get_funcs():
            callee_function_name = get_Function_name(func_addr)
            callee_inst = get_disasm_block(func_addr)
            for caller_ea in get_call_instr(func_addr):
                caller_inst = get_bb_to_ea(caller_ea)
                caller_function_name = get_Function_name(caller_ea)
                # only keep pairs with enough context on both sides
                if len(callee_inst) >= 3 and len(caller_inst) >= 3:
                    f.write("\n caller : {} -> callee : {} \n".format(caller_function_name, callee_function_name))
                    for all_inst in caller_inst + callee_inst:
                        f.write(all_inst+'\n')


# Runs on load: the script has no __main__ guard in the original either.
main()

# --------------------------------------------------------------------------------
# /nbsmtp:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/yangshouguo/Graph-based_Bug_Search/41f1e14f49e9f107508eb98c89d2333fb14630e4/nbsmtp
# --------------------------------------------------------------------------------
# /test.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/zsh 2 | idat64 -S"Feature_Of_Binary.py tmpoutfile deregister_tm_clones" nbsmtp.i64 3 | -------------------------------------------------------------------------------- /特征.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangshouguo/Graph-based_Bug_Search/41f1e14f49e9f107508eb98c89d2333fb14630e4/特征.png --------------------------------------------------------------------------------