├── README.md
├── raw-feature-extractor
    ├── cfg_constructor.py
    ├── discovRe.py
    ├── func.py
    ├── graph_analysis_ida.py
    ├── graph_property.py
    ├── preprocessing_ida.py
    └── raw_graphs.py
└── search-engine
    └── db.py


/README.md:
--------------------------------------------------------------------------------
 1 | This project provides two components of Genius, a graph-based bug search framework. The first component is the raw feature extraction. The second is the online bug search engine.
 2 | 
 3 | 1. The raw feature extraction is designed to achieve the following two goals:
 4 | 
 5 | 	-> Extract the control flow graph for each binary function
 6 | 	
 7 | 	-> Extract the attributes for each node in the graph
 8 | 	
 9 | 	The feature extraction is built on top of IDA Pro. The scripts are written with IDAPython and extract the attributed control flow graph (ACFG). ``preprocessing_ida.py'' is the main program to extract the ACFG.
10 | 	
11 | 2. The online bug search engine is used for real-time search:
12 | 
13 | 	-> It uses locality-sensitive hashing for indexing
14 | 	
15 | 	-> Nearest-neighbor search algorithm for search
16 | 	
17 | 	The online search is based on nearpy (https://github.com/pixelogik/NearPy). 
18 | 
19 | 


--------------------------------------------------------------------------------
/raw-feature-extractor/cfg_constructor.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import networkx as nx
  3 | from idautils import *
  4 | from idaapi import *
  5 | from idc import *
  6 | 
  7 | import copy
  8 | import networkx as nx
  9 | from idautils import *
 10 | from idaapi import *
 11 | from idc import *
 12 | from graph_analysis_ida import *
 13 | 
 14 | 
 15 | def getCfg(func, externs_eas, ea_externs):
 16 | 	func_start = func.startEA
 17 | 	func_end = func.endEA
 18 | 	cfg = nx.DiGraph()
 19 | 	control_blocks, main_blocks = obtain_block_sequence(func)
 20 | 	i = 0
 21 | 	visited = {}
 22 | 	start_node = None
 23 | 	for bl in control_blocks:
 24 | 		start = control_blocks[bl][0]
 25 | 		end = control_blocks[bl][1]
 26 | 		src_node = (start, end)
 27 | 		if src_node not in visited:
 28 | 			src_id = len(cfg)
 29 | 			visited[src_node] = src_id
 30 | 			cfg.add_node(src_id)
 31 | 			cfg.node[src_id]['label'] = src_node
 32 | 		else:
 33 | 			src_id = visited[src_node]
 34 | 
 35 | 		#if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
 36 | 		if start == func_start:
 37 | 			cfg.node[src_id]['c'] = "start"
 38 | 			start_node = src_node
 39 | 		if end == func_end:
 40 | 			cfg.node[src_id]['c'] = "end"
 41 | 		#print control_ea, 1
 42 | 		refs = CodeRefsTo(start, 0)
 43 | 		for ref in refs:
 44 | 			if ref in control_blocks:
 45 | 				dst_node = control_blocks[ref]
 46 | 				if dst_node not in visited:
 47 | 					visited[dst_node] = len(cfg)
 48 | 				dst_id = visited[dst_node]
 49 | 				cfg.add_edge(dst_id, src_id)
 50 | 				cfg.node[dst_id]['label'] = dst_node
 51 | 		#print control_ea, 1
 52 | 		refs = CodeRefsTo(start, 1)
 53 | 		for ref in refs:
 54 | 			if ref in control_blocks:
 55 | 				dst_node = control_blocks[ref]
 56 | 				if dst_node not in visited:
 57 | 					visited[dst_node] = len(cfg)
 58 | 				dst_id = visited[dst_node]
 59 | 				cfg.add_edge(dst_id, src_id)
 60 | 				cfg.node[dst_id]['label'] = dst_node
 61 | 	#print "attributing"
 62 | 	attributingRe(cfg, externs_eas, ea_externs)
 63 | 	# removing deadnodes
 64 | 	#old_cfg = copy.deepcopy(cfg)
 65 | 	#transform(cfg)
 66 | 	return cfg, 0
 67 | 
 68 | def transform(cfg):
 69 | 	merging(cfg)
 70 | 	filtering(cfg)
 71 | 
 72 | def merging(cfg):
 73 | 	bb_ids = cfg.nodes()
 74 | 	for bb_id in bb_ids:
 75 | 		try:
 76 | 			bb = cfg.node[bb_id]['label']
 77 | 			bb_start = bb[0]
 78 | 			bb_end = bb[1]
 79 | 			succs = cfg.successors(bb_id)
 80 | 			#preds = cfg.predecessors(bb_id)
 81 | 			if len(succs) == 1:
 82 | 				preds = cfg.predecessors(succs[0])
 83 | 				if len(preds) == 1:
 84 | 					domerge(cfg, bb_id, succs[0])
 85 | 		except:
 86 | 			pass
 87 | 
 88 | def domerge(cfg, bb_id, suc_node):
 89 | 	suc_nodes = cfg.successors(suc_node)
 90 | 	for node in suc_nodes:
 91 | 		cfg.add_edge(bb_id, node)
 92 | 	cfg.remove_node(suc_node)
 93 | 
 94 | 
 95 | def filtering(cfg):
 96 | 	rm_sets = []
 97 | 	for bb_id in cfg:
 98 | 		bb = cfg.node[bb_id]['label']
 99 | 		bb_start = bb[0]
100 | 		bb_end = bb[1]
101 | 		re = remove(bb_start, bb_end)
102 | 		print bb_id, re, bb_start, bb_end
103 | 		if re:
104 | 			print re, bb_id
105 | 			rm_sets.append(bb_id)
106 | 	print rm_sets
107 | 	for bb_id in rm_sets:
108 | 		cfg.remove_node(bb_id)
109 | 
def remove(bb_start, bb_end):
	"""Return True when the sequence of the block matches ``matchseq``."""
	return True if matchseq(getSequences(bb_start, bb_end)) else False
115 | 
def matchseq(seqs):
	"""Tell whether ``seqs`` matches one of the hard-coded block patterns.

	``seqs`` is an iterable of hashable items (mnemonic strings or
	``(mnemonic, operand)`` tuples).  The result is True when the set of
	items is contained in one of the "subset" patterns or is exactly equal
	to one of the "exact" patterns; an empty ``seqs`` therefore matches.
	"""
	observed = set(seqs)
	# A block matches when it uses nothing outside one of these sets.
	subset_patterns = (
		set(['lw', "jr", "addiu"]),
		set(['add', 'pop', 'retn']),
	)
	# A block matches when its item set equals one of these sets.
	exact_patterns = (
		set(['b', ('move', '$v0')]),
		set(['b', ('mov', '$eax')]),
		set([('move', '$v0')]),
		set([('mov', '$eax')]),
	)
	if any(observed.issubset(pattern) for pattern in subset_patterns):
		return True
	return any(observed == pattern for pattern in exact_patterns)
138 | 
def attributingRe(cfg, externs_eas, ea_externs):
	"""Attach the per-block attributes to every node of ``cfg``.

	The block of a node is read from its ``label`` attribute.  The keys
	written are ``numIns``, ``numCalls``, ``numLIs``, ``numAs``, ``numNc``,
	``consts``, ``strings``, ``externs`` and ``numTIs``.  ``externs_eas`` is
	accepted but not used here.
	"""
	for node_id in cfg:
		attrs = cfg.node[node_id]
		block = attrs['label']
		attrs['numIns'] = calInsts(block)
		attrs['numCalls'] = calCalls(block)
		attrs['numLIs'] = calLogicInstructions(block)
		attrs['numAs'] = calArithmeticIns(block)
		strings, consts = getBBconsts(block)
		attrs['numNc'] = len(strings) + len(consts)
		attrs['consts'] = consts
		attrs['strings'] = strings
		attrs['externs'] = retrieveExterns(block, ea_externs)
		attrs['numTIs'] = calTransferIns(block)
158 | 
159 | 
160 | def attributing(cfg):
161 | 	ga = graph_analysis()
162 | 	ga.gwithoffspring(cfg)
163 | 	print "finishing offspring"
164 | 	for node in cfg:
165 | 		stmt_num = getStmtNum(node)
166 | 		binary_value = getBinaryValue(node)
167 | 		cfg.node[node]['stmt_num'] = stmt_num
168 | 		cfg.node[node]['binary_value'] = binary_value
169 | 	ga.domChecking(cfg)
170 | 	print "finishing domChecking"
171 | 	ga.loopChecking(cfg)
172 | 	print "finishing loopChecking"
173 | 
174 | 
def getStmtNum(node):
	"""Count the ``NextHead`` steps from ``node[0]`` until ``node[1]`` is reached."""
	cursor, limit = node[0], node[1]
	count = 0
	while cursor < limit:
		cursor = NextHead(cursor)
		count += 1
	return count
184 | 
185 | def getBinaryValue(node):
186 | 	start = node[0]
187 | 	inst_addr = NextHead(start)
188 | 	value = 0
189 | 	addr = 0
190 | 	for x in xrange((inst_addr - start)-1):
191 | 		addr = start + x
192 | 		y = GetOriginalByte(addr)
193 | 		print value, addr, y
194 | 		value = value | y
195 | 		value = value << 8
196 | 		print value
197 | 
198 | 	addr = inst_addr - 1
199 | 	y = GetOriginalByte(addr)
200 | 	print value, addr, y
201 | 	value = value | y
202 | 	print node
203 | 	print bin(value)
204 | 	return value
205 | 
206 | 
def cfg_construct(func):
	"""Build a control-flow graph whose nodes are ``(start, end)`` tuples.

	A fall-through edge is added when the end address of a block is a key of
	the block table and the last instruction of the block is not ``jmp``;
	further edges follow the code references leaving the last instruction.
	The block starting at ``func.startEA`` is tagged ``c='start'`` and a block
	ending at ``func.endEA`` is tagged ``c='end'``.

	Returns ``(cfg, start_node)``; ``start_node`` is ``None`` when no block
	starts at ``func.startEA``.
	"""
	func_start = func.startEA
	func_end = func.endEA
	cfg = nx.DiGraph()
	seq_blocks, main_blocks = obtain_block_sequence(func)
	# Bound up front so the return below can never hit an unbound local.
	start_node = None
	# NOTE(review): seq_blocks is keyed by the addresses chosen in
	# obtain_block_sequence(), not by block start addresses -- confirm that
	# the membership tests below match what is intended.
	for bl in seq_blocks:
		start = seq_blocks[bl][0]
		end = seq_blocks[bl][1]
		src_node = (start, end)
		if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
			next_node = (seq_blocks[end][0], seq_blocks[end][1])
			cfg.add_edge(src_node, next_node)
		if start == func_start:
			cfg.add_node(src_node, c='start')
			start_node = src_node
		if end == func_end:
			cfg.add_node(src_node, c='end')

		for ref in CodeRefsFrom(PrevHead(end), 0):
			if ref in seq_blocks:
				dst_node = (seq_blocks[ref][0], seq_blocks[ref][1])
				cfg.add_edge(src_node, dst_node)
	return cfg, start_node
236 | 
237 | 
def obtain_allpaths(cfg, node, path, allpaths):
	"""Collect every cycle-free path from ``node`` to a node tagged ``c='end'``.

	``node`` is appended to ``path`` (which is modified in place).  When
	``node`` is an end node, ``path`` is appended to ``allpaths``; otherwise
	the search continues, on a copy of ``path``, into every successor that
	is not already on the path.
	"""
	path.append(node)
	if cfg.node[node].get('c') == 'end':
		allpaths.append(path)
		return
	for nxt in cfg.successors(node):
		if nxt in path:
			# already visited on this path: skip to avoid looping
			continue
		obtain_allpaths(cfg, nxt, copy.copy(path), allpaths)
248 | 
249 | 
def obtain_block_sequence(func):
	"""Index the basic blocks of ``func``.

	Returns ``(control_blocks, main_starts)``:

	* ``control_blocks`` maps, for every block, both the address returned by
	  ``checkCB`` and the address of the block's last item to the block's
	  ``(startEA, endEA)`` tuple;
	* ``main_starts`` is the sorted list of start addresses of the blocks
	  lying within ``[func.startEA, func.endEA]`` (empty when the flow chart
	  has no blocks).
	"""
	control_blocks = {}
	main_blocks = {}
	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
	for bl in blocks:
		base = bl[0]
		control_blocks[checkCB(bl)] = bl
		control_blocks[PrevHead(bl[1])] = bl
		if func.startEA <= base <= func.endEA:
			main_blocks[base] = bl
	# Sorted once, after the loop: this also keeps the result defined when
	# the function has no blocks at all.
	return control_blocks, sorted(main_blocks)
264 | 
def checkCB(bl):
	"""Return the first address in ``bl`` accepted by ``checkCondition``.

	``bl`` is a ``(start, end)`` pair.  When no address in ``[start, end)``
	qualifies, ``PrevHead(end)`` is returned instead.
	"""
	cursor, limit = bl[0], bl[1]
	while cursor < limit:
		if checkCondition(cursor):
			return cursor
		cursor = NextHead(cursor)
	return PrevHead(limit)
275 | 
def checkCondition(ea):
	"""Return True when the mnemonic at ``ea`` is a listed MIPS or x86 branch."""
	mips_branch = ("beqz", "beq", "bne", "bgez", "b", "bnez", "bgtz", "bltz", "blez", "bgt", "bge", "blt", "ble", "bgtu", "bgeu", "bltu", "bleu")
	x86_branch = ("jz", "jnb", "jne", "je", "jg", "jle", "jl", "jge", "ja", "jae", "jb", "jbe", "jo", "jno", "js", "jns")
	# NOTE(review): this table is defined but never consulted, exactly as
	# in the original code -- confirm whether ARM mnemonics should match.
	arm_branch = ("B", "BAL", "BNE", "BEQ", "BPL", "BMI", "BCC", "BLO", "BCS", "BHS", "BVC", "BVS", "BGT", "BGE", "BLT", "BLE", "BHI", "BLS")
	conds = set(mips_branch)
	conds.update(x86_branch)
	return GetMnem(ea) in conds
287 | 


--------------------------------------------------------------------------------
/raw-feature-extractor/discovRe.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Reference Lister
  3 | #
  4 | # List all functions and all references to them in the current section.
  5 | #
  6 | # Implemented with the idautils module
  7 | #
  8 | import networkx as nx
  9 | import cPickle as pickle
 10 | import pdb
 11 | from graph_analysis_ida import *
 12 | from graph_property import *
 13 | #import wingdbstub
 14 | #wingdbstub.Ensure()
 15 | 
 16 | def get_funcs(ea):
 17 |         funcs = {}
 18 |         # Get current ea
 19 |         # Loop from start to end in the current segment
 20 | 	for funcea in Functions(SegStart(ea)):
 21 | 		funcname = GetFunctionName(funcea)
 22 | 		func = get_func(funcea)
 23 | 		blocks = FlowChart(func)
 24 | 		funcs[funcname] = []
 25 | 		for bl in blocks:
 26 | 		        start = bl.startEA
 27 | 		        end = bl.endEA
 28 | 		        funcs[funcname].append((start, end))
 29 |         return funcs
 30 | 
 31 | def get_funcs_for_discoverRe(ea):
 32 |     features = {}
 33 |     for funcea in Functions(SegStart(ea)):
 34 |         funcname = GetFunctionName(funcea)
 35 |         print funcname
 36 |         func = get_func(funcea)
 37 |         feature = get_discoverRe_feature(func)
 38 |         features[funcname] = feature
 39 |     return features
 40 | 
 41 | def get_discoverRe_feature(func, icfg):
 42 |     start = func.startEA
 43 |     end = func.endEA
 44 |     features = []
 45 |     FunctionCalls = getFuncCalls(func)
 46 |     #1
 47 |     features.append(FunctionCalls)
 48 |     LogicInstr = getLogicInsts(func)
 49 |     #2
 50 |     features.append(LogicInstr)
 51 |     Transfer = getTransferInsts(func)
 52 |     #3
 53 |     features.append(Transfer)
 54 |     Locals = getLocalVariables(func)
 55 |     #4
 56 |     features.append(Locals)
 57 |     BB = getBasicBlocks(func)
 58 |     #5
 59 |     features.append(BB)
 60 |     Edges = len(icfg.edges())
 61 |     #6
 62 |     features.append(Edges)
 63 |     Incoming = getIncommingCalls(func)
 64 |     #7
 65 |     features.append(Incoming)
 66 |     #8
 67 |     Instrs = getIntrs(func)
 68 |     features.append(Instrs)
 69 |     between = retrieveGP(icfg)
 70 |     #9
 71 |     features.append(between)
 72 | 
 73 |     strings, consts = getfunc_consts(func)
 74 |     features.append(strings)
 75 |     features.append(consts)
 76 |     return features
 77 | 
 78 | def get_func_names(ea):
 79 |     funcs = {}
 80 |     for funcea in Functions(SegStart(ea)):
 81 |             funcname = GetFunctionName(funcea)
 82 |             funcs[funcname] = funcea
 83 |     return funcs
 84 | 
 85 | def get_func_bases(ea):
 86 |         funcs = {}
 87 |         for funcea in Functions(SegStart(ea)):
 88 |                 funcname = GetFunctionName(funcea)
 89 |                 funcs[funcea] = funcname
 90 |         return funcs
 91 | 
 92 | def get_func_range(ea):
 93 |         funcs = {}
 94 |         for funcea in Functions(SegStart(ea)):
 95 |                 funcname = GetFunctionName(funcea)
 96 | 		func = get_func(funcea)
 97 |                 funcs[funcname] = (func.startEA, func.endEA)
 98 |         return funcs
 99 | 
def get_func_sequences(ea):
    """Map each function name to the mnemonics of all of its basic blocks.

    Blocks come from ``get_funcs(ea)``; the mnemonics of the blocks are
    concatenated in the order the blocks are listed.
    """
    funcs_bodylist = {}
    funcs = get_funcs(ea)
    for funcname in funcs:
        opcodes = funcs_bodylist.setdefault(funcname, [])
        for start, end in funcs[funcname]:
            inst_addr = start
            # ``end`` is the block's endEA, i.e. one past its last item, so
            # the comparison is strict; ``<=`` also pulled in the first
            # instruction of whatever follows the block.
            while inst_addr < end:
                opcodes.append(GetMnem(inst_addr))
                inst_addr = NextHead(inst_addr)
    return funcs_bodylist
113 | 
114 | def get_func_cfgs(ea):
115 |     func_cfglist = {}
116 |     i = 0
117 |     start, end = get_section('LOAD')
118 |     #print start, end
119 |     for funcea in Functions(SegStart(ea)):
120 |         if start <= funcea <= end:
121 |             funcname = GetFunctionName(funcea)
122 |             func = get_func(funcea)
123 |             print i
124 |             i += 1
125 |             try:
126 |                 icfg = cfg.cfg_construct(func)
127 |                 func_cfglist[funcname] = icfg
128 |             except:
129 |                 pass
130 |             
131 |     return func_cfglist
132 | 
def get_section(t):
    """Return ``(start, end)`` for the segment named ``t``.

    ``start`` is ``SegByBase(SegByName(t))`` and ``end`` is ``SegEnd(start)``.
    """
    seg_start = SegByBase(SegByName(t))
    return seg_start, SegEnd(seg_start)
138 | 
139 | 
def get_func_cfg_sequences(func_cfglist):
    """Map each function name to ``{(start, end): mnemonic list}``.

    The first element of every value of ``func_cfglist`` is iterated as a
    collection of ``(start, end)`` pairs.
    """
    func_cfg_seqlist = {}
    for funcname in func_cfglist:
        graph = func_cfglist[funcname][0]
        per_block = {}
        func_cfg_seqlist[funcname] = per_block
        for start, end in graph:
            per_block[(start, end)] = get_sequences(start, end)

    return func_cfg_seqlist
150 | 
151 | 
def get_sequences(start, end):
    """List the mnemonics found from ``start`` up to and including ``end``."""
    mnemonics = []
    cursor = start
    # The bound is inclusive: an item located exactly at ``end`` is listed.
    while cursor <= end:
        mnemonics.append(GetMnem(cursor))
        cursor = NextHead(cursor)
    return mnemonics
160 | 
161 | def get_stack_arg(func_addr):
162 |     print func_addr
163 |     args = []
164 |     stack = GetFrame(func_addr)
165 |     if not stack:
166 |             return []
167 |     firstM = GetFirstMember(stack)
168 |     lastM = GetLastMember(stack)
169 |     i = firstM
170 |     while i <=lastM:
171 |         mName = GetMemberName(stack,i)
172 |         mSize = GetMemberSize(stack,i)
173 |         if mSize:
174 |                 i = i + mSize
175 |         else:
176 |                 i = i+4
177 |         if mName not in args and mName and ' s' not in mName and ' r' not in mName:
178 |             args.append(mName)
179 |     return args
180 | 
181 |         #pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
182 |         
def processDataSegs():
    """Cross-index data addresses and the functions that reference them.

    Every segment whose type is ``SEG_DATA`` or ``SEG_BSS`` is scanned.
    Returns ``(funcdata, datafunc)`` where ``funcdata`` maps a function name
    to the list of data addresses it references and ``datafunc`` maps a data
    address to the list of referencing function names.  References that do
    not belong to a named function are ignored.
    """
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype not in [idc.SEG_DATA, idc.SEG_BSS]:
            continue
        cur = idc.SegStart(ea)
        end = idc.SegEnd(ea)
        # The segment end is exclusive; ``<=`` also scanned the first
        # address after the segment.
        while cur < end:
            for fea in DataRefsTo(cur):
                name = GetFunctionName(fea)
                if len(name) == 0:
                    continue
                funcdata.setdefault(name, []).append(cur)
                datafunc.setdefault(cur, []).append(name)
            cur = NextHead(cur)
    return funcdata, datafunc
210 | 
211 | def obtainDataRefs(callgraph):
212 |     datarefs = {}
213 |     funcdata, datafunc = processDataSegs()
214 |     for node in callgraph:
215 |         if node in funcdata:
216 |             datas = funcdata[node]
217 |             for dd in datas:
218 |                 refs = datafunc[dd]
219 |                 refs = list(set(refs))
220 |                 if node in datarefs:
221 |                     print refs
222 |                     datarefs[node] += refs
223 |                     datarefs[node] = list(set(datarefs[node]))
224 |                 else:
225 |                     datarefs[node] = refs
226 |     return datarefs
227 | 
228 | 
229 | 


--------------------------------------------------------------------------------
/raw-feature-extractor/func.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Reference Lister
  3 | #
  4 | # List all functions and all references to them in the current section.
  5 | #
  6 | # Implemented with the idautils module
  7 | #
  8 | from idautils import *
  9 | from idaapi import *
 10 | from idc import *
 11 | import networkx as nx
 12 | import cfg_constructor as cfg
 13 | import cPickle as pickle
 14 | import pdb
 15 | from raw_graphs import *
 16 | #from discovRe_feature.discovRe import *
 17 | from discovRe import *
 18 | #import wingdbstub
 19 | #wingdbstub.Ensure()
 20 | def gt_funcNames(ea):
 21 | 	funcs = []
 22 | 	plt_func, plt_data = processpltSegs()
 23 | 	for funcea in Functions(SegStart(ea)):
 24 | 			funcname = get_unified_funcname(funcea)
 25 | 			if funcname in plt_func:
 26 | 				print funcname
 27 | 				continue
 28 | 			funcs.append(funcname)
 29 | 	return funcs
 30 | 
 31 | def get_funcs(ea):
 32 | 	funcs = {}
 33 | 		# Get current ea
 34 | 		# Loop from start to end in the current segment
 35 | 	plt_func, plt_data = processpltSegs()
 36 | 	for funcea in Functions(SegStart(ea)):
 37 | 		funcname = get_unified_funcname(funcea)
 38 | 		if funcname in plt_func:
 39 | 			continue
 40 | 		func = get_func(funcea)
 41 | 		blocks = FlowChart(func)
 42 | 		funcs[funcname] = []
 43 | 		for bl in blocks:
 44 | 				start = bl.startEA
 45 | 				end = bl.endEA
 46 | 				funcs[funcname].append((start, end))
 47 | 	return funcs
 48 | 
 49 | # used for the callgraph generation.
 50 | def get_func_namesWithoutE(ea):
 51 | 	funcs = {}
 52 | 	plt_func, plt_data = processpltSegs()
 53 | 	for funcea in Functions(SegStart(ea)):
 54 | 			funcname = get_unified_funcname(funcea)
 55 | 			if 'close' in funcname:
 56 | 				print funcea
 57 | 			if funcname in plt_func:
 58 | 				print funcname
 59 | 				continue
 60 | 			funcs[funcname] = funcea
 61 | 	return funcs
 62 | 
 63 | # used for the callgraph generation.
 64 | def get_func_names(ea):
 65 | 	funcs = {}
 66 | 	for funcea in Functions(SegStart(ea)):
 67 | 			funcname = get_unified_funcname(funcea)
 68 | 			funcs[funcname] = funcea
 69 | 	return funcs
 70 | 
 71 | def get_func_bases(ea):
 72 | 		funcs = {}
 73 | 		plt_func, plt_data = processpltSegs()
 74 | 		for funcea in Functions(SegStart(ea)):
 75 | 				funcname = get_unified_funcname(funcea)
 76 | 				if funcname in plt_func:
 77 | 					continue
 78 | 				funcs[funcea] = funcname
 79 | 		return funcs
 80 | 
 81 | def get_func_range(ea):
 82 | 		funcs = {}
 83 | 		for funcea in Functions(SegStart(ea)):
 84 | 				funcname = get_unified_funcname(funcea)
 85 | 		func = get_func(funcea)
 86 | 		funcs[funcname] = (func.startEA, func.endEA)
 87 | 		return funcs
 88 | 
 89 | def get_unified_funcname(ea):
 90 | 	funcname = GetFunctionName(ea)
 91 | 	if len(funcname) > 0:
 92 | 		if '.' == funcname[0]:
 93 | 			funcname = funcname[1:]
 94 | 	return funcname
 95 | 
 96 | def get_func_sequences(ea):
 97 | 	funcs_bodylist = {}
 98 | 	funcs = get_funcs(ea)
 99 | 	for funcname in funcs:
100 | 		if funcname not in funcs_bodylist:
101 | 			funcs_bodylist[funcname] = []
102 | 		for start, end in funcs[funcname]:
103 | 			inst_addr = start
104 | 			while inst_addr <= end:
105 | 				opcode = GetMnem(inst_addr)
106 | 				funcs_bodylist[funcname].append(opcode)
107 | 				inst_addr = NextHead(inst_addr)
108 | 	return funcs_bodylist
109 | 
110 | def get_func_cfgs_c(ea):
111 | 	binary_name = idc.GetInputFile()
112 | 	raw_cfgs = raw_graphs(binary_name)
113 | 	externs_eas, ea_externs = processpltSegs()
114 | 	i = 0
115 | 	for funcea in Functions(SegStart(ea)):
116 | 		funcname = get_unified_funcname(funcea)
117 | 		func = get_func(funcea)
118 | 		print i
119 | 		i += 1
120 | 		icfg = cfg.getCfg(func, externs_eas, ea_externs)
121 | 		func_f = get_discoverRe_feature(func, icfg[0])
122 | 		raw_g = raw_graph(funcname, icfg, func_f)
123 | 		raw_cfgs.append(raw_g)
124 | 			
125 | 	return raw_cfgs
126 | 
127 | def get_func_cfgs_ctest(ea):
128 | 	binary_name = idc.GetInputFile()
129 | 	raw_cfgs = raw_graphs(binary_name)
130 | 	externs_eas, ea_externs = processpltSegs()
131 | 	i = 0
132 | 	diffs = {}
133 | 	for funcea in Functions(SegStart(ea)):
134 | 		funcname = get_unified_funcname(funcea)
135 | 		func = get_func(funcea)
136 | 		print i
137 | 		i += 1
138 | 		icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
139 | 		diffs[funcname] = (icfg, old_cfg)
140 | 		#raw_g = raw_graph(funcname, icfg)
141 | 		#raw_cfgs.append(raw_g)
142 | 			
143 | 	return diffs
144 | 
145 | def get_func_cfgs(ea):
146 | 	func_cfglist = {}
147 | 	i = 0
148 | 	for funcea in Functions(SegStart(ea)):
149 | 		funcname = get_unified_funcname(funcea)
150 | 		func = get_func(funcea)
151 | 		print i
152 | 		i += 1
153 | 		try:
154 | 			icfg = cfg.getCfg(func)
155 | 			func_cfglist[funcname] = icfg
156 | 		except:
157 | 			pass
158 | 			
159 | 	return func_cfglist
160 | 
def get_func_cfg_sequences(func_cfglist):
	"""Map each function name to ``{(start, end): mnemonic list}``.

	The first element of every value of ``func_cfglist`` is iterated as a
	collection of ``(start, end)`` pairs.
	"""
	func_cfg_seqlist = {}
	for funcname in func_cfglist:
		graph = func_cfglist[funcname][0]
		per_block = {}
		func_cfg_seqlist[funcname] = per_block
		for start, end in graph:
			per_block[(start, end)] = get_sequences(start, end)

	return func_cfg_seqlist
171 | 
172 | 
def get_sequences(start, end):
	"""List the mnemonics found from ``start`` up to and including ``end``."""
	mnemonics = []
	cursor = start
	# The bound is inclusive: an item located exactly at ``end`` is listed.
	while cursor <= end:
		mnemonics.append(GetMnem(cursor))
		cursor = NextHead(cursor)
	return mnemonics
181 | 
182 | def get_stack_arg(func_addr):
183 | 	print func_addr
184 | 	args = []
185 | 	stack = GetFrame(func_addr)
186 | 	if not stack:
187 | 			return []
188 | 	firstM = GetFirstMember(stack)
189 | 	lastM = GetLastMember(stack)
190 | 	i = firstM
191 | 	while i <=lastM:
192 | 		mName = GetMemberName(stack,i)
193 | 		mSize = GetMemberSize(stack,i)
194 | 		if mSize:
195 | 				i = i + mSize
196 | 		else:
197 | 				i = i+4
198 | 		if mName not in args and mName and ' s' not in mName and ' r' not in mName:
199 | 			args.append(mName)
200 | 	return args
201 | 
202 | 		#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
203 | 
def processExternalSegs():
	"""Map unified names found in ``SEG_XTRN`` segments to their address.

	Every item head of every segment of type ``SEG_XTRN`` is visited; the
	address is stored as a ``hex()`` string.
	"""
	funcdata = {}
	for n in xrange(idaapi.get_segm_qty()):
		seg = idaapi.getnseg(n)
		ea = seg.startEA
		if idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE) != idc.SEG_XTRN:
			continue
		cur = idc.SegStart(ea)
		end = idc.SegEnd(ea)
		# The segment end is exclusive (processpltSegs uses ``<`` as well);
		# ``<=`` also recorded the first address after the segment.
		while cur < end:
			funcdata[get_unified_funcname(cur)] = hex(cur)
			cur = NextHead(cur)
	return funcdata
220 | 
def processpltSegs():
	"""Index the entries of the ``.plt``, ``extern`` and ``.MIPS.stubs`` segments.

	Returns ``(funcdata, datafunc)``: ``funcdata`` maps a unified name to the
	``hex()`` string of its address, ``datafunc`` maps the address to the
	name.
	"""
	stub_segments = ('.plt', 'extern', '.MIPS.stubs')
	funcdata = {}
	datafunc = {}
	for n in xrange(idaapi.get_segm_qty()):
		seg = idaapi.getnseg(n)
		if SegName(seg.startEA) not in stub_segments:
			continue
		cur = seg.startEA
		limit = seg.endEA
		while cur < limit:
			name = get_unified_funcname(cur)
			funcdata[name] = hex(cur)
			datafunc[cur] = name
			cur = NextHead(cur)
	return funcdata, datafunc
238 | 
239 | 		
def processDataSegs():
	"""Cross-index data addresses and the functions that reference them.

	Every segment whose type is ``SEG_DATA`` or ``SEG_BSS`` is scanned.
	Returns ``(funcdata, datafunc)`` where ``funcdata`` maps a unified
	function name to the list of data addresses it references and
	``datafunc`` maps a data address to the list of referencing function
	names.  References that do not belong to a named function are ignored.
	"""
	funcdata = {}
	datafunc = {}
	for n in xrange(idaapi.get_segm_qty()):
		seg = idaapi.getnseg(n)
		ea = seg.startEA
		segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
		if segtype not in [idc.SEG_DATA, idc.SEG_BSS]:
			continue
		cur = idc.SegStart(ea)
		end = idc.SegEnd(ea)
		# The segment end is exclusive; ``<=`` also scanned the first
		# address after the segment.
		while cur < end:
			for fea in DataRefsTo(cur):
				name = get_unified_funcname(fea)
				if len(name) == 0:
					continue
				funcdata.setdefault(name, []).append(cur)
				datafunc.setdefault(cur, []).append(name)
			cur = NextHead(cur)
	return funcdata, datafunc
267 | 
268 | def obtainDataRefs(callgraph):
269 | 	datarefs = {}
270 | 	funcdata, datafunc = processDataSegs()
271 | 	for node in callgraph:
272 | 		if node in funcdata:
273 | 			datas = funcdata[node]
274 | 			for dd in datas:
275 | 				refs = datafunc[dd]
276 | 				refs = list(set(refs))
277 | 				if node in datarefs:
278 | 					print refs
279 | 					datarefs[node] += refs
280 | 					datarefs[node] = list(set(datarefs[node]))
281 | 				else:
282 | 					datarefs[node] = refs
283 | 	return datarefs
284 | 
285 | 
286 | 


--------------------------------------------------------------------------------
/raw-feature-extractor/graph_analysis_ida.py:
--------------------------------------------------------------------------------
  1 | from idautils import *
  2 | from idaapi import *
  3 | from idc import *
  4 | 
  5 | def getfunc_consts(func):
  6 | 	strings = []
  7 | 	consts = []
  8 | 	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
  9 | 	for bl in blocks:
 10 | 		strs, conts = getBBconsts(bl)
 11 | 		strings += strs
 12 | 		consts += conts
 13 | 	return strings, consts
 14 | 
 15 | def getConst(ea, offset):
 16 | 	strings = []
 17 | 	consts = []
 18 | 	optype1 = GetOpType(ea, offset)
 19 | 	if optype1 == idaapi.o_imm:
 20 | 		imm_value = GetOperandValue(ea, offset)
 21 | 		if 0<= imm_value <= 10:
 22 | 			consts.append(imm_value)
 23 | 		else:
 24 | 			if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value):
 25 | 				str_value = GetString(imm_value)
 26 | 				if str_value is None:
 27 | 					str_value = GetString(imm_value+0x40000)
 28 | 					if str_value is None:
 29 | 						consts.append(imm_value)
 30 | 					else:
 31 | 						re = all(40 <= ord(c) < 128 for c in str_value)
 32 | 						if re:
 33 | 							strings.append(str_value)
 34 | 						else:
 35 | 							consts.append(imm_value)
 36 | 				else:
 37 | 					re = all(40 <= ord(c) < 128 for c in str_value)
 38 | 					if re:
 39 | 						strings.append(str_value)
 40 | 					else:
 41 | 						consts.append(imm_value)
 42 | 			else:
 43 | 				consts.append(imm_value)
 44 | 	return strings, consts
 45 | 
 46 | def getBBconsts(bl):
 47 | 	strings = []
 48 | 	consts = []
 49 | 	start = bl[0]
 50 | 	end = bl[1]
 51 | 	invoke_num = 0
 52 | 	inst_addr = start
 53 | 	while inst_addr < end:
 54 | 		opcode = GetMnem(inst_addr)
 55 | 		if opcode in ['la','jalr','call', 'jal']:
 56 | 			inst_addr = NextHead(inst_addr)
 57 | 			continue
 58 | 		strings_src, consts_src = getConst(inst_addr, 0)
 59 | 		strings_dst, consts_dst = getConst(inst_addr, 1)
 60 | 		strings += strings_src
 61 | 		strings += strings_dst
 62 | 		consts += consts_src
 63 | 		consts += consts_dst
 64 | 		try:
 65 | 			strings_dst, consts_dst = getConst(inst_addr, 2)
 66 | 			consts += consts_dst
 67 | 			strings += strings_dst
 68 | 		except:
 69 | 			pass
 70 | 
 71 | 		inst_addr = NextHead(inst_addr)
 72 | 	return strings, consts
 73 | 
 74 | def getFuncCalls(func):
 75 | 	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
 76 | 	sumcalls = 0
 77 | 	for bl in blocks:
 78 | 		callnum = calCalls(bl)
 79 | 		sumcalls += callnum
 80 | 	return sumcalls
 81 | 
 82 | def getLogicInsts(func):
 83 | 	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
 84 | 	sumcalls = 0
 85 | 	for bl in blocks:
 86 | 		callnum = calLogicInstructions(bl)
 87 | 		sumcalls += callnum
 88 | 	return sumcalls
 89 | 
 90 | def getTransferInsts(func):
 91 | 	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
 92 | 	sumcalls = 0
 93 | 	for bl in blocks:
 94 | 		callnum = calTransferIns(bl)
 95 | 		sumcalls += callnum
 96 | 	return sumcalls
 97 | 
 98 | def getIntrs(func):
 99 | 	blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
100 | 	sumcalls = 0
101 | 	for bl in blocks:
102 | 		callnum = calInsts(bl)
103 | 		sumcalls += callnum
104 | 	return sumcalls	
105 | 
def getLocalVariables(func):
	"""Return ``get_stackVariables`` for the start address of ``func``."""
	return get_stackVariables(func.startEA)
109 | 
def getBasicBlocks(func):
	"""Return the number of basic blocks in the flow chart of ``func``."""
	spans = []
	for block in FlowChart(func):
		spans.append((block.startEA, block.endEA))
	return len(spans)
113 | 
def getIncommingCalls(func):
	"""Count the code references (flow excluded) to the start of ``func``."""
	return len(list(CodeRefsTo(func.startEA, 0)))
118 | 
119 | 
def get_stackVariables(func_addr):
    """Count the distinct stack-frame members whose name contains ``'var_'``.

    Returns 0 when the function has no frame.
    """
    frame = GetFrame(func_addr)
    if not frame:
        return 0
    seen = []
    offset = GetFirstMember(frame)
    last_offset = GetLastMember(frame)
    while offset <= last_offset:
        name = GetMemberName(frame, offset)
        size = GetMemberSize(frame, offset)
        # Step over the member; fall back to 4 when no size is reported.
        offset += size if size else 4
        if name and 'var_' in name and name not in seen:
            seen.append(name)
    return len(seen)
139 | 
140 | 
141 | 
def calArithmeticIns(bl):
	"""Count the arithmetic instructions of one basic block.

	`bl` is a (start, end) address pair.  Heads in [start, end) are visited
	with NextHead; a head counts when GetMnem returns one of the x86 or MIPS
	arithmetic mnemonics listed below.
	"""
	arithmetic_mnemonics = frozenset((
		# x86
		'add', 'sub', 'div', 'imul', 'idiv', 'mul', 'shl', 'dec', 'inc',
		# MIPS ('add' and 'div' are shared with x86)
		'addu', 'addi', 'addiu', 'mult', 'multu', 'divu',
	))
	ea, stop = bl[0], bl[1]
	count = 0
	while ea < stop:
		if GetMnem(ea) in arithmetic_mnemonics:
			count += 1
		ea = NextHead(ea)
	return count
158 | 
def calCalls(bl):
	"""Count the heads in [bl[0], bl[1]) whose mnemonic is 'call', 'jal' or 'jalr'."""
	call_mnemonics = frozenset(('call', 'jal', 'jalr'))
	ea, stop = bl[0], bl[1]
	count = 0
	while ea < stop:
		if GetMnem(ea) in call_mnemonics:
			count += 1
		ea = NextHead(ea)
	return count
171 | 
def calInsts(bl):
	"""Count the heads NextHead walks through in the address range [bl[0], bl[1])."""
	cursor, stop = bl[0], bl[1]
	total = 0
	while cursor < stop:
		cursor = NextHead(cursor)
		total += 1
	return total
181 | 
def calLogicInstructions(bl):
	"""Count the logic instructions of one basic block.

	`bl` is a (start, end) address pair.  Heads in [start, end) are visited
	with NextHead; a head counts when GetMnem returns one of the x86 or MIPS
	logic mnemonics listed below.
	"""
	logic_mnemonics = frozenset((
		# x86
		'and', 'andn', 'andnpd', 'andpd', 'andps', 'andnps', 'test', 'xor',
		'xorpd', 'pslld',
		# MIPS ('and' and 'xor' are shared with x86)
		'andi', 'or', 'ori', 'nor', 'slt', 'slti', 'sltu',
	))
	ea, stop = bl[0], bl[1]
	count = 0
	while ea < stop:
		if GetMnem(ea) in logic_mnemonics:
			count += 1
		ea = NextHead(ea)
	return count
198 | 
def calSconstants(bl):
	"""Walk the heads in [bl[0], bl[1]) and count those whose mnemonic is in `calls`.

	NOTE(review): `calls` is not defined inside this function, unlike the
	sibling counters which build their mnemonic table locally.  Unless a
	module-level `calls` exists elsewhere in the file, the membership test
	below raises NameError on the first head visited.  The name suggests
	string constants were meant to be counted -- TODO confirm the intended
	criterion before relying on this function.
	"""
	start = bl[0]
	end = bl[1]
	invoke_num = 0
	inst_addr = start
	while inst_addr < end:
		opcode = GetMnem(inst_addr)
		# `calls` is unresolved here; see the NOTE(review) in the docstring.
		if opcode in calls:
			invoke_num += 1
		inst_addr = NextHead(inst_addr)
	return invoke_num
210 | 
211 | 
def calNconstants(bl):
	"""Count the heads in [bl[0], bl[1]) whose operand 0 or operand 1 has type 5.

	NOTE(review): operand type 5 is presumably IDA's immediate-operand type;
	confirm against the IDA operand type constants.
	"""
	ea, stop = bl[0], bl[1]
	hits = 0
	while ea < stop:
		first_type = GetOpType(ea, 0)
		second_type = GetOpType(ea, 1)
		if 5 in (first_type, second_type):
			hits += 1
		ea = NextHead(ea)
	return hits
224 | 
def retrieveExterns(bl, ea_externs):
	"""Collect the external symbols referenced from one basic block.

	bl         -- (start, end) address pair of the block.
	ea_externs -- mapping from address to external name.

	For every head in [start, end) the first code reference (as yielded by
	CodeRefsFrom(head, 1)) that is a key of `ea_externs` contributes its
	mapped value.  Heads without such a reference contribute nothing.
	Returns the list of collected values in visiting order.
	"""
	externs = []
	ea, stop = bl[0], bl[1]
	while ea < stop:
		# Explicit first-match search instead of indexing [0] into a possibly
		# empty list and swallowing every exception with a bare except.
		for target in CodeRefsFrom(ea, 1):
			if target in ea_externs:
				externs.append(ea_externs[target])
				break
		ea = NextHead(ea)
	return externs
239 | 
def calTransferIns(bl):
	"""Count the control-transfer instructions of one basic block.

	`bl` is a (start, end) address pair; heads in [start, end) are visited
	with NextHead.

	NOTE(review): a head counts when its mnemonic is a *substring* of any
	table entry (so e.g. 'jl' matches through 'jle', and an empty mnemonic
	matches everything), not only on exact equality.  This is kept as is
	because changing it would alter the extracted feature values; confirm
	whether exact matching was intended.  No ARM mnemonics take part in the
	match.
	"""
	transfer_mnemonics = (
		# x86
		'jmp', 'jz', 'jnz', 'js', 'je', 'jne', 'jg', 'jle', 'jge', 'ja',
		'jnc', 'call',
		# MIPS
		'beq', 'bne', 'bgtz', 'bltz', 'bgez', 'blez', 'j', 'jal', 'jr', 'jalr',
	)
	ea, stop = bl[0], bl[1]
	count = 0
	while ea < stop:
		mnemonic = GetMnem(ea)
		if any(mnemonic in entry for entry in transfer_mnemonics):
			count += 1
		ea = NextHead(ea)
	return count


--------------------------------------------------------------------------------
/raw-feature-extractor/graph_property.py:
--------------------------------------------------------------------------------
 1 | import networkx as nx
 2 | import pdb
 3 | def betweeness(g):
 4 | 	#pdb.set_trace()
 5 | 	betweenness = nx.betweenness_centrality(g)
 6 | 	return betweenness
 7 | 
 8 | def eigenvector(g):
 9 | 	centrality = nx.eigenvector_centrality(g)
10 | 	return centrality
11 | 
def closeness_centrality(g):
	"""Return nx.closeness_centrality(g) unchanged."""
	return nx.closeness_centrality(g)
15 | 
def retrieveGP(g):
	"""Return the mean betweenness value of graph `g`, rounded to 5 decimals.

	The per-node values come from betweeness(g).  When that mapping is empty
	(a graph without nodes) 0.0 is returned instead of dividing by zero.
	"""
	bf = betweeness(g)
	# Summed in ascending order, as before, so the floating-point result is
	# unchanged for non-empty graphs.
	values = sorted(bf.values())
	if not values:
		return 0.0
	mean_value = sum(values) / len(values)
	return round(mean_value, 5)
24 | 
25 | 


--------------------------------------------------------------------------------
/raw-feature-extractor/preprocessing_ida.py:
--------------------------------------------------------------------------------
 1 | from func import *
 2 | from raw_graphs import *
 3 | from idc import *
 4 | import os
 5 | import argparse
 6 | 
 7 | def parse_command():
 8 | 	parser = argparse.ArgumentParser(description='Process some integers.')
 9 | 	parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file")
10 | 	args = parser.parse_args()
11 | 	return args
12 | 
if __name__ == '__main__':
	# Script entry point: extract the per-function graphs of the loaded
	# binary and pickle them to <path>/<input file name>.ida, then quit IDA.

	args = parse_command()
	path = args.path
	analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
	analysis_flags &= ~idc.AF_IMMOFF
	# turn off "automatically make offset" heuristic
	idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
	idaapi.autoWait()
	cfgs = get_func_cfgs_c(FirstSeg())
	binary_name = idc.GetInputFile() + '.ida'
	fullpath = os.path.join(path, binary_name)
	# Binary mode plus an explicit close: the pickle is fully written before
	# idc.Exit() runs, and no newline translation is applied on any platform.
	with open(fullpath, 'wb') as out_file:
		pickle.dump(cfgs, out_file)
	print(binary_name)
	idc.Exit(0)


--------------------------------------------------------------------------------
/raw-feature-extractor/raw_graphs.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import sys
  3 | sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
  4 | import networkx as nx
  5 | #import numpy as np
  6 | from subprocess import Popen, PIPE
  7 | import pdb
  8 | import os
  9 | import re,mmap
 10 | #from graph_edit_new import *
 11 | 
 12 | class raw_graph:
 13 | 	def __init__(self, funcname, g, func_f):
 14 | 		self.funcname = funcname
 15 | 		self.old_g = g[0]
 16 | 		self.g = nx.DiGraph()
 17 | 		self.entry = g[1]
 18 | 		self.fun_features = func_f
 19 | 		self.attributing()
 20 | 
 21 | 	def __len__(self):
 22 | 		return len(self.g)
 23 | 
 24 | 	def attributing(self):
 25 | 		self.obtainOffsprings(self.old_g)
 26 | 		for node in self.old_g:
 27 | 			fvector = self.retrieveVec(node, self.old_g)
 28 | 			self.g.add_node(node)
 29 | 			self.g.node[node]['v'] = fvector
 30 | 
 31 | 		for edge in self.old_g.edges():
 32 | 			node1 = edge[0]
 33 | 			node2 = edge[1]
 34 | 			self.g.add_edge(node1, node2)
 35 | 
 36 | 	def obtainOffsprings(self,g):
 37 | 		nodes = g.nodes()
 38 | 		for node in nodes:
 39 | 			offsprings = {}
 40 | 			self.getOffsprings(g, node, offsprings)
 41 | 			g.node[node]['offs'] = len(offsprings)
 42 | 		return g
 43 | 
 44 | 	def getOffsprings(self, g, node, offsprings):
 45 | 		node_offs = 0
 46 | 		sucs = g.successors(node)
 47 | 		for suc in sucs:
 48 | 			if suc not in offsprings:
 49 | 				offsprings[suc] = 1
 50 | 				self.getOffsprings(g, suc, offsprings)
 51 | 
 52 | 	def retrieveVec(self, id_, g):
 53 | 		feature_vec = []
 54 | 		#numC0
 55 | 		numc = g.node[id_]['consts']
 56 | 		feature_vec.append(numc)
 57 | 		#nums1
 58 | 		nums = g.node[id_]['strings']
 59 | 		feature_vec.append(nums)
 60 | 		#offsprings2
 61 | 		offs = g.node[id_]['offs']
 62 | 		feature_vec.append(offs)
 63 | 		#numAs3
 64 | 		numAs = g.node[id_]['numAs']
 65 | 		feature_vec.append(numAs)
 66 | 		# of calls4
 67 | 		calls = g.node[id_]['numCalls']
 68 | 		feature_vec.append(calls)
 69 | 		# of insts5
 70 | 		insts = g.node[id_]['numIns']
 71 | 		feature_vec.append(insts)
 72 | 		# of LIs6
 73 | 		insts = g.node[id_]['numLIs']
 74 | 		feature_vec.append(insts)
 75 | 		# of TIs7
 76 | 		insts = g.node[id_]['numTIs']
 77 | 		feature_vec.append(insts)	
 78 | 		return feature_vec
 79 | 
 80 | 
 81 | 	def enumerating(self, n):
 82 | 		subgs = []
 83 | 		#pdb.set_trace()
 84 | 		for sub_nodes in itertools.combinations(self.g.nodes(), n):
 85 | 		    subg = self.g.subgraph(sub_nodes)
 86 | 		    u_subg = subg.to_undirected()
 87 | 		    if nx.is_connected(u_subg):
 88 | 		        subgs.append(subg)
 89 | 		return subgs
 90 | 
 91 | 
 92 | 	def genMotifs(self, n):
 93 | 		motifs = {}
 94 | 		subgs = enumerating(n)
 95 | 		for subg in subgs:
 96 | 			if len(motifs) == 0:
 97 | 				motifs[subg] = [subg]
 98 | 			else:
 99 | 				nomatch = True
100 | 				for mt in motifs:
101 | 					if nx.is_isomorphic(mt, subg):
102 | 						motifs[mt].append(subg)
103 | 						nomatch = False
104 | 				if nomatch:
105 | 					motifs[subg] = [subg]
106 | 		return motifs
107 | 
108 | 	def enumerating_efficient(self, n):
109 | 		#pdb.set_trace()
110 | 		if len(self.g) >= 200:
111 | 			return []
112 | 		with open('/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt','wb') as f:
113 | 			nx.write_edgelist(self.g,f,data=False)
114 | 		#pdb.set_trace()
115 | 		process = Popen(["/home/qian/workspace/FANMOD-command_line-source/executables/./fanmod_command_line_linux", str(n), "100000", "1", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt", "1", "0", "0", "2", "0", "0", "0", "1000", "3", "3", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt", "0", "1"], stdout=PIPE, stderr=PIPE)
116 | 		stdout, stderr = process.communicate()
117 | 		if process.returncode >= 0:
118 | 		#os.system("/home/qian/software/FANMOD-command_line-source/executables/./fanmod_command_line_linux " +str(n) + " 100000 1 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt 1 0 0 2 0 0 0 1000 3 3 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt 0 1")
119 | 		#pdb.set_trace()
120 | 			#pdb.set_trace()
121 | 			subgs = self.parseOutput("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump", n)
122 | 			#pdb.set_trace()
123 | 			os.remove("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump")
124 | 			return subgs
125 | 		return []
126 | 
127 | 	def parseOutput(self, path, n):
128 | 		pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
129 | 		subgraphs = []
130 | 		with open(path,'r') as f:
131 | 			data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
132 | 			mo = re.findall(pattern, data)
133 | 			if mo:
134 | 				results = [map(int, v.split(',')[1:]) for v in mo]
135 | 				subgraphs = self.createGraphDirectly(results)
136 | 		return subgraphs
137 | 
138 | 	def parseOutputByconditions(self, path, n):
139 | 		pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
140 | 		subgraphs = []
141 | 		with open(path,'r') as f:
142 | 			data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
143 | 			mo = re.findall(pattern, data)
144 | 			if mo:
145 | 				results = [map(int, v.split(',')[1:]) for v in mo]
146 | 				subgraphs = self.create_Graphbycondition_Directly(results)
147 | 		return subgraphs
148 | 
149 | 	def create_Graphbycondition_Directly(self, results):
150 | 		subgs = []
151 | 		for indexes in results:
152 | 			tg = template_graph()
153 | 			subg = self.g.subgraph(indexes)
154 | 			tg.updateG(subg)
155 | 			subgs.append(tg)
156 | 			del tg
157 | 		return subgs
158 | 
159 | 	def createGraphDirectly(self, results):
160 | 		#pdb.set_trace()
161 | 		#subgs = [self.g.subgraph(indexes) for indexes in results]
162 | 		subgs = []
163 | 		for indexes in results:
164 | 			tg = template_graph()
165 | 			subg = self.g.subgraph(indexes)
166 | 			tg.updateG(subg)
167 | 			subgs.append(tg)
168 | 			del tg
169 | 		return subgs
170 | 
171 | 	def createGraph(self, results, n):
172 | 		binary_value = int(results[0],2)
173 | 		indexes = [int(v) for v in results[1:]]
174 | 		fang = self.createG(results[0], n)
175 | 		if fang:
176 | 			tg = template_graph(binary_value)
177 | 			tg.updateG(fang, indexes, self.g)
178 | 			return tg
179 | 		pdb.set_trace()
180 | 		print "there is g which is none"
181 | 
182 | 	def createG(self, binary_str, n):
183 | 		g = nx.DiGraph()
184 | 		l = [int(v) for v in binary_str]
185 | 		#pdb.set_trace()
186 | 		shape = (n, n)
187 | 		data = np.array(l)
188 | 		ad_matrix = data.reshape(shape)
189 | 		for i in xrange(n):
190 | 			for j in xrange(n):
191 | 				if ad_matrix[i][j] == 1:
192 | 					g.add_edge(i, j)
193 | 		return g
194 | 			
195 | 
196 | 
class raw_graphs:
	"""List-like container of the raw_graph objects belonging to one binary."""

	def __init__(self, binary_name):
		self.raw_graph_list = []
		self.binary_name = binary_name

	def __len__(self):
		"""Number of graphs stored so far."""
		return len(self.raw_graph_list)

	def append(self, raw_g):
		"""Add one graph at the end of the container."""
		self.raw_graph_list += [raw_g]
207 | 
208 | 
class graphlets:
	"""Collection of the subgraphs extracted from one function."""

	def __init__(self, funcname):
		self.binary_name = None
		self.graphlets_list = []
		self.funcname = funcname

	def __len__(self):
		"""Number of subgraphs stored so far."""
		return len(self.graphlets_list)

	def updateBN(self, binary_name):
		"""Record the name of the binary."""
		self.binary_name = binary_name

	def append(self, subg):
		"""Add a single subgraph."""
		self.graphlets_list.append(subg)

	def appendSet(self, subgs):
		"""Add every subgraph of the iterable `subgs`, in order."""
		self.graphlets_list.extend(subgs)
226 | 
class template_graph:
	"""Holder pairing a graph (`g`) with an optional identifying `value`."""

	def __init__(self, value=None):
		self.g = None
		self.value = value

	def updateG(self,g):
		"""Replace the stored graph with `g`."""
		self.g = g
234 | 	#def updateIndexes(self, indexes):
235 | 	#	self.indexes = indexes
236 | 
237 | 	#def updateAttributes(self, pg, indexes, maing):
238 | 	#	for id_ in xrange(len(indexes)):
239 | 	#		index = indexes[id_]
240 | 	#		gnode = self.findNode(index, maing)
241 | 	#		self.g.node[gnode] = pg.node[index]
242 | 
243 | 
class template_graphs:
	"""Enumerates the connected directed graphs on `size` nodes.

	Candidate graphs are the non-zero size x size adjacency matrices, each
	identified by the integer whose binary digits are the matrix in
	row-major order.
	"""

	def __init__(self, size):
		self.size = size
		self.gs = []
		self.bit_len = None

	def enumeratingAll(self):
		"""Append to self.gs a template_graph for every accepted candidate."""
		limit = self.genBinValue()
		# A counting loop keeps the enumeration lazy on every Python version;
		# value 0 (no edge) is skipped.
		value = 1
		while value < limit:
			g = self.createG(value)
			if g:
				tg = template_graph(value)
				tg.updateG(g)
				self.gs.append(tg)
			value += 1

	def genBinValue(self):
		"""Set bit_len to size*size and return 2**bit_len."""
		n = self.size
		self.bit_len = n*n
		return 2**(self.bit_len)

	def createG(self, i):
		"""Build the graph encoded by integer `i`.

		Returns the DiGraph when it spans all `size` nodes and is connected
		once made undirected, and False otherwise.  Raises ValueError when
		`i` needs more than size*size bits.
		"""
		size = self.size
		bits = self.genArray(i)
		if len(bits) != size * size:
			raise ValueError("value %d does not fit a %dx%d adjacency matrix" % (i, size, size))
		g = nx.DiGraph()
		# Row-major indexing replaces the numpy reshape: numpy is not imported
		# by this module.
		for row in range(size):
			for col in range(size):
				if bits[row * size + col] == 1:
					g.add_edge(row, col)
		u_g = g.to_undirected()
		if len(g) == size and nx.is_connected(u_g):
			return g
		return False

	def genArray(self, i):
		"""Return the binary digits of `i` as a list left-padded with zeros to bit_len."""
		if self.bit_len is None:
			# Allow use before genBinValue() has been called.
			self.genBinValue()
		digits = [int(x) for x in bin(i)[2:]]
		return [0] * (self.bit_len - len(digits)) + digits
287 | 


--------------------------------------------------------------------------------
/search-engine/db.py:
--------------------------------------------------------------------------------
  1 | import cPickle as pickle 
  2 | from search import *
  3 | from nearpy import Engine
  4 | from nearpy.hashes import RandomDiscretizedProjections
  5 | from nearpy.filters import NearestFilter, UniqueFilter
  6 | from nearpy.distances import EuclideanDistance
  7 | from nearpy.distances import CosineDistance
  8 | from nearpy.hashes import RandomBinaryProjections
  9 | from nearpy.experiments import DistanceRatioExperiment
 10 | from redis import Redis
 11 | from nearpy.storage import RedisStorage
 12 | from feature import *
 13 | import numpy as np
 14 | import os
 15 | import pdb
 16 | import argparse
 17 | import time
 18 | import numpy as np
 19 | from refactoring import *
 20 | import pymongo
 21 | from pymongo import MongoClient
 22 | 
 23 | def initDB():
 24 | 	client = MongoClient()
 25 | 	client = MongoClient('localhost', 27017)
 26 | 	client = MongoClient('mongodb://localhost:27017/')
 27 | 	db = client.test_database
 28 | 	db = client['iot-encoding']
 29 | 	return db
 30 | 
# Module-level MongoDB handles, created when the module is imported.
# NOTE(review): the name `db` bound here is rebound by the `class db`
# definition that follows; only `posts` keeps referring to the database.
db = initDB()
posts = db.posts
 33 | 
 34 | class db:
 35 | 	
 36 | 	def __init__(self):
 37 | 		self.feature_list = {}
 38 | 		self.engine = None
 39 | 
 40 | 	def loadHashmap(self, feature_size, result_n):
 41 | 		# Create redis storage adapter
 42 | 		redis_object = Redis(host='localhost', port=6379, db=0)
 43 | 		redis_storage = RedisStorage(redis_object)
 44 | 		pdb.set_trace()
 45 | 		try:
 46 | 			# Get hash config from redis
 47 | 			config = redis_storage.load_hash_configuration('test')
 48 | 			# Config is existing, create hash with None parameters
 49 | 			lshash = RandomBinaryProjections(None, None)
 50 | 			# Apply configuration loaded from redis
 51 | 			lshash.apply_config(config)
 52 | 			
 53 | 		except:
 54 | 			# Config is not existing, create hash from scratch, with 10 projections
 55 | 			lshash = RandomBinaryProjections('test', 0)
 56 | 			
 57 | 
 58 | 		# Create engine for feature space of 100 dimensions and use our hash.
 59 | 		# This will set the dimension of the lshash only the first time, not when
 60 | 		# using the configuration loaded from redis. Use redis storage to store
 61 | 		# buckets.
 62 | 		nearest = NearestFilter(1000)
 63 | 		#self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
 64 | 		pdb.set_trace()
 65 | 		self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
 66 | 
 67 | 		# Do some stuff like indexing or querying with the engine...
 68 | 
 69 | 		# Finally store hash configuration in redis for later use
 70 | 		redis_storage.store_hash_configuration(lshash)
 71 | 
 72 | 	def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
 73 | 		if fvector is None:
 74 | 			return
 75 | 		#ftuple = tuple([fvector])
 76 | 		self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
 77 | 
 78 | 	def batch_appendDB(self, binary_name, features, firmware_name=""):
 79 | 		for funcname in features:
 80 | 			feature = features[funcname]
 81 | 			#pdb.set_trace()
 82 | 			self.appendToDB(binary_name, funcname, feature, firmware_name)
 83 | 
 84 | 	def batch_appendDBbyDir(self, base_dir):
 85 | 		cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
 86 | 		i = 0
 87 | 		for v in cursor:
 88 | 			print i
 89 | 			i+=1
 90 | 			binary_name = v['binary_name']
 91 | 			funcname = v['func_name']
 92 | 			firmware_name = v['firmware_name']
 93 | 			feature = v['fvector']
 94 | 			self.appendToDB(binary_name, funcname, feature, firmware_name)
 95 | 
 96 | 	def batch_appendDBbyDir1(self, base_dir):
 97 | 		image_dir = os.path.join(base_dir, "image")
 98 | 		firmware_featrues={}
 99 | 		bnum = 0
100 | 		fnum = 0
101 | 		i  = 0
102 | 		pdb.set_trace()
103 | 		for firmware_name in os.listdir(image_dir):
104 | 			print firmware_name
105 | 			firmware_featrues[firmware_name] = {}
106 | 			firmware_dir = os.path.join(image_dir, firmware_name)
107 | 			for binary_name in os.listdir(firmware_dir):
108 | 				if binary_name.endswith(".features"):
109 | 					bnum += 1
110 | 					featrues_dir = os.path.join(firmware_dir, binary_name)
111 | 					featrues = pickle.load(open(featrues_dir, "r"))
112 | 					for funcname in featrues:
113 | 						fnum +=1
114 | 						#pdb.set_trace()
115 | 						feature = featrues[funcname]
116 | 						self.appendToDB(binary_name, funcname, feature, firmware_name)
117 | 					del featrues
118 | 		print("bnum ", bnum)
119 | 		print("fnum ", fnum)
120 | 
121 | 	def dump(self, base_dir):
122 | 		db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
123 | 		pickle.dump(self.feature_list, open(db_dir, 'w'))
124 | 		db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
125 | 		pickle.dump(self.engine, open(db_dir, 'w'))
126 | 
127 | 	def loadDB(self, base_dir):
128 | 		db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
129 | 		self.feature_list = pickle.load(open(db_dir, 'r'))
130 | 		db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
131 | 		self.engine = pickle.load(open(db_dir, 'r'))
132 | 
133 | 	def findF(self, binary_name, funcname):
134 | 		x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
135 | 		return x[0]
136 | 
def retrieveFeaturesByDir(n, base_dir, db_instance=None):
	"""Load the '<binary>_cb<n>.features' pickles of a firmware directory.

	n           -- graphlet size used in the file names.
	base_dir    -- directory whose entries ending in '.features' are
	               treated as firmware directories.
	db_instance -- optional index object; when given, every loaded vector
	               is also stored through its appendToDB method.

	Returns {firmware_name: {binary_name: loaded features}}.

	The previous body could not run: it called the non-existent
	`str.endWith`, referenced `self` from a module-level function, passed
	the appendToDB arguments in the wrong order and looked for
	'<entry>_cb<n>.features' next to every directory entry.

	NOTE(review): as before, only the first matching firmware directory is
	processed -- confirm whether that limit is still wanted.
	"""
	suffix = "_cb" + str(n) + ".features"
	firmware_featrues = {}
	for firmware_name in os.listdir(base_dir):
		if not firmware_name.endswith(".features"):
			continue
		firmware_dir = os.path.join(base_dir, firmware_name)
		binaries = {}
		for entry in os.listdir(firmware_dir):
			if not entry.endswith(suffix):
				continue
			binary_name = entry[:-len(suffix)]
			# NOTE: pickle.load must only be used on trusted files.
			with open(os.path.join(firmware_dir, entry), "rb") as fh:
				featrues = pickle.load(fh)
			binaries[binary_name] = featrues
			if db_instance is not None:
				for funcname in featrues:
					db_instance.appendToDB(binary_name, funcname, featrues[funcname], firmware_name)
		firmware_featrues[firmware_name] = binaries
		break
	return firmware_featrues
155 | 
def retrieveFeatures(n, base_dir, filename, funcs):
	"""Load <base_dir>/5000/<filename>_cb<n>.features as numpy arrays.

	Returns a dict mapping every name of the pickled mapping to
	np.asarray(value).  `funcs` is accepted but not used.
	"""
	featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
	# Binary mode and a context manager: the handle is closed deterministically.
	# NOTE: pickle.load must only be used on trusted files.
	with open(featrues_dir, "rb") as fh:
		featrues = pickle.load(fh)
	return dict((name, np.asarray(featrues[name])) for name in featrues)
167 | 
def retrieveVuldb(base_input_dir):
	"""Return the object pickled in the file <base_input_dir>/vul."""
	vul_path = os.path.join(base_input_dir, "vul")
	# Binary mode and a context manager: the handle is closed deterministically.
	# NOTE: pickle.load must only be used on trusted files.
	with open(vul_path, "rb") as fh:
		return pickle.load(fh)
172 | 
173 | 
def retrieveFeaturesx(filename):
	"""Return the object pickled in ./data/<filename>.features (relative to the working directory)."""
	ida_input_dir = os.path.join("./data/", filename + ".features")
	# Binary mode and a context manager: the handle is closed deterministically.
	# NOTE: pickle.load must only be used on trusted files.
	with open(ida_input_dir, "rb") as fh:
		return pickle.load(fh)
178 | 
def retrieveQueries(n, base_dir, filename1, featrues_src):
	"""Load <base_dir>/5000/<filename1>_cb<n>.features as numpy arrays.

	Returns a dict mapping every name of the pickled mapping to
	np.asarray(value).  `featrues_src` is accepted but not used.
	"""
	featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
	# Binary mode and a context manager: the handle is closed deterministically.
	# NOTE: pickle.load must only be used on trusted files.
	with open(featrues_dir, "rb") as fh:
		featrues = pickle.load(fh)
	return dict((name, np.asarray(featrues[name])) for name in featrues)
190 | 
def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
	"""Load <base_dir>/<firmware_name>/<filename1>_cb<n>.features as numpy arrays.

	Returns a dict mapping every name of the pickled mapping to
	np.asarray(value).
	"""
	featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
	# Binary mode and a context manager: the handle is closed deterministically.
	# NOTE: pickle.load must only be used on trusted files.
	with open(featrues_dir, "rb") as fh:
		featrues = pickle.load(fh)
	return dict((name, np.asarray(featrues[name])) for name in featrues)
199 | 
def retrieveQuery(n, base_dir, filename, funcname):
	"""Return, as a numpy array, the first vector whose name contains `funcname`.

	The vectors are read from <base_dir>/<filename>_cb<n>.features.
	Raises IndexError when no name contains `funcname`.
	"""
	featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
	# Binary mode and a context manager: the handle is closed deterministically.
	# NOTE: pickle.load must only be used on trusted files.
	with open(featrues_dir, "rb") as fh:
		featrues = pickle.load(fh)
	f = [featrues[v] for v in featrues if funcname in v][0]
	return np.asarray(f)
205 | 
def parse_command():
	"""Parse the command line of the search script and return the argparse namespace.

	Options: --base_input_dir, --output_dir, --filename1, --filename2 (all
	strings) and --size (int).  Every option is optional and defaults to
	None.
	"""
	# The description and the --filename1/--filename2 help texts used to be
	# copy-pasted from unrelated options.
	parser = argparse.ArgumentParser(description='Query the function-feature index with the functions of a second binary.')
	parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
	parser.add_argument('--output_dir', type=str, help="output dir")
	parser.add_argument("--filename1", type=str, help="name of the binary whose features are indexed")
	parser.add_argument("--filename2", type=str, help="name of the binary whose features are used as queries")
	parser.add_argument("--size", type=int, help="the size of each graphlet")
	return parser.parse_args()
216 | 
def loadFuncs(path):
	"""Read the function names listed one per line in <path>/func_candid.

	Returns a dict mapping every name (the line without its trailing
	newline) to 1.
	"""
	funcs = {}
	x86_dir = os.path.join(path, "func_candid")
	# The context manager closes the file, which was previously left open.
	with open(x86_dir, "r") as fp:
		for line in fp:
			funcs[line.split("\n")[0]] = 1
	return funcs
227 | 
def dump(path, featrues, queries):
	"""Write both feature sets to the tab-separated file <path>/matrix.

	Every line holds a label ('x86' for `featrues`, 'mips' for `queries`),
	the name, and the feature values formatted with three decimals.  Vectors
	of any length are accepted; 16-element vectors produce exactly the
	output of the former fixed format string.
	"""
	def _line(label, name, values):
		# One output line: label, name, then every value as %.3f.
		cells = ["%s" % label, "%s" % name]
		cells.extend("%.3f" % value for value in values)
		return "\t".join(cells) + "\n"

	# The context manager closes the file even when formatting fails.
	with open(path + "/" + "matrix", 'w') as fp:
		for name in featrues:
			fp.write(_line("x86", name, featrues[name]))
		for name in queries:
			fp.write(_line("mips", name, queries[name]))
243 | 
244 | 
def queryBytwo(base_input_dir, filename1, filename2, n):
	"""Index the functions of `filename1`, query with `filename2`, print accuracies.

	For every threshold in 1, 11, ..., 201 the fraction of queries whose
	same-named function appears among the returned neighbours at a position
	not greater than the threshold is printed.  Only query names that also
	exist in the indexed feature set take part.
	"""
	db_instance = db()
	funcs = loadFuncs(base_input_dir)
	db_instance.loadHashmap(n, 50000)
	featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
	queries = retrieveQueries(n, base_input_dir, filename2, funcs)
	# The result is not used; the call is kept so that a missing "vul" file
	# still fails here, as it did before.
	retrieveVuldb(base_input_dir)
	db_instance.batch_appendDB(filename1, featrues)

	# The neighbours of a query do not depend on the threshold, so every
	# query is run once instead of once per threshold.
	ranks = []
	for name in queries:
		if name not in featrues:
			continue
		neighbours = db_instance.engine.neighbours(queries[name])
		rank = None
		try:
			for position, neighbour in enumerate(neighbours):
				if name in neighbour[1]:
					rank = position
					break
		except (TypeError, IndexError):
			# Malformed neighbour entry: counted as a miss, as before.
			rank = None
		ranks.append((name, rank))

	for threthold in range(1, 210, 10):
		hits = sum(1 for _, rank in ranks if rank is not None and rank <= threthold)
		# Report 0.0 instead of dividing by zero when no query was evaluated.
		acc = hits * 1.0 / len(ranks) if ranks else 0.0
		print(acc)
305 | 
def queryAll(base_dir, firmware_name, filename1, n):
	"""Index the stored firmware functions, run every query, print the accuracy.

	Prints the time spent indexing, the running index of every query, and
	finally the fraction of queries whose same-named function is returned at
	a position not greater than 155.
	"""
	threthold = 155
	db_instance = db()
	db_instance.loadHashmap(n, 50000)
	queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
	start = time.time()
	# batch_appendDBbyDir takes the base directory only; the former call
	# passed `n` as well and raised TypeError.
	db_instance.batch_appendDBbyDir(base_dir)
	print(time.time() - start)

	hit = []
	for i, name in enumerate(queries):
		print(i)
		neighbours = db_instance.engine.neighbours(queries[name])
		try:
			rank = [position for position, neighbour in enumerate(neighbours) if name in neighbour[1]]
		except (TypeError, IndexError):
			# Malformed neighbour entry: counted as a miss, as before.
			rank = []
		if len(rank) > 1:
			print("stop")
		if rank and rank[0] <= threthold:
			hit.append(1)
		else:
			hit.append(0)

	# Report 0.0 instead of dividing by zero when there is no query.
	acc = sum(hit) * 1.0 / len(hit) if hit else 0.0
	print(acc)
348 | 
if __name__ == "__main__":
	# Script entry point: read the options and run the two-binary query.
	# The leftover debugger breakpoint was removed so the script can run
	# unattended.
	args = parse_command()
	base_dir = args.base_input_dir
	filename1 = args.filename1
	filename2 = args.filename2
	n = args.size
	queryBytwo(base_dir, filename1, filename2, n)
357 | 


--------------------------------------------------------------------------------