├── .gitignore ├── plugin.json ├── llil_categories.py ├── spp_primes.py ├── tarjan_sort.py ├── README.md ├── providers.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugin": { 3 | "name": "SimilarNinja Plugin", 4 | "type": ["core", "ui", "architecture", "binaryview"], 5 | "api": "python2", 6 | "description": "Find similar functions with Binary Ninja", 7 | "longdescription": "", 8 | "license": { 9 | "name": "GPLv2", 10 | "text": "" 11 | }, 12 | "dependencies": { 13 | "pip": [], 14 | "apt": [], 15 | "installers": [], 16 | "other": [] 17 | }, 18 | "version": "0.2 alpha", 19 | "author": "buherator", 20 | "minimumBinaryNinjaVersion": { 21 | "dev": "1.0.dev-576", 22 | "release": "0" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llil_categories.py: -------------------------------------------------------------------------------- 1 | from binaryninja.enums import LowLevelILOperation 2 | 3 | LLIL_REDIRECT = [ LowLevelILOperation.LLIL_JUMP, 4 | LowLevelILOperation.LLIL_JUMP_TO, 5 | LowLevelILOperation.LLIL_CALL, 6 | LowLevelILOperation.LLIL_RET, 7 | LowLevelILOperation.LLIL_NORET, 8 | LowLevelILOperation.LLIL_IF, 9 | LowLevelILOperation.LLIL_GOTO, 10 | LowLevelILOperation.LLIL_FLAG_COND, 11 | LowLevelILOperation.LLIL_CMP_E, 12 | LowLevelILOperation.LLIL_CMP_NE, 13 | LowLevelILOperation.LLIL_CMP_SLT, 14 | LowLevelILOperation.LLIL_CMP_ULT, 15 | LowLevelILOperation.LLIL_CMP_SLE, 16 | LowLevelILOperation.LLIL_CMP_ULE, 17 | LowLevelILOperation.LLIL_CMP_SGE, 18 | LowLevelILOperation.LLIL_CMP_UGE, 19 | LowLevelILOperation.LLIL_CMP_SGT, 20 | LowLevelILOperation.LLIL_CMP_UGT, 21 | ] 22 | 23 | LLIL_ARITHMETIC = [ LowLevelILOperation.LLIL_ADD, 24 | 
LowLevelILOperation.LLIL_ADC, 25 | LowLevelILOperation.LLIL_SUB, 26 | LowLevelILOperation.LLIL_SBB, 27 | LowLevelILOperation.LLIL_LSL, 28 | LowLevelILOperation.LLIL_LSR, 29 | LowLevelILOperation.LLIL_ASR, 30 | LowLevelILOperation.LLIL_ROL, 31 | LowLevelILOperation.LLIL_RLC, 32 | LowLevelILOperation.LLIL_ROR, 33 | LowLevelILOperation.LLIL_RRC, 34 | LowLevelILOperation.LLIL_MUL, 35 | LowLevelILOperation.LLIL_MULU_DP, 36 | LowLevelILOperation.LLIL_MULS_DP, 37 | LowLevelILOperation.LLIL_DIVU, 38 | LowLevelILOperation.LLIL_DIVU_DP, 39 | LowLevelILOperation.LLIL_DIVS, 40 | LowLevelILOperation.LLIL_DIVS_DP, 41 | LowLevelILOperation.LLIL_MODU, 42 | LowLevelILOperation.LLIL_MODU_DP, 43 | LowLevelILOperation.LLIL_MODS, 44 | LowLevelILOperation.LLIL_MODS_DP, 45 | ] 46 | 47 | LLIL_LOGIC = [ LowLevelILOperation.LLIL_NEG, 48 | LowLevelILOperation.LLIL_NOT, 49 | LowLevelILOperation.LLIL_AND, 50 | LowLevelILOperation.LLIL_OR, 51 | LowLevelILOperation.LLIL_XOR, 52 | ] -------------------------------------------------------------------------------- /spp_primes.py: -------------------------------------------------------------------------------- 1 | # Prime constants from Diaphora: https://github.com/joxeankoret/diaphora/blob/master/jkutils/graph_hashes.py 2 | 3 | #------------------------------------------------------------------------------- 4 | # Different type of basic blocks (graph nodes). 5 | NODE_ENTRY = 2 6 | NODE_EXIT = 3 7 | NODE_NORMAL = 5 8 | 9 | # 10 | # NOTE: In the current implementation (Nov-2018) all edges are considered as if 11 | # they were conditional. Keep reading... 12 | # 13 | EDGE_IN_CONDITIONAL = 7 14 | EDGE_OUT_CONDITIONAL = 11 15 | 16 | # 17 | # Reserved but unused because, probably, it doesn't make sense when comparing 18 | # multiple different architectures. 19 | # 20 | EDGE_IN_UNCONDITIONAL = 13 21 | EDGE_OUT_UNCONDITIONAL = 17 22 | 23 | # 24 | # The following are feature types that aren't applied at basic block but rather 25 | # at function level. 
# Downloaded from http://www.logarithmic.net/pfh-files/blog/01208083168/sort.py

"""

   Tarjan's algorithm and topological sorting implementation in Python

   by Paul Harrison

   Public domain, do with it as you will

"""


def strongly_connected_components(graph):
    """Find the strongly connected components in a graph using
    Tarjan's algorithm.

    graph should be a dictionary mapping node names to
    lists of successor nodes. Every successor has to be a key of
    graph as well; a missing one raises KeyError.

    Returns a list of tuples, one tuple of node names per component.
    A component is appended as soon as the traversal of its root node
    is finished.

    The traversal keeps its own stack of frames instead of recursing,
    so long successor chains cannot exceed the interpreter's
    recursion limit.
    """

    result = []
    stack = []
    low = {}

    def open_node(node):
        # Number the node, push it on the component stack and return
        # the frame describing its pending traversal.
        num = len(low)
        low[node] = num
        frame = (node, num, len(stack), iter(graph[node]))
        stack.append(node)
        return frame

    for root in graph:
        if root in low:
            continue

        frames = [open_node(root)]
        while frames:
            node, num, stack_pos, successors = frames[-1]
            for successor in successors:
                if successor not in low:
                    # Descend; the parent's iterator keeps its position
                    # and is resumed once the child frame is done.
                    frames.append(open_node(successor))
                    break
                low[node] = min(low[node], low[successor])
            else:
                # Every successor was handled: close this node.
                frames.pop()
                if num == low[node]:
                    component = tuple(stack[stack_pos:])
                    del stack[stack_pos:]
                    result.append(component)
                    # len(graph) is larger than any assigned number, so
                    # finished nodes never lower another node's value.
                    for item in component:
                        low[item] = len(graph)
                if frames:
                    parent = frames[-1][0]
                    low[parent] = min(low[parent], low[node])

    return result


def topological_sort(graph):
    """Order the nodes of graph so that each node precedes its successors.

    graph should be a dictionary mapping node names to lists of
    successor nodes; every successor has to be a key of graph.

    Nodes whose predecessor count never drops to zero (nodes on a
    cycle, for example) are left out of the returned list.
    """
    count = {}
    for node in graph:
        count[node] = 0
    for node in graph:
        for successor in graph[node]:
            count[successor] += 1

    ready = [node for node in graph if count[node] == 0]

    result = []
    while ready:
        node = ready.pop()
        result.append(node)

        for successor in graph[node]:
            count[successor] -= 1
            if count[successor] == 0:
                ready.append(successor)

    return result


def robust_topological_sort(graph):
    """ First identify strongly connected components,
        then perform a topological sort on these components.

    Returns a list of components (tuples of node names).
    """

    components = strongly_connected_components(graph)

    node_component = {}
    for component in components:
        for node in component:
            node_component[node] = component

    component_graph = {}
    for component in components:
        component_graph[component] = []

    for node in graph:
        node_c = node_component[node]
        for successor in graph[node]:
            successor_c = node_component[successor]
            # Edges inside a component are dropped, which leaves an
            # acyclic graph of components.
            if node_c != successor_c:
                component_graph[node_c].append(successor_c)

    return topological_sort(component_graph)


if __name__ == '__main__':
    d = {0: [1, 2, 4], 1: [3, 4], 2: [0, 3], 3: [], 4: [1]}
    # A single parenthesised argument prints the same text on
    # Python 2 and Python 3.
    print(d)
    print("scc %s" % (strongly_connected_components(d),))
    print("rts %s" % (robust_topological_sort(d),))
11 | 12 | Currently the following algorithms are implemented: 13 | 14 | * KOKA features bound together in an SPP hash: 15 | * NODE_ENTRY, NODE_EXIT, NODE_NORMAL 16 | * EDGE_OUT_CONDITIONAL, EDGE_IN_CONDITIONAL 17 | * FEATURE_FUNC_NO_RET, FEATURE_FUNC_LIB 18 | * Features from the [original paper](https://census-labs.com/media/efficient-features-bindiff.pdf): 19 | * Digraph Signature 20 | * String histogram 21 | * Others: 22 | * Basic Block Count 23 | * LLIL instruction types (LLIL_FEATURE_REDIRECT, LLIL_FEATURE_LOGIC, LLIL_FEATURE_ARITHMETIC) 24 | 25 | Experimental infrastructure is available for exact and partial matching. 26 | 27 | Early stage of development, code is unstable. 28 | 29 | Bugs? Very likely, please use the Issue Tracker! 30 | 31 | ### Why? 32 | 33 | The licensing model of IDA sucks, we need tools for independent frameworks. Other design goals: 34 | 35 | * Easy feature vector composition - creation of custom similarity metrics should be easy (at src level) 36 | * No external databases - Redundant data storage should be avoided 37 | * SQLite based compatibility layer for Diaphora would be nice 38 | 39 | ### Usage 40 | 41 | The plugin adds two menu items: one for generating feature vectors for the functions of a binary, another for comparing the results of the previous one. Results can be saved to standalone JSON files or along with the analysis data in the BNDB database (the latter is recommended). When comparing results the plugin tries to load raw JSON formatted data unless the extension of the opened file is ".bndb" - in that case the JSON object is read from database metadata. Comparison results can be saved to standalone JSON files. 42 | 43 | Beware that some feature extractor classes can be stateful. When working with multiple views it is usually a good idea to `reload(similarninja)` in the Python Console when using it on a different tab. 
44 | 45 | ## Customization 46 | 47 | You can compose your custom feature vector generator by editing the `PROVIDERS` list. Each list element should be a `FeatureProvider` subclass instance or a tuple. The `FeatureProvider` will be used to calculate similarity metrics for the corresponding vector position. In case of tuples the first element should be the `FeatureProvider` instance, while the second one is a float that will be used as a weight for the element. The default weight for each element is 1.0. 48 | 49 | ### Examples 50 | 51 | Using String Histogram with a Small Primes Product of the Strongly Connected Features and Function Flags. Both features have a weight of 1.0 when comparing: 52 | 53 | ``` 54 | [StringHistogramProvider(), 55 | SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures])] 56 | ``` 57 | 58 | 59 | Using String Histogram with the Strongly Connected Features and Function Flags as separate prime products. String Histogram has doubled weight: 60 | 61 | ``` 62 | [(StringHistogramProvider(), 2.0), 63 | SPPFeatureProvider([FuncStronglyConnectedFeatures]), 64 | SPPFeatureProvider([FuncFlagsFeatures])] 65 | ``` 66 | 67 | 68 | ## Testing 69 | 70 | With the corresponding views open, save the current BinaryView objects in the Binary Ninja console: 71 | 72 | ``` 73 | >>> bv0=bv 74 | # Switch views on the GUI 75 | >>> bv1=bv 76 | ``` 77 | 78 | Invoke the tester function: 79 | 80 | ``` 81 | >>> similarninja.tester(bv0, bv1) 82 | ``` 83 | 84 | This will do the feature extraction (features stored in the database for later use) and comparison, then use the available symbol information to measure accuracy. 85 | 86 | The function allows testing multiple feature provider compositions at once, refer to the source for details! 87 | 88 | ### Results 89 | 90 | The following results are based on debug information contained in unstripped binaries (exact function name match). 
91 | 92 | The current algorithm for partial matches is very liberal and will try to find a match for everything - this is the reason of high incorrect match numbers. However these "incorrect" numbers also contain actual good matches (like matches between `shaX_process_blockN()` functions). 93 | 94 | ### Busybox 95 | 96 | #### 1.29.1 vs. 1.29.2 x64 ELF 97 | 98 | | Func # | Correct match | Incorrect match | 99 | |--------|---------------|-----------------| 100 | | 3114 | 1600 (51.3%) | 1488 (47.8%) | 101 | 102 | Feature vector providers: 103 | ``` 104 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(),BBLCountProvider()] 105 | ``` 106 | 107 | | Func # | Correct match | Incorrect match | 108 | |--------|---------------|-----------------| 109 | | 3114 | 2098 (67.4%) | 981 (31.5%) | 110 | 111 | Feature vector providers: 112 | ``` 113 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(), StringHistogramProvider()] 114 | ``` 115 | 116 | 117 | ### SQLite 118 | 119 | #### 3.25.03 vs. 
3.25.00 x64 ELF* 120 | 121 | | Func # | Correct match | Incorrect match | 122 | |--------|---------------|-----------------| 123 | | 3122 | 1432 (45.9%) | 1689 (54.1%) | 124 | 125 | Feature vector providers: 126 | ``` 127 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(),BBLCountProvider()] 128 | ``` 129 | 130 | | Func # | Correct match | Incorrect match | 131 | |--------|---------------|-----------------| 132 | | 3122 | 1618 (51.8%) | 1503 (48.1%) | 133 | 134 | Feature vector providers: 135 | ``` 136 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(), StringHistogramProvider()] 137 | ``` 138 | 139 | ## TODO 140 | 141 | A lot of things... 142 | 143 | * More matcher algorithms 144 | * Algorithms from the [original paper](https://census-labs.com/media/efficient-features-bindiff.pdf): 145 | * Markov lumping 146 | * Instruction histogram (with capstone/pyxed/other external lib?) 147 | * Better integration with the UI 148 | * Without a BinaryView we lose cross-function control-flow data, so function predecessors/successors can't be discovered during matching 149 | * Multiple ways to handle this, have to decide which way to go... 150 | * LICENSE file... 
151 | 152 | ### Binary Ninja API wishlist 153 | 154 | * Instruction level classification 155 | * Data XRefs 156 | 157 | ## Minimum Version 158 | 159 | This plugin requires the following minimum version of Binary Ninja: 160 | 161 | * release - 0 162 | * dev - 1.0.dev-576 163 | 164 | ## Required Dependencies 165 | 166 | The following dependencies are required for this plugin: 167 | 168 | * pip - 169 | * installers - 170 | * other - 171 | * apt - 172 | 173 | ## License 174 | This plugin is released under a GPLv2 license as required by Diaphora. 175 | 176 | -------------------------------------------------------------------------------- /providers.py: -------------------------------------------------------------------------------- 1 | from binaryninja import * 2 | from tarjan_sort import * 3 | from spp_primes import * 4 | from llil_categories import * 5 | 6 | 7 | class SPPBBLProvider: 8 | @staticmethod 9 | def calculate(bbl): 10 | pass 11 | 12 | class SPPFunctionProvider: 13 | @staticmethod 14 | def calculate(func): 15 | pass 16 | 17 | class FeatureProvider: 18 | def calculate(self, func): 19 | pass 20 | 21 | @staticmethod 22 | def compare(f0, f1): 23 | pass 24 | 25 | class BBLTypeFeatures(SPPBBLProvider): 26 | @staticmethod 27 | def calculate(b): 28 | ret = 1 29 | 30 | if len(b.incoming_edges) == 0: 31 | ret *= NODE_ENTRY 32 | if len(b.outgoing_edges) == 0: 33 | ret *= NODE_EXIT 34 | ret *= NODE_NORMAL 35 | return ret 36 | 37 | class BBLEdgeFeatures(SPPBBLProvider): 38 | @staticmethod 39 | def calculate(b): 40 | ret = 1 41 | 42 | ins = b.incoming_edges 43 | outs = b.outgoing_edges 44 | 45 | for e in outs: 46 | ret *= EDGE_OUT_CONDITIONAL 47 | for e in ins: 48 | ret *= EDGE_IN_CONDITIONAL 49 | 50 | return ret 51 | 52 | class FuncInstructionFeatures(SPPFunctionProvider): 53 | @staticmethod 54 | def calculate(func): 55 | ret = 1 56 | for block in func.low_level_il: 57 | for ins in block: 58 | if ins.operation in LLIL_REDIRECT: 59 | ret *= LLIL_FEATURE_REDIRECT 60 | elif 
ins.operation in LLIL_ARITHMETIC: 61 | ret *= LLIL_FEATURE_ARITHMETIC 62 | elif ins.operation in LLIL_LOGIC: 63 | ret *= LLIL_FEATURE_LOGIC 64 | return ret 65 | 66 | class FuncStronglyConnectedFeatures(SPPFunctionProvider): 67 | @staticmethod 68 | def calculate(func): 69 | bb_relations = {} 70 | ret = 1 71 | for block in func.basic_blocks: 72 | # Creating bb_relations 73 | bb_relations[block.start] = [] 74 | for e in block.outgoing_edges: 75 | bb_relations[block.start].append(e.target.start) 76 | 77 | for e in block.incoming_edges: 78 | try: 79 | bb_relations[e.source.start].append(block.start) 80 | except KeyError: 81 | bb_relations[e.source.start] = [block.start] 82 | try: 83 | strongly_connected = strongly_connected_components(bb_relations) 84 | for sc in strongly_connected: 85 | if len(sc) > 1: 86 | ret *= FEATURE_LOOP 87 | else: 88 | if sc[0] in bb_relations and sc[0] in bb_relations[sc[0]]: 89 | ret *= FEATURE_LOOP 90 | ret *= FEATURE_STRONGLY_CONNECTED ** len(strongly_connected) 91 | except: 92 | log_error("Exception: %s" % (sys.exc_info()[1])) 93 | return ret 94 | 95 | class FuncFlagsFeatures(SPPFunctionProvider): 96 | @staticmethod 97 | def calculate(func): 98 | ret = 1 99 | if not func.can_return: 100 | ret *= FEATURE_FUNC_NO_RET 101 | if func.symbol.type is SymbolType.ImportedFunctionSymbol: 102 | ret *= FEATURE_FUNC_LIB 103 | # [TODO] Binary Ninja API for Thunks 104 | return ret 105 | 106 | class SPPFeatureProvider(FeatureProvider): 107 | def __init__(self, features=[]): 108 | self.features=features 109 | 110 | def calculate(self, func): 111 | ret=1 112 | for p in self.features: 113 | for block in func.basic_blocks: 114 | if issubclass(p,SPPBBLProvider): 115 | ret *= p.calculate(block) 116 | if issubclass(p,SPPFunctionProvider): 117 | ret *= p.calculate(func) 118 | return ret 119 | 120 | @staticmethod 121 | def _primes(n): 122 | # http://outslide.tumblr.com/post/167558674272 123 | if n in ALL_PRIMES: 124 | return [n] 125 | # This is slow as hell for 
large numbers 126 | i = 0 127 | primes=[] 128 | while n != 1: 129 | if n % ALL_PRIMES[i] == 0: 130 | primes.append(ALL_PRIMES[i]) 131 | n = n / ALL_PRIMES[i] 132 | else: 133 | i += 1 134 | if i >= len(ALL_PRIMES): 135 | log_error("Something is fucky with SPP primes! %x " % (n)) 136 | break 137 | return primes 138 | 139 | @staticmethod 140 | def _hcfnaive(a,b): 141 | if(b==0): 142 | return a 143 | else: 144 | return SPPFeatureProvider._hcfnaive(b,a%b) 145 | 146 | @staticmethod 147 | def compare(f0,f1): 148 | if f0 == f1: 149 | return 1.0 150 | if f0 == 0 or f1 == 0: 151 | return 0.0 152 | else: 153 | hcf = SPPFeatureProvider._hcfnaive(f0,f1) 154 | f0_hcf_primes=SPPFeatureProvider._primes(f0/hcf) 155 | f1_hcf_primes=SPPFeatureProvider._primes(f1/hcf) 156 | try: 157 | if len(f0_hcf_primes) > len(f1_hcf_primes): 158 | return 1-(float(len(f0_hcf_primes))/len(SPPFeatureProvider._primes(f0))) 159 | else: 160 | return 1-(float(len(f1_hcf_primes))/len(SPPFeatureProvider._primes(f1))) 161 | except OverflowError: 162 | return 0.0 163 | except ZeroDivisionError: 164 | log_error("Division by zero: %X %X HCF: %X Primes: %s %s " % (f0,f1,hcf,SPPFeatureProvider._primes(f0),SPPFeatureProvider._primes(f1))) 165 | raise 166 | 167 | class DigraphFeatureProvider(FeatureProvider): 168 | def __init__(self): 169 | self.visited=set() 170 | 171 | def dfs(self,block,value): 172 | #log_info("Entering %x value: %x" % (block.start,value)) 173 | 174 | if block.start not in self.visited: 175 | value *= 2 176 | value += 1 177 | #log_info("Not visited yet! 
%x" % (value)) 178 | self.visited.add(block.start) 179 | else: 180 | return value 181 | 182 | for e in block.outgoing_edges: 183 | value=self.dfs(e.target, value) 184 | value *= 2 185 | #log_info("Leaving %x Value: %x" % (block.start,value)) 186 | return value 187 | 188 | def calculate(self,func): 189 | block=func.get_basic_block_at(func.start) 190 | 191 | value=self.dfs(block, 0) 192 | #log_info("Final Value: %d" % value) 193 | return value 194 | 195 | @staticmethod 196 | def compare(f0,f1): 197 | binlen0=float(len(bin(f0))) 198 | binlen1=float(len(bin(f1))) 199 | hamming=float(bin(f0^f1).count('1')) 200 | if binlen0 >= binlen1: 201 | return 1.0-(hamming/binlen0) 202 | else: 203 | return 1.0-(hamming/binlen1) 204 | 205 | class BBLCountProvider(FeatureProvider): 206 | def calculate(self, func): 207 | return len(func.basic_blocks) 208 | 209 | @staticmethod 210 | def compare(f0,f1): 211 | 212 | if f0>=f1: 213 | return 1-(float(f0-f1)/f0) 214 | else: 215 | return 1-(float(f1-f0)/f1) 216 | 217 | class StringHistogramProvider(FeatureProvider): 218 | def __init__(self): 219 | self.cache=None 220 | 221 | 222 | def calculate(self, func): 223 | # String info is available for the global BinaryView 224 | # String histograms are calculated and cached at first call 225 | if self.cache is None: 226 | self.cache = {} 227 | vectors = {} 228 | bv=func.view 229 | for s in bv.strings: 230 | value = s.value 231 | str_xrefs = bv.get_code_refs(s.start) 232 | #log_info("--------- %s" % s.value) 233 | 234 | # Updating character counts for all functions referencing the current string 235 | for x in str_xrefs: 236 | if x.function.start not in vectors: 237 | vectors[x.function.start] = [0]*256 238 | for c in s.value: 239 | vectors[x.function.start][ord(c)] += 1 240 | #log_info(repr(vectors[x.function.start])) 241 | for f, c_vec in vectors.iteritems(): 242 | #log_info("%s" % repr(c_vec)) 243 | self.cache[f] = 0 244 | cmax = 0 245 | begin = 0 246 | started = False 247 | end = 255 248 | 249 | # 
Find maximum and cut empty ends 250 | for i in xrange(0,256): 251 | if c_vec[i] != 0: 252 | if not started: 253 | begin = i 254 | started = True 255 | end = i 256 | if c_vec[i] > cmax: 257 | cmax = c_vec[i] 258 | #log_info("%d %d %s" % (begin,end,repr(c_vec[begin:end+1]))) 259 | 260 | # Normalize charcter counts to 0-15 and encode vector as integer 261 | # [4,0,8,0,16,8,4,2] -> 0x3070f731 262 | for i in xrange(begin, end): 263 | self.cache[f] *= 16 264 | self.cache[f] += int((float(c_vec[i])/cmax)*15) 265 | #log_info("%X" % self.cache[f]) 266 | if func.start in self.cache: 267 | return self.cache[func.start] 268 | else: 269 | return 0 270 | 271 | @staticmethod 272 | def compare(f0,f1): 273 | binlen0=float(len(bin(f0))) 274 | binlen1=float(len(bin(f1))) 275 | hamming=float(bin(f0^f1).count('1')) 276 | if binlen0 >= binlen1: 277 | return 1.0-(hamming/binlen0) 278 | else: 279 | return 1.0-(hamming/binlen1) 280 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from binaryninja import * 2 | from providers import * 3 | import json 4 | 5 | PROVIDERS = [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(),(StringHistogramProvider(), 2.0)] 6 | 7 | def get_func_predecessors(bv,f): 8 | ret=[] 9 | for xref in bv.get_code_refs(f.start): 10 | x_func = xref.function 11 | low_level_il = x_func.get_low_level_il_at(bv.platform.arch, xred.address) 12 | il = function.low_level_il[low_level_il] 13 | if il.operation == LLIL_CALL: 14 | ret.append(x_func.start) 15 | return ret 16 | 17 | def load_data(self, fn0): 18 | data0 = None 19 | if fn0.endswith(".bndb"): 20 | fm = FileMetadata() 21 | db0 = fm.open_existing_database(fn0) 22 | for t in db0.available_view_types: 23 | try: 24 | bv0= 
def _load_features(prompt):
    """Ask the user for a feature file and return its parsed content.

    Returns None if the dialog was cancelled or load_data() found no data.
    """
    filename = get_open_filename_input(prompt, "*")
    if filename is None:
        return None
    # load_data() is declared with an unused leading parameter, so a
    # placeholder has to be passed in front of the file name.
    return load_data(None, filename)


class FeatureGenerator(BackgroundTaskThread):
    """Background task calculating the feature vectors of all functions of a view.

    bv is the BinaryView to process and providers is a list of
    FeatureProvider instances or (FeatureProvider, weight) tuples.
    With a true self_contained the results are stored in the view's
    metadata without asking the user.
    """

    def __init__(self, bv, providers, self_contained=None):
        super(FeatureGenerator, self).__init__("", False)
        self.bv = bv
        self.providers = providers
        self.self_contained = self_contained

    def run(self):
        results = {}
        functions = self.bv.functions
        func_len = len(functions)
        for n, func in enumerate(functions):
            self.progress = "Generating features (%d/%d)" % (n, func_len)
            vector = []
            for p in self.providers:
                if isinstance(p, tuple):
                    # (provider, weight): the weight only matters when comparing
                    p = p[0]
                vector.append(p.calculate(func))
            results[int(func.start)] = vector
        self.progress = "Done generating features"
        log_info(repr(results))
        # NOTE(review): 1 is presumably the result of the "Yes" button - confirm against the API
        if self.self_contained or show_message_box("SimilarNinja","Do you want to save the results to the Binary Ninja database?", MessageBoxButtonSet.YesNoButtonSet, MessageBoxIcon.QuestionIcon) == 1:
            # Storing as JSON is wasteful, but more secure than Pickle... good enough for now
            self.bv.store_metadata("similarninja", json.dumps(results))
        else:
            out_fn = get_save_filename_input("Filename to save function hashes:","*","output.json")
            if out_fn is None:
                log_info("No file selected, function hashes were not saved")
            else:
                # json.dumps() returns text, and the with block closes
                # the file even if writing fails.
                with open(out_fn, "w") as out:
                    out.write(json.dumps(results))
        self.finish()


class SimilarNinjaComparer(BackgroundTaskThread):
    """Background task matching the functions of two feature data sets.

    data0 and data1 map function addresses to feature vectors; a
    missing one is loaded from a file selected by the user. result is
    the name of the file the matches are written to; the user is asked
    for one if it is None. The matches are also available in the
    matches attribute after the task has finished.

    Both data sets are modified: matched functions are removed.
    """

    def __init__(self, providers, data0=None, data1=None, result=None):
        super(SimilarNinjaComparer, self).__init__("", False)
        self.providers = providers
        self.data0 = data0
        self.data1 = data1
        self.result = result
        # Defined up front so readers never hit a missing attribute.
        self.matches = []

    def match_fvs(self, data0, data1):
        """Pair functions with equal feature vectors and remove them from the data sets.

        Returns a list of ((func0, feat0), (func1, feat1), 1.0) tuples.
        """
        res = []
        ordered_keys0 = sorted(data0, key=data0.get, reverse=True)
        ordered_keys1 = sorted(data1, key=data1.get, reverse=True)

        for func0 in ordered_keys0:  # So we can delete elements
            if func0 not in data0:
                continue
            feat0 = data0[func0]

            for func1 in ordered_keys1:
                if func1 not in data1:
                    continue
                feat1 = data1[func1]
                if all(feat0[i] == feat1[i] for i in range(len(feat1))):
                    log_info("%x <-> %x %s (%f)\n%s %s" % (int(func0), int(func1), [], 1.0, feat0, feat1))
                    res.append(((int(func0), feat0), (int(func1), feat1), 1.0))
                    del data0[func0]
                    del data1[func1]
                    break
        return res

    def _similarity(self, feat0, feat1):
        """Return the weighted per-provider similarities and their weighted average."""
        sims = [None] * len(self.providers)
        weight_sum = 0.0
        for i, p in enumerate(self.providers):
            weight = 1.0
            if isinstance(p, tuple):
                weight = p[1]
                p = p[0]
            sims[i] = p.compare(feat0[i], feat1[i]) * weight
            weight_sum += weight
        return sims, sum(sims, 0.0) / weight_sum

    def run(self):
        self.progress = "Opening files for comparison"

        data0 = self.data0
        if data0 is None:
            data0 = _load_features("filename0:")

        data1 = self.data1
        if data1 is None:
            data1 = _load_features("filename1:")

        if data0 is None or data1 is None:
            log_error("SimilarNinja: no feature data to compare")
            self.finish()
            return

        self.progress = "Comparing..."
        data0_len = len(data0)
        data1_len = len(data1)

        log_info("Data sizes: %d %d" % (len(data0), len(data1)))
        matches = self.match_fvs(data0, data1)
        log_info("Data sizes after matching: %d %d" % (len(data0), len(data1)))

        # Inexact matches
        for func0 in list(data0.keys()):  # So we can delete elements
            if func0 not in data0:
                continue
            feat0 = data0[func0]

            sims0 = [None] * len(self.providers)
            sim_avg0 = 0.0
            func_match = None
            feat_match = None
            for func1 in list(data1.keys()):
                if func1 not in data1:
                    continue
                feat1 = data1[func1]

                sims, sim_avg = self._similarity(feat0, feat1)
                if sim_avg > sim_avg0:
                    sim_avg0 = sim_avg
                    sims0 = sims
                    func_match = func1
                    feat_match = feat1
                    if sim_avg0 == 1.0:
                        break  # Exit early for perfect matches

            if func_match is not None:
                log_info("%x <-> %x %s (%f)\n%s %s" % (int(func0), int(func_match), repr(sims0), sim_avg0, feat0, feat_match))
                matches.append(((int(func0), feat0), (int(func_match), feat_match), sim_avg0))
                self.progress = "Matches: %d (%d <-> %d)" % (len(matches), data0_len, data1_len)
                del data0[func0]
                del data1[func_match]

        if self.result is None:
            result_fn = get_save_filename_input("Filename to save comparison results:","*","compare.json")
        else:
            result_fn = self.result
        if result_fn is not None:
            with open(result_fn, "w") as out:
                out.write(json.dumps(matches))
        self.matches = matches
        self.finish()


def tester(bv0, bv1):
    """Generate features for two views, match them and log the accuracy.

    A match counts as a success if both functions have the same name;
    pairs where both names start with "sub_" are counted as unknown.
    """
    providers_list = [
        [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(), StringHistogramProvider()],
    ]

    for p in providers_list:
        fgen0 = FeatureGenerator(bv0, p, True)
        fgen0.start()

        fgen1 = FeatureGenerator(bv1, p, True)
        fgen1.start()

        fgen0.join()
        fgen1.join()

        data0 = json.loads(bv0.query_metadata("similarninja"))
        data1 = json.loads(bv1.query_metadata("similarninja"))

        sn_comparer = SimilarNinjaComparer(p, data0, data1, "/dev/null")
        sn_comparer.start()
        sn_comparer.join()

        matches = sn_comparer.matches

        unknown = 0
        success = 0
        failure = 0
        for m in matches:
            func0 = bv0.get_function_at(m[0][0])
            if func0 is None or func0.start != m[0][0]:
                log_info("Switching views")
                bv1, bv0 = bv0, bv1
                func0 = bv0.get_function_at(m[0][0])
            func1 = bv1.get_function_at(m[1][0])
            try:
                if func0.name.startswith("sub_") and func1.name.startswith("sub_"):
                    unknown += 1
                    continue
            except AttributeError:
                log_error("Function not found: %x %x" % (m[0][0], m[1][0]))
                return
            if func0.name == func1.name:
                log_info("%s (%x) == %s (%x)" % (func0.name, func0.start, func1.name, func1.start))
                success += 1
            else:
                log_info("%s (%x) != %s (%x)" % (func0.name, func0.start, func1.name, func1.start))
                failure += 1
        log_info("PROVIDERS: %s" % repr(p))
        log_info("\\_ Success: %d" % success)
        log_info("\\_ Failure: %d" % failure)
        log_info("\\_ Unknown: %d" % unknown)
        log_info("\\_ Total: %d (%d <-> %d) \n" % (len(matches), len(bv0.functions), len(bv1.functions)))


def compare(bv):
    """Plugin command: compare two previously generated feature data sets."""
    sn_comparer = SimilarNinjaComparer(PROVIDERS)
    sn_comparer.start()


def gen_feature(bv):
    """Plugin command: generate the feature vectors of the functions of bv."""
    fgen = FeatureGenerator(bv, PROVIDERS)
    fgen.start()


PluginCommand.register("SimilarNinja - Generate Feature Vectors", "Generates Feature Vectors for all functions", gen_feature)
PluginCommand.register("SimilarNinja - Compare", "Compare functions based on generated data files", compare)