├── .gitignore
├── plugin.json
├── llil_categories.py
├── spp_primes.py
├── tarjan_sort.py
├── README.md
├── providers.py
└── __init__.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 


--------------------------------------------------------------------------------
/plugin.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"plugin": {
 3 | 		"name": "SimilarNinja Plugin",
 4 | 		"type": ["core", "ui", "architecture", "binaryview"],
 5 | 		"api": "python2",
 6 | 		"description": "Find similar functions with Binary Ninja",
 7 | 		"longdescription": "",
 8 | 		"license": {
 9 | 			"name": "GPLv2",
10 | 			"text": ""
11 | 		},
12 | 		"dependencies": {
13 | 			"pip": [],
14 | 			"apt": [],
15 | 			"installers": [],
16 | 			"other": []
17 | 		},
18 | 		"version": "0.2 alpha",
19 | 		"author": "buherator",
20 | 		"minimumBinaryNinjaVersion": {
21 | 			"dev": "1.0.dev-576",
22 | 			"release": "0"
23 | 		}
24 | 	}
25 | }
26 | 


--------------------------------------------------------------------------------
/llil_categories.py:
--------------------------------------------------------------------------------
 1 | from binaryninja.enums import LowLevelILOperation
 2 | 
 3 | LLIL_REDIRECT = [   LowLevelILOperation.LLIL_JUMP,
 4 | 					LowLevelILOperation.LLIL_JUMP_TO,
 5 | 					LowLevelILOperation.LLIL_CALL,
 6 | 					LowLevelILOperation.LLIL_RET,
 7 | 					LowLevelILOperation.LLIL_NORET,
 8 | 					LowLevelILOperation.LLIL_IF,
 9 | 					LowLevelILOperation.LLIL_GOTO,
10 | 					LowLevelILOperation.LLIL_FLAG_COND,
11 | 					LowLevelILOperation.LLIL_CMP_E,
12 | 					LowLevelILOperation.LLIL_CMP_NE,
13 | 					LowLevelILOperation.LLIL_CMP_SLT,
14 | 					LowLevelILOperation.LLIL_CMP_ULT,
15 | 					LowLevelILOperation.LLIL_CMP_SLE,
16 | 					LowLevelILOperation.LLIL_CMP_ULE,
17 | 					LowLevelILOperation.LLIL_CMP_SGE,
18 | 					LowLevelILOperation.LLIL_CMP_UGE,
19 | 					LowLevelILOperation.LLIL_CMP_SGT,
20 | 					LowLevelILOperation.LLIL_CMP_UGT,
21 | 				]
22 | 
# Addition/subtraction (with and without carry), shifts, rotates,
# multiplication, division and modulo operations. Instructions whose operation
# is in this list are counted under the "arithmetic" category.
LLIL_ARITHMETIC = [
    LowLevelILOperation.LLIL_ADD,
    LowLevelILOperation.LLIL_ADC,
    LowLevelILOperation.LLIL_SUB,
    LowLevelILOperation.LLIL_SBB,
    LowLevelILOperation.LLIL_LSL,
    LowLevelILOperation.LLIL_LSR,
    LowLevelILOperation.LLIL_ASR,
    LowLevelILOperation.LLIL_ROL,
    LowLevelILOperation.LLIL_RLC,
    LowLevelILOperation.LLIL_ROR,
    LowLevelILOperation.LLIL_RRC,
    LowLevelILOperation.LLIL_MUL,
    LowLevelILOperation.LLIL_MULU_DP,
    LowLevelILOperation.LLIL_MULS_DP,
    LowLevelILOperation.LLIL_DIVU,
    LowLevelILOperation.LLIL_DIVU_DP,
    LowLevelILOperation.LLIL_DIVS,
    LowLevelILOperation.LLIL_DIVS_DP,
    LowLevelILOperation.LLIL_MODU,
    LowLevelILOperation.LLIL_MODU_DP,
    LowLevelILOperation.LLIL_MODS,
    LowLevelILOperation.LLIL_MODS_DP,
]
46 | 
# Negation and bitwise operations. Instructions whose operation is in this
# list are counted under the "logic" category.
LLIL_LOGIC = [
    LowLevelILOperation.LLIL_NEG,
    LowLevelILOperation.LLIL_NOT,
    LowLevelILOperation.LLIL_AND,
    LowLevelILOperation.LLIL_OR,
    LowLevelILOperation.LLIL_XOR,
]


--------------------------------------------------------------------------------
/spp_primes.py:
--------------------------------------------------------------------------------
 1 | # Prime constants from Diaphora: https://github.com/joxeankoret/diaphora/blob/master/jkutils/graph_hashes.py
 2 | 
 3 | #-------------------------------------------------------------------------------
 4 | # Different type of basic blocks (graph nodes).
 5 | NODE_ENTRY = 2
 6 | NODE_EXIT = 3
 7 | NODE_NORMAL = 5
 8 | 
 9 | #
10 | # NOTE: In the current implementation (Nov-2018) all edges are considered as if
11 | # they were conditional. Keep reading...
12 | #
13 | EDGE_IN_CONDITIONAL = 7
14 | EDGE_OUT_CONDITIONAL = 11
15 | 
16 | #
17 | # Reserved but unused because, probably, it doesn't make sense when comparing
18 | # multiple different architectures.
19 | #
20 | EDGE_IN_UNCONDITIONAL = 13
21 | EDGE_OUT_UNCONDITIONAL = 17
22 | 
23 | # 
24 | # The following are feature types that aren't applied at basic block but rather
25 | # at function level. The idea is that if we do at function level we will have no
26 | # problems finding the same function that was re-ordered because of some crazy
27 | # code a different compiler decided to create (i.e., resilient to reordering).
28 | #
29 | FEATURE_LOOP = 19
30 | FEATURE_CALL = 23
31 | FEATURE_DATA_REFS = 29
32 | FEATURE_CALL_REF = 31
33 | FEATURE_STRONGLY_CONNECTED = 37
34 | FEATURE_FUNC_NO_RET = 41
35 | FEATURE_FUNC_LIB = 43
36 | FEATURE_FUNC_THUNK = 47 
37 | 
38 | # End of Diaphora prime constants
39 | 
40 | LLIL_FEATURE_REDIRECT = 53 
41 | LLIL_FEATURE_ARITHMETIC = 59
42 | LLIL_FEATURE_LOGIC = 61
43 | 
44 | ALL_PRIMES = [
45 | 				NODE_ENTRY,
46 | 				NODE_EXIT,
47 | 				NODE_NORMAL,
48 | 				EDGE_IN_CONDITIONAL,
49 | 				EDGE_OUT_CONDITIONAL,
50 | 				EDGE_IN_UNCONDITIONAL,
51 | 				EDGE_OUT_UNCONDITIONAL,
52 | 				FEATURE_LOOP,
53 | 				FEATURE_CALL,
54 | 				FEATURE_DATA_REFS,
55 | 				FEATURE_CALL_REF,
56 | 				FEATURE_STRONGLY_CONNECTED,
57 | 				FEATURE_FUNC_NO_RET,
58 | 				FEATURE_FUNC_LIB,
59 | 				FEATURE_FUNC_THUNK,
60 |                 LLIL_FEATURE_REDIRECT,
61 |                 LLIL_FEATURE_ARITHMETIC,
62 |                 LLIL_FEATURE_LOGIC,
63 | 			]


--------------------------------------------------------------------------------
/tarjan_sort.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Downloaded from http://www.logarithmic.net/pfh-files/blog/01208083168/sort.py
  3 | 
  4 | """
  5 |    
  6 |    Tarjan's algorithm and topological sorting implementation in Python
  7 |    
  8 |    by Paul Harrison
  9 |    
 10 |    Public domain, do with it as you will
 11 | 
 12 | """
 13 | 
 14 | def strongly_connected_components(graph):
 15 |     """ Find the strongly connected components in a graph using
 16 |         Tarjan's algorithm.
 17 |         
 18 |         graph should be a dictionary mapping node names to
 19 |         lists of successor nodes.
 20 |         """
 21 |     
 22 |     result = [ ]
 23 |     stack = [ ]
 24 |     low = { }
 25 |         
 26 |     def visit(node):
 27 |         if node in low: return
 28 | 
 29 |         num = len(low)
 30 |         low[node] = num
 31 |         stack_pos = len(stack)
 32 |         stack.append(node)
 33 |         
 34 |         for successor in graph[node]:
 35 |             visit(successor)
 36 |             low[node] = min(low[node], low[successor])
 37 |         
 38 |         if num == low[node]:
 39 |             component = tuple(stack[stack_pos:])
 40 |             del stack[stack_pos:]
 41 |             result.append(component)
 42 |             for item in component:
 43 |                 low[item] = len(graph)
 44 |     
 45 |     for node in graph:
 46 |         visit(node)
 47 |     
 48 |     return result
 49 | 
 50 | 
 51 | def topological_sort(graph):
 52 |     count = { }
 53 |     for node in graph:
 54 |         count[node] = 0
 55 |     for node in graph:
 56 |         for successor in graph[node]:
 57 |             count[successor] += 1
 58 | 
 59 |     ready = [ node for node in graph if count[node] == 0 ]
 60 |     
 61 |     result = [ ]
 62 |     while ready:
 63 |         node = ready.pop(-1)
 64 |         result.append(node)
 65 |         
 66 |         for successor in graph[node]:
 67 |             count[successor] -= 1
 68 |             if count[successor] == 0:
 69 |                 ready.append(successor)
 70 |     
 71 |     return result
 72 | 
 73 | 
 74 | def robust_topological_sort(graph):
 75 |     """ First identify strongly connected components,
 76 |         then perform a topological sort on these components. """
 77 | 
 78 |     components = strongly_connected_components(graph)
 79 | 
 80 |     node_component = { }
 81 |     for component in components:
 82 |         for node in component:
 83 |             node_component[node] = component
 84 | 
 85 |     component_graph = { }
 86 |     for component in components:
 87 |         component_graph[component] = [ ]
 88 |     
 89 |     for node in graph:
 90 |         node_c = node_component[node]
 91 |         for successor in graph[node]:
 92 |             successor_c = node_component[successor]
 93 |             if node_c != successor_c:
 94 |                 component_graph[node_c].append(successor_c) 
 95 | 
 96 |     return topological_sort(component_graph)
 97 | 
 98 | 
 99 | if __name__ == '__main__':
100 |     d = {
101 |         0 : [1],
102 |         1 : [2],
103 |         2 : [1,3],
104 |         3 : [3],
105 |     }
106 |     #print d
107 |     #print robust_topological_sort(d)
108 | 
109 |     d = {0 : [1, 2, 4], 1 : [3, 4], 2 : [0, 3], 3 : [], 4: [1]}
110 |     print d
111 |     print "scc", strongly_connected_components(d)
112 |     print "rts", robust_topological_sort(d)
113 | 
114 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Deprecation
  2 | 
  3 | This project is deprecated as [binexport](https://github.com/google/binexport) supports Binary Ninja too these days.
  4 | 
  5 | # SimilarNinja Plugin (v0.2 alpha)
  6 | Author: **buherator**
  7 | _Find similar functions with Binary Ninja_
  8 | ## Description:
  9 | 
 10 | This is a partial implementation of the [KOKA algorithm](http://joxeankoret.com/blog/2018/11/04/new-cfg-based-heuristic-diaphora/) for CFG matching. 
 11 | 
 12 | Currently the following algorithms are implemented:
 13 | 
 14 | * KOKA features bound together in an SPP hash:
 15 |   * NODE_ENTRY, NODE_EXIT, NODE_NORMAL
 16 |   * EDGE_OUT_CONDITIONAL, EDGE_IN_CONDITIONAL
 17 |   * FEATURE_FUNC_NO_RET, FEATURE_FUNC_LIB
 18 | * Features from the [original paper](https://census-labs.com/media/efficient-features-bindiff.pdf):
 19 |   * Digraph Signature
 20 |   * String histogram
 21 | * Others:
 22 |   * Basic Block Count
 23 |   * LLIL instruction types (LLIL_FEATURE_REDIRECT, LLIL_FEATURE_LOGIC, LLIL_FEATURE_ARITHMETIC)
 24 | 
 25 | Experimental infrastructure is available for exact and partial matching.
 26 | 
 27 | Early stage of development, code is unstable. 
 28 | 
 29 | Bugs? Very likely, please use the Issue Tracker!
 30 | 
 31 | ### Why?
 32 | 
 33 | The licensing model of IDA sucks, we need tools for independent frameworks. Other design goals:
 34 | 
 35 | * Easy feature vector composition - creation of custom similarity metrics should be easy (at src level)
 36 | * No external databases - Redundant data storage should be avoided
 37 |   * SQLite based compatibility layer for Diaphora would be nice 
 38 | 
 39 | ### Usage
 40 | 
 41 | The plugin adds two menu items: one for generating feature vectors for the functions of a binary, another for comparing the results of the previous one. Results can be saved to standalone JSON files or along with the analysis data in the BNDB database (the latter is recommended). When comparing results the plugin tries to load raw JSON formatted data unless the extension of the opened file is ".bndb" - in that case the JSON object is read from database metadata. Comparison results can be saved to standalone JSON files.
 42 | 
 43 | Beware that some feature extractor classes can be stateful. When working with multiple views it is usually a good idea to `reload(similarninja)` in the Python Console when using it on a different tab.
 44 | 
 45 | ## Customization
 46 | 
 47 | You can compose your custom feature vector generator by editing the `PROVIDERS` list. Each list element should be a `FeatureProvider` subclass instance or a tuple. The `FeatureProvider`  will be used to calculate similarity metrics for the corresponding vector position. In case of tuples the first element should be the `FeatureProvider` instance, while the second one is a float that will be used as a weight for the element. The default weight for each element is 1.0.    
 48 | 
 49 | ### Examples
 50 | 
 51 | Using String Histogram with a Small Primes Product of the Strongly Connected Features and Function Flags. Both features have a weight of 1.0 when comparing:
 52 | 
 53 | ```
 54 | [StringHistogramProvider(), 
 55 | SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures])]
 56 | ```
 57 | 
 58 | 
 59 | Using String Histogram with the Strongly Connected Features and Function Flags as separate prime products. String Histogram has doubled weight:
 60 | 
 61 | ```
 62 | [(StringHistogramProvider(), 2.0), 
 63 | SPPFeatureProvider([FuncStronglyConnectedFeatures]), 
 64 | SPPFeatureProvider([FuncFlagsFeatures])]
 65 | ```
 66 | 
 67 | 
 68 | ## Testing
 69 | 
 70 | With the corresponding views open, save the current BinaryView objects in the Binary Ninja console:
 71 | 
 72 | ```
 73 | >>> bv0=bv
 74 | # Switch views on the GUI
 75 | >>> bv1=bv
 76 | ```
 77 | 
 78 | Invoke the tester function:
 79 | 
 80 | ```
 81 | >>> similarninja.tester(bv0, bv1)
 82 | ```
 83 | 
 84 | This extracts the features (they are stored in the database for later use), runs the comparison, and then uses the available symbol information to measure accuracy. 
 85 | 
 86 | The function allows testing multiple feature provider compositions at once, refer to the source for details!
 87 | 
 88 | ### Results
 89 | 
 90 | The following results are based on debug information contained in unstripped binaries (exact function name match).
 91 | 
 92 | The current algorithm for partial matches is very liberal and will try to find a match for everything - this is the reason for the high incorrect match numbers. However, these "incorrect" numbers also contain actual good matches (like matches between `shaX_process_blockN()` functions). 
 93 | 
 94 | ### Busybox 
 95 | 
 96 | #### 1.29.1 vs. 1.29.2 x64 ELF
 97 | 
 98 | | Func # | Correct match | Incorrect match |
 99 | |--------|---------------|-----------------|
100 | | 3114   | 1600 (51.3%)  | 1488 (47.8%)    |
101 | 
102 | Feature vector providers:
103 | ```
104 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(),BBLCountProvider()]
105 | ```
106 | 
107 | | Func # | Correct match | Incorrect match |
108 | |--------|---------------|-----------------|
109 | | 3114   | 2098 (67.4%)  | 981 (31.5%)     |
110 | 
111 | Feature vector providers:
112 | ```
113 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(), StringHistogramProvider()] 
114 | ```
115 | 
116 | 
117 | ### SQLite
118 | 
119 | #### 3.25.03 vs. 3.25.00 x64 ELF*
120 | 
121 | | Func # | Correct match | Incorrect match |
122 | |--------|---------------|-----------------|
123 | | 3122   | 1432 (45.9%)  | 1689 (54.1%)    |
124 | 
125 | Feature vector providers:
126 | ```
127 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(),BBLCountProvider()]
128 | ```
129 | 
130 | | Func # | Correct match | Incorrect match |
131 | |--------|---------------|-----------------|
132 | | 3122   | 1618 (51.8%)  | 1503 (48.1%)    |
133 | 
134 | Feature vector providers:
135 | ```
136 | [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(), StringHistogramProvider()] 
137 | ```
138 | 
139 | ## TODO
140 | 
141 | A lot of things...
142 | 
143 | * More matcher algorithms
144 |   * Algorithms from the [original paper](https://census-labs.com/media/efficient-features-bindiff.pdf):
145 |     * Markov lumping
146 |     * Instruction histogram (with capstone/pyxed/other external lib?)
147 | * Better integration with the UI
148 | * Without a BinaryView we lose cross-function control-flow data, so function predecessors/successors can't be discovered during matching
149 |   * Multiple ways to handle this, have to decide which way to go...
150 | * LICENSE file...
151 | 
152 | ### Binary Ninja API wishlist
153 | 
154 | * Instruction level classification  
155 | * Data XRefs
156 | 
157 | ## Minimum Version
158 | 
159 | This plugin requires the following minimum version of Binary Ninja:
160 | 
161 |  * release - 0
162 |  * dev - 1.0.dev-576
163 | 
164 | ## Required Dependencies
165 | 
166 | The following dependencies are required for this plugin:
167 | 
168 |  * pip - 
169 |  * installers - 
170 |  * other - 
171 |  * apt - 
172 | 
173 | ## License
174 | This plugin is released under a GPLv2 license as required by Diaphora. 
175 | 
176 | 


--------------------------------------------------------------------------------
/providers.py:
--------------------------------------------------------------------------------
  1 | from binaryninja import *
  2 | from tarjan_sort import *
  3 | from spp_primes import *
  4 | from llil_categories import *
  5 | 
  6 | 
  7 | class SPPBBLProvider:
  8 |     @staticmethod
  9 |     def calculate(bbl):
 10 |         pass
 11 | 
 12 | class SPPFunctionProvider:
 13 |     @staticmethod
 14 |     def calculate(func):
 15 |         pass
 16 | 
 17 | class FeatureProvider:
 18 |     def calculate(self, func):
 19 |         pass
 20 | 
 21 |     @staticmethod
 22 |     def compare(f0, f1):
 23 |         pass
 24 | 
 25 | class BBLTypeFeatures(SPPBBLProvider):
 26 |     @staticmethod
 27 |     def calculate(b):
 28 |         ret = 1
 29 |         
 30 |         if len(b.incoming_edges) == 0:
 31 |             ret *= NODE_ENTRY
 32 |         if len(b.outgoing_edges) == 0:
 33 |             ret *= NODE_EXIT
 34 |         ret *= NODE_NORMAL
 35 |         return ret
 36 | 
 37 | class BBLEdgeFeatures(SPPBBLProvider):
 38 |     @staticmethod
 39 |     def calculate(b):
 40 |         ret = 1
 41 |         
 42 |         ins = b.incoming_edges
 43 |         outs = b.outgoing_edges
 44 |             
 45 |         for e in outs:
 46 |             ret *= EDGE_OUT_CONDITIONAL
 47 |         for e in ins:
 48 |             ret *= EDGE_IN_CONDITIONAL
 49 | 
 50 |         return ret
 51 | 
 52 | class FuncInstructionFeatures(SPPFunctionProvider):
 53 |     @staticmethod
 54 |     def calculate(func):
 55 |         ret = 1
 56 |         for block in func.low_level_il:
 57 |             for ins in block:
 58 |                 if ins.operation in LLIL_REDIRECT:
 59 |                     ret *= LLIL_FEATURE_REDIRECT
 60 |                 elif ins.operation in LLIL_ARITHMETIC:
 61 |                     ret *= LLIL_FEATURE_ARITHMETIC
 62 |                 elif ins.operation in LLIL_LOGIC:
 63 |                     ret *= LLIL_FEATURE_LOGIC
 64 |         return ret
 65 |         
 66 | class FuncStronglyConnectedFeatures(SPPFunctionProvider):
 67 |     @staticmethod
 68 |     def calculate(func):
 69 |         bb_relations = {}
 70 |         ret = 1
 71 |         for block in func.basic_blocks:
 72 |             # Creating bb_relations 
 73 |             bb_relations[block.start] = []
 74 |             for e in block.outgoing_edges:
 75 |                 bb_relations[block.start].append(e.target.start)
 76 | 
 77 |             for e in block.incoming_edges:
 78 |                 try:
 79 |                     bb_relations[e.source.start].append(block.start)
 80 |                 except KeyError:
 81 |                     bb_relations[e.source.start] = [block.start]
 82 |         try:
 83 |             strongly_connected = strongly_connected_components(bb_relations)
 84 |             for sc in strongly_connected:
 85 |                 if len(sc) > 1:
 86 |                     ret *= FEATURE_LOOP
 87 |                 else:
 88 |                     if sc[0] in bb_relations and sc[0] in bb_relations[sc[0]]:
 89 |                         ret *= FEATURE_LOOP
 90 |             ret *= FEATURE_STRONGLY_CONNECTED ** len(strongly_connected)
 91 |         except:
 92 |             log_error("Exception: %s" % (sys.exc_info()[1]))
 93 |         return ret
 94 | 
 95 | class FuncFlagsFeatures(SPPFunctionProvider):
 96 |     @staticmethod
 97 |     def calculate(func):
 98 |         ret = 1    
 99 |         if not func.can_return:
100 |             ret *= FEATURE_FUNC_NO_RET
101 |         if func.symbol.type is SymbolType.ImportedFunctionSymbol:
102 |             ret *= FEATURE_FUNC_LIB
103 |         # [TODO] Binary Ninja API for Thunks
104 |         return ret
105 | 
class SPPFeatureProvider(FeatureProvider):
    """Combines several SPP feature classes into one Small Primes Product.

    The feature value of a function is the product of the values of all
    configured feature classes; two values are compared by looking at the
    prime factors they do not have in common.
    """

    def __init__(self, features=None):
        """Store the feature classes to combine.

        features -- list of SPPBBLProvider / SPPFunctionProvider subclasses
                    (classes, not instances). Defaults to an empty list; a
                    fresh list is created per instance so instances never
                    share a default.
        """
        self.features = [] if features is None else features

    def calculate(self, func):
        """Return the product of all configured features for func.

        Per-block features are evaluated for every basic block of func,
        per-function features once for func itself.
        """
        ret = 1
        for p in self.features:
            if issubclass(p, SPPBBLProvider):
                for block in func.basic_blocks:
                    ret *= p.calculate(block)
            if issubclass(p, SPPFunctionProvider):
                ret *= p.calculate(func)
        return ret

    @staticmethod
    def _primes(n):
        """Return the prime factors of n (with multiplicity) drawn from ALL_PRIMES.

        If n still has a factor left after all known primes were tried, an
        error is logged and the factors found so far are returned.
        """
        # http://outslide.tumblr.com/post/167558674272
        if n in ALL_PRIMES:
            return [n]
        if n == 0:
            # 0 is divisible by everything; the loop below would never end.
            return []
        # This is slow as hell for large numbers
        i = 0
        primes = []
        while n != 1:
            if n % ALL_PRIMES[i] == 0:
                primes.append(ALL_PRIMES[i])
                # Floor division keeps n an exact integer on Python 3 too.
                n = n // ALL_PRIMES[i]
            else:
                i += 1
                if i >= len(ALL_PRIMES):
                    log_error("Something is fucky with SPP primes! %x " % (n))
                    break
        return primes

    @staticmethod
    def _hcfnaive(a, b):
        """Return the highest common factor of a and b (Euclid's algorithm).

        Implemented as a loop: products of many primes are large enough for
        a recursive version to exceed the recursion limit.
        """
        while b != 0:
            a, b = b, a % b
        return a

    @staticmethod
    def compare(f0, f1):
        """Return a similarity score for two SPP values.

        Equal values score 1.0 and a zero value scores 0.0. Otherwise the
        common factor is divided out of both values and the side with more
        remaining prime factors determines the result:
        1 - (remaining factors / all factors of that side).
        """
        if f0 == f1:
            return 1.0
        if f0 == 0 or f1 == 0:
            return 0.0

        hcf = SPPFeatureProvider._hcfnaive(f0, f1)
        # hcf divides both values, so floor division is exact.
        f0_hcf_primes = SPPFeatureProvider._primes(f0 // hcf)
        f1_hcf_primes = SPPFeatureProvider._primes(f1 // hcf)
        try:
            if len(f0_hcf_primes) > len(f1_hcf_primes):
                return 1 - (float(len(f0_hcf_primes)) / len(SPPFeatureProvider._primes(f0)))
            else:
                return 1 - (float(len(f1_hcf_primes)) / len(SPPFeatureProvider._primes(f1)))
        except OverflowError:
            return 0.0
        except ZeroDivisionError:
            log_error("Division by zero: %X %X HCF: %X Primes: %s %s " % (f0, f1, hcf, SPPFeatureProvider._primes(f0), SPPFeatureProvider._primes(f1)))
            raise
166 | 
class DigraphFeatureProvider(FeatureProvider):
    """Digraph signature: a depth-first walk of the CFG encoded as bits."""

    def __init__(self):
        # Start addresses of the blocks seen during the current walk.
        self.visited = set()

    def dfs(self, block, value):
        """Walk the CFG depth-first from block and return the updated value.

        A 1 bit is appended to value when a block is entered for the first
        time and a 0 bit when the walk leaves it again; blocks already in
        self.visited are skipped and leave value untouched.

        The walk uses an explicit stack instead of recursion so deep control
        flow graphs cannot exceed the interpreter's recursion limit.
        """
        if block.start in self.visited:
            return value

        self.visited.add(block.start)
        value = value * 2 + 1
        # One iterator of outgoing edges per block that is currently open.
        pending = [iter(block.outgoing_edges)]
        while pending:
            for edge in pending[-1]:
                target = edge.target
                if target.start not in self.visited:
                    # Enter the target; the current iterator is resumed
                    # once the target has been left again.
                    self.visited.add(target.start)
                    value = value * 2 + 1
                    pending.append(iter(target.outgoing_edges))
                    break
            else:
                # Every edge of the innermost open block is done: leave it.
                pending.pop()
                value *= 2
        return value

    def calculate(self, func):
        """Return the digraph signature of func, starting at its entry block.

        The visited set is reset for every call so the signature of one
        function does not depend on which functions were processed before.
        Returns 0 when no basic block is found at func.start.
        """
        self.visited = set()
        block = func.get_basic_block_at(func.start)
        if block is None:
            return 0
        return self.dfs(block, 0)

    @staticmethod
    def compare(f0, f1):
        """Return 1.0 minus the Hamming distance of f0 and f1 divided by the
        length of the longer of the two bin() strings.

        NOTE: the lengths are those of the bin() text, i.e. they include the
        "0b" prefix.
        """
        longest = float(max(len(bin(f0)), len(bin(f1))))
        hamming = float(bin(f0 ^ f1).count('1'))
        return 1.0 - (hamming / longest)
204 | 
class BBLCountProvider(FeatureProvider):
    """Uses the number of basic blocks of a function as the feature."""

    def calculate(self, func):
        """Return the number of basic blocks of func."""
        return len(func.basic_blocks)

    @staticmethod
    def compare(f0, f1):
        """Return 1 minus the difference of the counts relative to the larger one.

        Two zero counts are treated as identical (1.0) instead of dividing
        by zero.
        """
        larger = max(f0, f1)
        if larger == 0:
            return 1.0
        return 1 - (float(abs(f0 - f1)) / larger)
216 | 
class StringHistogramProvider(FeatureProvider):
    """Character histogram of the strings referenced by a function.

    The histograms of all functions are computed on the first calculate()
    call and cached on the instance, keyed by function start address. The
    cache is never refreshed, so an instance should be used with a single
    BinaryView.
    """

    def __init__(self):
        # Maps function start address -> encoded histogram; None until built.
        self.cache = None

    @staticmethod
    def _count_characters(bv):
        """Return {function start: 256 per-character counters} for bv.

        Every string of the view adds its characters to the counters of each
        function that has a code reference to it.
        """
        vectors = {}
        for s in bv.strings:
            value = s.value
            for x in bv.get_code_refs(s.start):
                counts = vectors.setdefault(x.function.start, [0] * 256)
                for c in value:
                    code = ord(c)
                    # The histogram only has 256 buckets; code points beyond
                    # that are skipped instead of raising IndexError.
                    if code < 256:
                        counts[code] += 1
        return vectors

    @staticmethod
    def _encode_histogram(c_vec):
        """Encode a counter vector as an integer with one hex digit per bucket.

        Empty buckets at both ends are cut off, the remaining counts are
        normalized to 0-15 relative to the largest count:
        [4,0,8,0,16,8,4,2] -> 0x3070f731
        A vector without any counts encodes as 0.
        """
        cmax = max(c_vec)
        if cmax == 0:
            return 0
        used = [i for i, count in enumerate(c_vec) if count != 0]
        begin, end = used[0], used[-1]

        encoded = 0
        # end is the last non-empty bucket and must be included.
        for i in range(begin, end + 1):
            encoded = encoded * 16 + int((float(c_vec[i]) / cmax) * 15)
        return encoded

    def calculate(self, func):
        """Return the encoded string histogram of func, or 0 if the function
        references no strings."""
        # String info is available for the global BinaryView
        # String histograms are calculated and cached at first call
        if self.cache is None:
            vectors = self._count_characters(func.view)
            self.cache = {}
            for start, c_vec in vectors.items():
                self.cache[start] = self._encode_histogram(c_vec)
        return self.cache.get(func.start, 0)

    @staticmethod
    def compare(f0, f1):
        """Return 1.0 minus the Hamming distance of f0 and f1 divided by the
        length of the longer of the two bin() strings.

        NOTE: the lengths are those of the bin() text, i.e. they include the
        "0b" prefix.
        """
        longest = float(max(len(bin(f0)), len(bin(f1))))
        hamming = float(bin(f0 ^ f1).count('1'))
        return 1.0 - (hamming / longest)
280 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
  1 | from binaryninja import *
  2 | from providers import *
  3 | import json
  4 | 
  5 | PROVIDERS = [SPPFeatureProvider([BBLTypeFeatures]), SPPFeatureProvider([BBLEdgeFeatures]), SPPFeatureProvider([FuncInstructionFeatures]), SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]), DigraphFeatureProvider(),(StringHistogramProvider(), 2.0)]  
  6 | 
  7 | def get_func_predecessors(bv,f):
  8 |     ret=[]
  9 |     for xref in bv.get_code_refs(f.start):
 10 |         x_func = xref.function
 11 |         low_level_il = x_func.get_low_level_il_at(bv.platform.arch, xred.address)
 12 |         il = function.low_level_il[low_level_il]
 13 |         if il.operation ==  LLIL_CALL: 
 14 |             ret.append(x_func.start)
 15 |     return ret
 16 | 
 17 | def load_data(self, fn0):
 18 |     data0 = None
 19 |     if fn0.endswith(".bndb"):
 20 |         fm = FileMetadata()
 21 |         db0 = fm.open_existing_database(fn0)
 22 |         for t in db0.available_view_types:
 23 |             try:
 24 |                 bv0= db0.get_view_of_type(t.name)
 25 |                 data0 = json.loads(bv0.query_metadata("similarninja"))
 26 |                 break
 27 |             except KeyError:
 28 |                 pass
 29 |     else:
 30 |         f0 = open(fn0, "r")
 31 |         data0=json.loads(f0.read())
 32 |     return data0
 33 | 
 34 | class FeatureGenerator(BackgroundTaskThread):
 35 |     def __init__(self, bv, providers, self_contained=None):
 36 |         super(FeatureGenerator, self).__init__("", False)
 37 |         self.bv = bv
 38 |         self.providers = providers
 39 |         self.self_contained = self_contained
 40 | 
 41 |     def run(self):
 42 |         results={}
 43 |         func_len = len(self.bv.functions)
 44 |         for n, func in enumerate(self.bv.functions):
 45 |             self.progress = "Generating features (%d/%d)" % (n, func_len)
 46 |             idx=long(func.start)
 47 |             results[idx] = [None] * len(self.providers)
 48 |             for i,p in enumerate(self.providers):
 49 |                 if isinstance(p, tuple):
 50 |                     p = p[0]
 51 |                 results[idx][i] = p.calculate(func)
 52 |         self.progress = "Done generating features"    
 53 |         log_info(repr(results))
 54 |         if self.self_contained or show_message_box("SimilarNinja","Do you want to save the results to the Binary Ninja database?", MessageBoxButtonSet.YesNoButtonSet, MessageBoxIcon.QuestionIcon) == 1:
 55 |             # Storing as JSON is wasteful, but more secure than Pickle... good enough for now
 56 |             self.bv.store_metadata("similarninja",json.dumps(results))
 57 |         else:
 58 |             out = open(get_save_filename_input("Filename to save function hashes:","*","output.json"),"wb")
 59 |             out.write(json.dumps(results))
 60 |             out.close()
 61 |         self.finish()
 62 | 
 63 | class SimilarNinjaComparer(BackgroundTaskThread):
 64 |     def __init__(self, providers, data0=None, data1=None, result=None):
 65 |         super(SimilarNinjaComparer, self).__init__("", False)
 66 |         self.providers = providers
 67 |         self.data0 = data0
 68 |         self.data1 = data1
 69 |         self.result = result
 70 | 
 71 |     def match_fvs(self, data0, data1):
 72 |         res=[]
 73 |         ordered_keys0=sorted(data0,key=data0.get,reverse=True)
 74 |         ordered_keys1=sorted(data1,key=data1.get,reverse=True)
 75 | 
 76 |         for func0 in ordered_keys0: # So we can delete elements
 77 |             if func0 not in data0: continue
 78 |             feat0 = data0[func0]
 79 | 
 80 |             for func1 in ordered_keys1:
 81 |                 if func1 not in data1: continue
 82 |                 feat1 = data1[func1]
 83 |                 matching=True
 84 |                 for i in xrange(0,len(feat1)):
 85 |                     if feat0[i] != feat1[i]:
 86 |                         matching=False
 87 |                         break
 88 |                 if matching:
 89 |                     log_info("%x <-> %x %s (%f)\n%s %s" % (long(func0), long(func1), [], 1.0, feat0, feat1))
 90 |                     res.append(((long(func0), feat0), (long(func1),feat1), 1.0))
 91 |                     del data0[func0]
 92 |                     del data1[func1]
 93 |                     break
 94 |         return res        
 95 | 
 96 |     def run(self):        
 97 |         self.progress="Opening files for comparison"
 98 | 
 99 |         data0 = None
100 |         data1 = None
101 |         
102 |         if self.data0 is not None:
103 |             data0 = self.data0
104 |         else:
105 |             fn0 = get_open_filename_input("filename0:","*")
106 |             data0 = load_data(fn0)
107 |         
108 |         if self.data1 is not None:
109 |             data1 = self.data1
110 |         else:
111 |             fn1 = get_open_filename_input("filename1:","*")
112 |             data1 = load_data(fn1)
113 |         
114 |         self.progress="Comparing..."
115 |         data0_len = len(data0)
116 |         data1_len = len(data1)
117 | 
118 |         log_info("Data sizes: %d %d" % (len(data0), len(data1)))
119 |         matches=self.match_fvs(data0, data1)
120 |         log_info("Data sizes after matching: %d %d" % (len(data0), len(data1)))
121 |         
122 |         # Inexact matches
123 |         for func0 in list(data0.keys()): # So we can delete elements
124 |             if func0 not in data0: continue
125 |             feat0 = data0[func0]
126 | 
127 |             sims0 = [None] * len(self.providers)
128 |             sim_avg0 = 0.0
129 |             func_match = None
130 |             feat_match = None
131 |             for func1 in list(data1.keys()):
132 |                 if func1 not in data1: continue
133 |                 feat1 = data1[func1]
134 | 
135 |                 sims = [None] * len(self.providers)
136 |                 weight_sum = 0.0
137 |                 for i, p in enumerate(self.providers):
138 |                     weight = 1.0
139 |                     if isinstance(p, tuple):
140 |                         weight = p[1]
141 |                         p = p[0]
142 |                     sims[i] = p.compare(feat0[i],feat1[i])*weight
143 |                     weight_sum += weight
144 |                 sim_avg = 0.0
145 |                 for s in sims:
146 |                     sim_avg += s
147 |                 sim_avg = sim_avg / weight_sum        
148 | 
149 |                 if sim_avg > sim_avg0:
150 |                     sim_avg0 = sim_avg
151 |                     sims0 = sims
152 |                     func_match = func1
153 |                     feat_match = feat1
154 |                 if sim_avg0 == 1.0: break # Exit early for perfect matches
155 |             
156 |             if func_match is not None:
157 |                 log_info("%x <-> %x %s (%f)\n%s %s" % (long(func0), long(func_match), repr(sims0), sim_avg0, feat0, feat_match))
158 |                 matches.append(((long(func0), feat0), (long(func_match), feat_match), sim_avg0))
159 |                 self.progress = "Matches: %d (%d <-> %d)" % (len(matches), data0_len, data1_len)
160 |                 del data0[func0]
161 |                 del data1[func_match]
162 |         
163 |         result_fn = None
164 |         if self.result is None:
165 |             result_fn = get_save_filename_input("Filename to save comparison results:","*","compare.json")
166 |         else:
167 |             result_fn = self.result
168 |         if result_fn is not None:
169 |             out = open(result_fn, "wb")
170 |             out.write(json.dumps(matches))
171 |             out.close()
172 |         self.matches = matches
173 |         self.finish()
174 |         
175 | 
def tester(bv0, bv1):
    """Evaluate matching quality between two binary views.

    For every provider set in the local list, feature vectors are generated
    for both views, read back from each view's "similarninja" metadata and
    compared with SimilarNinjaComparer.  A matched pair counts as a success
    when both functions have the same name, as a failure when the names
    differ, and as unknown when both names start with "sub_".  The totals
    are written to the log.

    bv0, bv1 -- the two binary views to compare
    """
    import os  # local import: only needed for the portable null device

    providers_list = [
        [
            SPPFeatureProvider([BBLTypeFeatures]),
            SPPFeatureProvider([BBLEdgeFeatures]),
            SPPFeatureProvider([FuncInstructionFeatures]),
            SPPFeatureProvider([FuncStronglyConnectedFeatures, FuncFlagsFeatures]),
            DigraphFeatureProvider(),
            StringHistogramProvider(),
        ],
    ]

    for p in providers_list:
        # Run both generators concurrently, then wait for both of them
        fgen0 = FeatureGenerator(bv0, p, True)
        fgen0.start()

        fgen1 = FeatureGenerator(bv1, p, True)
        fgen1.start()

        fgen0.join()
        fgen1.join()

        data0 = json.loads(bv0.query_metadata("similarninja"))
        data1 = json.loads(bv1.query_metadata("similarninja"))

        # The result file is not needed here; os.devnull discards it on
        # every platform (a literal "/dev/null" does not exist on Windows)
        sn_comparer = SimilarNinjaComparer(p, data0, data1, os.devnull)
        sn_comparer.start()
        sn_comparer.join()

        matches = sn_comparer.matches

        unknown = 0
        success = 0
        failure = 0
        for m in matches:
            func0 = bv0.get_function_at(m[0][0])
            if func0 is None or func0.start != m[0][0]:
                # First address does not resolve in bv0: try the views the
                # other way round (the swap stays in effect afterwards)
                log_info("Switching views")
                bv1, bv0 = bv0, bv1
                func0 = bv0.get_function_at(m[0][0])
            func1 = bv1.get_function_at(m[1][0])
            try:
                if func0.name.startswith("sub_") and func1.name.startswith("sub_"):
                    unknown += 1
                    continue
            except AttributeError:
                # get_function_at returned None for one of the addresses
                log_error("Function not found: %x %x" % (m[0][0], m[1][0]))
                return
            if func0.name == func1.name:
                log_info("%s (%x) == %s (%x)" % (func0.name, func0.start, func1.name, func1.start))
                success += 1
            else:
                log_info("%s (%x) != %s (%x)" % (func0.name, func0.start, func1.name, func1.start))
                failure += 1
        log_info("PROVIDERS: %s" % repr(p))
        log_info("\\_ Success: %d" % success)
        log_info("\\_ Failure: %d" % failure)
        log_info("\\_ Unknown: %d" % unknown)
        log_info("\\_ Total: %d (%d <-> %d) \n" % (len(matches), len(bv0.functions), len(bv1.functions)))
227 | 
def compare(bv):
    """Plugin command handler: start a comparison background task.

    The comparer is created with the module-level PROVIDERS and without
    preloaded data or a result path.  `bv` is not used.
    """
    SimilarNinjaComparer(PROVIDERS).start()
231 | 
def gen_feature(bv):
    """Plugin command handler: start a FeatureGenerator task for `bv`.

    The generator is created with the module-level PROVIDERS.
    """
    FeatureGenerator(bv, PROVIDERS).start()
235 | 
# Register both plugin commands with Binary Ninja
PluginCommand.register(
    "SimilarNinja - Generate Feature Vectors",
    "Generates Feature Vectors for all functions",
    gen_feature,
)
PluginCommand.register(
    "SimilarNinja - Compare",
    "Compare functions based on generated data files",
    compare,
)
238 | 


--------------------------------------------------------------------------------