├── LICENSE ├── README.md ├── doc ├── BHArsenal_slide_deck.pdf └── usage_examples.pdf ├── setup.py └── stadeo ├── __init__.py ├── cff ├── __init__.py ├── arcg_cache_depgraph.py ├── cff_recognizer.py ├── cff_solver.py └── cff_strategies.py ├── string ├── __init__.py ├── string_revealer.py └── string_symb_stubs.py └── utils ├── __init__.py ├── extended_asmcfg.py └── xref_patcher.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, ESET spol. s r.o. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | 28 | 29 | -------------------------------------------------------------------------------- 30 | Some regexes in stadeo/string/string_revealer.py are 31 | 32 | Copyright (C) 2017 FireEye, Inc 33 | All rights reserved. 34 | 35 | Apache License 36 | Version 2.0, January 2004 37 | http://www.apache.org/licenses/ 38 | 39 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 40 | 41 | 1. Definitions. 42 | 43 | "License" shall mean the terms and conditions for use, reproduction, 44 | and distribution as defined by Sections 1 through 9 of this document. 45 | 46 | "Licensor" shall mean the copyright owner or entity authorized by 47 | the copyright owner that is granting the License. 48 | 49 | "Legal Entity" shall mean the union of the acting entity and all 50 | other entities that control, are controlled by, or are under common 51 | control with that entity. For the purposes of this definition, 52 | "control" means (i) the power, direct or indirect, to cause the 53 | direction or management of such entity, whether by contract or 54 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 55 | outstanding shares, or (iii) beneficial ownership of such entity. 56 | 57 | "You" (or "Your") shall mean an individual or Legal Entity 58 | exercising permissions granted by this License. 59 | 60 | "Source" form shall mean the preferred form for making modifications, 61 | including but not limited to software source code, documentation 62 | source, and configuration files. 63 | 64 | "Object" form shall mean any form resulting from mechanical 65 | transformation or translation of a Source form, including but 66 | not limited to compiled object code, generated documentation, 67 | and conversions to other media types. 
68 | 69 | "Work" shall mean the work of authorship, whether in Source or 70 | Object form, made available under the License, as indicated by a 71 | copyright notice that is included in or attached to the work 72 | (an example is provided in the Appendix below). 73 | 74 | "Derivative Works" shall mean any work, whether in Source or Object 75 | form, that is based on (or derived from) the Work and for which the 76 | editorial revisions, annotations, elaborations, or other modifications 77 | represent, as a whole, an original work of authorship. For the purposes 78 | of this License, Derivative Works shall not include works that remain 79 | separable from, or merely link (or bind by name) to the interfaces of, 80 | the Work and Derivative Works thereof. 81 | 82 | "Contribution" shall mean any work of authorship, including 83 | the original version of the Work and any modifications or additions 84 | to that Work or Derivative Works thereof, that is intentionally 85 | submitted to Licensor for inclusion in the Work by the copyright owner 86 | or by an individual or Legal Entity authorized to submit on behalf of 87 | the copyright owner. For the purposes of this definition, "submitted" 88 | means any form of electronic, verbal, or written communication sent 89 | to the Licensor or its representatives, including but not limited to 90 | communication on electronic mailing lists, source code control systems, 91 | and issue tracking systems that are managed by, or on behalf of, the 92 | Licensor for the purpose of discussing and improving the Work, but 93 | excluding communication that is conspicuously marked or otherwise 94 | designated in writing by the copyright owner as "Not a Contribution." 95 | 96 | "Contributor" shall mean Licensor and any individual or Legal Entity 97 | on behalf of whom a Contribution has been received by Licensor and 98 | subsequently incorporated within the Work. 99 | 100 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 101 | this License, each Contributor hereby grants to You a perpetual, 102 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 103 | copyright license to reproduce, prepare Derivative Works of, 104 | publicly display, publicly perform, sublicense, and distribute the 105 | Work and such Derivative Works in Source or Object form. 106 | 107 | 3. Grant of Patent License. Subject to the terms and conditions of 108 | this License, each Contributor hereby grants to You a perpetual, 109 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 110 | (except as stated in this section) patent license to make, have made, 111 | use, offer to sell, sell, import, and otherwise transfer the Work, 112 | where such license applies only to those patent claims licensable 113 | by such Contributor that are necessarily infringed by their 114 | Contribution(s) alone or by combination of their Contribution(s) 115 | with the Work to which such Contribution(s) was submitted. If You 116 | institute patent litigation against any entity (including a 117 | cross-claim or counterclaim in a lawsuit) alleging that the Work 118 | or a Contribution incorporated within the Work constitutes direct 119 | or contributory patent infringement, then any patent licenses 120 | granted to You under this License for that Work shall terminate 121 | as of the date such litigation is filed. 122 | 123 | 4. Redistribution. 
You may reproduce and distribute copies of the 124 | Work or Derivative Works thereof in any medium, with or without 125 | modifications, and in Source or Object form, provided that You 126 | meet the following conditions: 127 | 128 | (a) You must give any other recipients of the Work or 129 | Derivative Works a copy of this License; and 130 | 131 | (b) You must cause any modified files to carry prominent notices 132 | stating that You changed the files; and 133 | 134 | (c) You must retain, in the Source form of any Derivative Works 135 | that You distribute, all copyright, patent, trademark, and 136 | attribution notices from the Source form of the Work, 137 | excluding those notices that do not pertain to any part of 138 | the Derivative Works; and 139 | 140 | (d) If the Work includes a "NOTICE" text file as part of its 141 | distribution, then any Derivative Works that You distribute must 142 | include a readable copy of the attribution notices contained 143 | within such NOTICE file, excluding those notices that do not 144 | pertain to any part of the Derivative Works, in at least one 145 | of the following places: within a NOTICE text file distributed 146 | as part of the Derivative Works; within the Source form or 147 | documentation, if provided along with the Derivative Works; or, 148 | within a display generated by the Derivative Works, if and 149 | wherever such third-party notices normally appear. The contents 150 | of the NOTICE file are for informational purposes only and 151 | do not modify the License. You may add Your own attribution 152 | notices within Derivative Works that You distribute, alongside 153 | or as an addendum to the NOTICE text from the Work, provided 154 | that such additional attribution notices cannot be construed 155 | as modifying the License. 
156 | 157 | You may add Your own copyright statement to Your modifications and 158 | may provide additional or different license terms and conditions 159 | for use, reproduction, or distribution of Your modifications, or 160 | for any such Derivative Works as a whole, provided Your use, 161 | reproduction, and distribution of the Work otherwise complies with 162 | the conditions stated in this License. 163 | 164 | 5. Submission of Contributions. Unless You explicitly state otherwise, 165 | any Contribution intentionally submitted for inclusion in the Work 166 | by You to the Licensor shall be under the terms and conditions of 167 | this License, without any additional terms or conditions. 168 | Notwithstanding the above, nothing herein shall supersede or modify 169 | the terms of any separate license agreement you may have executed 170 | with Licensor regarding such Contributions. 171 | 172 | 6. Trademarks. This License does not grant permission to use the trade 173 | names, trademarks, service marks, or product names of the Licensor, 174 | except as required for reasonable and customary use in describing the 175 | origin of the Work and reproducing the content of the NOTICE file. 176 | 177 | 7. Disclaimer of Warranty. Unless required by applicable law or 178 | agreed to in writing, Licensor provides the Work (and each 179 | Contributor provides its Contributions) on an "AS IS" BASIS, 180 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 181 | implied, including, without limitation, any warranties or conditions 182 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 183 | PARTICULAR PURPOSE. You are solely responsible for determining the 184 | appropriateness of using or redistributing the Work and assume any 185 | risks associated with Your exercise of permissions under this License. 186 | 187 | 8. Limitation of Liability. 
In no event and under no legal theory, 188 | whether in tort (including negligence), contract, or otherwise, 189 | unless required by applicable law (such as deliberate and grossly 190 | negligent acts) or agreed to in writing, shall any Contributor be 191 | liable to You for damages, including any direct, indirect, special, 192 | incidental, or consequential damages of any character arising as a 193 | result of this License or out of the use or inability to use the 194 | Work (including but not limited to damages for loss of goodwill, 195 | work stoppage, computer failure or malfunction, or any and all 196 | other commercial damages or losses), even if such Contributor 197 | has been advised of the possibility of such damages. 198 | 199 | 9. Accepting Warranty or Additional Liability. While redistributing 200 | the Work or Derivative Works thereof, You may choose to offer, 201 | and charge a fee for, acceptance of support, warranty, indemnity, 202 | or other liability obligations and/or rights consistent with this 203 | License. However, in accepting such obligations, You may act only 204 | on Your own behalf and on Your sole responsibility, not on behalf 205 | of any other Contributor, and only if You agree to indemnify, 206 | defend, and hold each Contributor harmless for any liability 207 | incurred by, or claims asserted against, such Contributor by reason 208 | of your accepting any such warranty or additional liability. 209 | 210 | END OF TERMS AND CONDITIONS 211 | 212 | APPENDIX: How to apply the Apache License to your work. 213 | 214 | To apply the Apache License to your work, attach the following 215 | boilerplate notice, with the fields enclosed by brackets "{}" 216 | replaced with your own identifying information. (Don't include 217 | the brackets!) The text should be enclosed in the appropriate 218 | comment syntax for the file format. 
We also recommend that a 219 | file or class name and description of purpose be included on the 220 | same "printed page" as the copyright notice for easier 221 | identification within third-party archives. 222 | 223 | Copyright {yyyy} {name of copyright owner} 224 | 225 | Licensed under the Apache License, Version 2.0 (the "License"); 226 | you may not use this file except in compliance with the License. 227 | You may obtain a copy of the License at 228 | 229 | http://www.apache.org/licenses/LICENSE-2.0 230 | 231 | Unless required by applicable law or agreed to in writing, software 232 | distributed under the License is distributed on an "AS IS" BASIS, 233 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 234 | See the License for the specific language governing permissions and 235 | limitations under the License. 236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Stadeo 2 | ====== 3 | 4 | Stadeo is a set of tools primarily developed to facilitate analysis of 5 | [Stantinko](https://www.welivesecurity.com/2017/07/20/stantinko-massive-adware-campaign-operating-covertly-since-2012/), 6 | which is a botnet performing click fraud, ad injection, social network 7 | fraud, password stealing attacks and 8 | [cryptomining](https://www.welivesecurity.com/2019/11/26/stantinko-botnet-adds-cryptomining-criminal-activities/). 9 | 10 | The scripts, written entirely in Python, deal with Stantinko's unique 11 | control-flow-flattening (CFF) and string obfuscation techniques 12 | described in our March 2020 13 | [blogpost](https://www.welivesecurity.com/2020/03/19/stantinko-new-cryptominer-unique-obfuscation-techniques/). 
14 | Additionally, they can be utilized for other purposes: for example, 15 | we’ve already extended our approach to support deobfuscating the CFF 16 | featured in Emotet – a trojan that steals banking credentials and that 17 | downloads additional payloads such as ransomware. 18 | 19 | Our deobfuscation methods use 20 | [IDA](https://www.hex-rays.com/products/ida/), which is a standard tool 21 | in the industry, and [Miasm](https://github.com/cea-sec/miasm) – an open 22 | source framework providing us with various data-flow analyses, a 23 | symbolic execution engine, a dynamic symbolic execution engine and the 24 | means to reassemble modified functions. 25 | -------------------------------------------------------------------------------- /doc/BHArsenal_slide_deck.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eset/stadeo/447842592f3aa6d78be5ba58e0ec9d8e407d7fb2/doc/BHArsenal_slide_deck.pdf -------------------------------------------------------------------------------- /doc/usage_examples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eset/stadeo/447842592f3aa6d78be5ba58e0ec9d8e407d7fb2/doc/usage_examples.pdf -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 -*- 2 | # 3 | # Copyright (c) 2020 ESET spol. s r.o. 4 | # Author: Vladislav Hrčka 5 | # See LICENSE file for redistribution. 
# setup.py (continued) — package definition for the stadeo toolkit.

from setuptools import setup

setup(
    name='stadeo',
    version='0.0.1',
    packages=['stadeo', 'stadeo.cff', 'stadeo.utils', 'stadeo.string'],
    url='https://github.com/eset/stadeo',
    license='BSD',
    author='Vladislav Hrčka',
    author_email='vladislav.hrcka@eset.com',
    description='Stadeo is a set of tools for control-flow-flattening and string deobfuscation',
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
        "License :: OSI Approved :: BSD License",
        "Programming Language :: Python :: 3",
    ],
    install_requires=[
        # z3 is pinned to an exact release; miasm is pinned to an exact
        # upstream commit — the code relies on miasm internals that change
        # between revisions, so do not loosen these pins casually.
        'z3-solver==4.8.7.0',
        'sortedcontainers',
        'rpyc',
        'future',
        'miasm @ git+https://github.com/cea-sec/miasm@a01c29cd82f5a717e8dee622002e1ca3e189f420',
    ],
    keywords=[
        "reverse engineering",
        "symbolic execution",
        "deobfuscation",
        "control flow flattening",
        "string obfuscation",
        "Stantinko",
        "Emotet",
    ],
)
# ------------------------------------------------------------------------------
# stadeo/__init__.py — package marker, license header only.
# ------------------------------------------------------------------------------
# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.
# ------------------------------------------------------------------------------
# stadeo/cff/__init__.py — package marker, license header only.
# ------------------------------------------------------------------------------
# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.
# ------------------------------------------------------------------------------
# stadeo/cff/arcg_cache_depgraph.py
# ------------------------------------------------------------------------------
# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.

from miasm.analysis import depgraph
from miasm.analysis.data_flow import AssignblkNode
from miasm.expression.expression import *
from miasm.expression.simplifications import expr_simp


def is_local_variable(expr, ir_arch_a, mn):
    """Return True when @expr is a memory access into the local stack frame,
    None otherwise.

    @expr: candidate Expr
    @ir_arch_a: IR architecture (supplies the stack-pointer register via .sp)
    @mn: machine description (supplies regs_init, the initial register values)

    NOTE: returns True/None instead of True/False on purpose — the function
    doubles as an ExprWalk callback below, where None means "keep walking".
    """
    if not expr.is_mem():
        return None
    ptr = expr.ptr
    # Offset of the pointer relative to the initial stack pointer; a constant
    # signed-negative offset means the access is below the frame entry SP,
    # i.e. a local variable slot.
    diff = expr_simp(ptr - mn.regs.regs_init[ir_arch_a.sp])
    if diff.is_int() and int(expr_simp(expr_is_signed_lower(diff, ExprInt(0, diff.size)))):
        return True
    return None


def contains_local_variable(expr, ir_arch_a, mn):
    """Return a truthy value if any sub-expression of @expr is a local stack
    variable (see is_local_variable), else None."""
    visitor = ExprWalk(lambda x: is_local_variable(x, ir_arch_a, mn))
    return visitor.visit(expr)


def custom_init(self, ircfg, initial_state, state, inputs):
    """Replacement for depgraph.DependencyResult.__init__ (monkey-patched
    below). Mirrors the upstream constructor but additionally carries over
    the custom `pending_links` attribute from MyDependencyState when present.
    """
    super(depgraph.DependencyResult, self).__init__(state.loc_key, state.pending)
    self.initial_state = initial_state
    self.history = state.history
    self.pending = state.pending
    self.line_nb = state.line_nb
    self.inputs = inputs
    self.links = state.links
    self._ircfg = ircfg

    # Init lazy elements
    self._has_loop = None
    if hasattr(state, 'pending_links'):
        self.pending_links = state.pending_links


class MyDependencyState(depgraph.DependencyState):
    """DependencyState variant whose "done" identity is based on the pending
    expressions rather than upstream's representation, enabling the state
    memoization done by ArgCacheDependencyGraph.get below.
    """

    def __init__(self, *args, **kwargs):
        # Deliberate trick: `depgraph.DependencyState` is re-bound to THIS
        # class at module bottom, so at call time this super() resolves to the
        # original miasm DependencyState and runs its genuine __init__.
        super(depgraph.DependencyState, self).__init__(*args, **kwargs)
        self.pending_links = set()

    # state consisting of the pendings suits much better our needs
    def get_done_state(self):
        """Returns immutable object representing current state"""
        return self.loc_key, frozenset(self.pending)

    def extend(self, loc_key):
        """Return a copy of itself, with itself in history
        @loc_key: LocKey instance for the new DependencyState's loc_key
        """
        new_state = self.__class__(loc_key, self.pending)
        new_state.links = set(self.links)
        new_state.history = self.history + [loc_key]
        # Propagate the extra memoization bookkeeping into the copy.
        new_state.pending_links = set(self.pending_links)
        return new_state


def custom_visit_inner(self, expr, *args, **kwargs):
    """Replacement for depgraph.FilterExprSources.visit_inner (monkey-patched
    below): classify expressions into followed (ids, mems, calls) and
    not-followed (ints, locs) sources, then defer to the parent visit.
    """
    if expr.is_id():
        self.follow.add(expr)
    elif expr.is_int():
        self.nofollow.add(expr)
    elif expr.is_loc():
        self.nofollow.add(expr)
    elif expr.is_mem():
        self.follow.add(expr)
        # Record the mem itself but do not descend into its pointer unless
        # memory following is requested.
        if not self.follow_mem:
            return None
    elif expr.is_function_call():
        self.follow.add(expr)
        if not self.follow_call:
            return None

    ret = super(depgraph.FilterExprSources, self).visit(expr, *args, **kwargs)
    return ret


def is_push_param(recognizer, loc_key, index):
    """Heuristically decide whether the assignment at (@loc_key, @index) is a
    stack argument being pushed for a later call.

    Walks the IR CFG forward from the assignment. IDA (reached through the
    rpyc connection `recognizer.conn`) supplies idc.get_spd (stack-pointer
    delta at an address) and idaapi.get_arg_addrs (addresses of a call's
    argument setup instructions).

    Returns True if a reachable call lists the initial instruction among its
    argument-setup addresses before the stack slot is released.
    """
    initial_irb = recognizer.ircfg.blocks[loc_key]
    initial_assignblk = initial_irb[index]
    target_stack_ptr = recognizer.conn.modules.idc.get_spd(initial_assignblk.instr.offset)
    todo = [(loc_key, index + 1)]
    done = set()
    while todo:
        loc_key, index = todo.pop()
        if loc_key in done:
            continue
        done.add(loc_key)
        irb = recognizer.ircfg.blocks[loc_key]
        for assignblk in irb[index:]:
            if assignblk.instr and assignblk.instr.offset:
                stack_ptr = recognizer.conn.modules.idc.get_spd(assignblk.instr.offset)
                # Stack pointer back at/above the pushed slot: the slot is
                # dead, stop exploring this path.
                if stack_ptr and stack_ptr >= target_stack_ptr:
                    break
            for dst, src in assignblk.items():
                if src.is_function_call():
                    arg_addresses = recognizer.conn.modules.idaapi.get_arg_addrs(assignblk.instr.offset)
                    if arg_addresses and initial_assignblk.instr.offset in arg_addresses:
                        return True
                    break
        else:
            # Block exhausted without hitting a stop condition: keep scanning
            # in every successor, from its first line.
            for succ in recognizer.ircfg.successors(loc_key):
                todo.append((succ, 0))
    return False


# Install the replacements into miasm's depgraph module. Order matters for
# MyDependencyState.__init__'s super() trick (see comment there).
depgraph.FilterExprSources.visit_inner = custom_visit_inner
depgraph.DependencyState = MyDependencyState
depgraph.DependencyResult.__init__ = custom_init


class ArgCacheDependencyGraph(depgraph.DependencyGraph):
    """
    Since there's typically a number of sequential comparisons in cff loops, we take advantage of the fact and memoize
    already processed states. We can do this because the graph doesn't change. We also halt on mem stack arguments, they
    cannot be part of cff loops.
    """
    def __init__(self, recognizer, *args, **kwargs):
        super(ArgCacheDependencyGraph, self).__init__(*args, **kwargs)
        # Set to True whenever tracking hits a stack argument (see
        # _track_exprs); consumed and reset by get().
        self.incorrect = False
        self.ir = recognizer.ir_arch
        self.mn = recognizer.mn
        self.recognizer = recognizer
        self.defuse_edges = recognizer.analyses.defuse_edges
        # True when the last yielded result came from the memoization cache.
        self.cached = False
        self.new_cache_states = set()

    def _track_exprs(self, state, assignblk, line_nb):
        """Track pending expression in an assignblock"""
        if self.incorrect:
            return
        future_pending = {}
        node_resolved = set()
        for dst, src in assignblk.items():
            assignblk_node = AssignblkNode(state.loc_key, line_nb, dst)
            # Only track pending
            if dst not in state.pending:
                if type(src) in [ExprId, ExprOp, ExprCompose] and any(src in i for i in state.pending):
                    if assignblk_node in self.defuse_edges:
                        # targets function arguments such as lea eax, var; push eax since constant propagation doesn't
                        # work correctly in miasm; https://github.com/cea-sec/miasm/issues/1197;
                        # https://github.com/cea-sec/miasm/issues/1218; https://github.com/cea-sec/miasm/issues/1259;
                        # TODO when constant propagation is fixed, rework this; elaborate on 1259
                        for assignblk_node in self.defuse_edges[assignblk_node]:
                            if is_local_variable(assignblk_node.var, self.ir, self.mn) \
                                    and assignblk_node not in self.defuse_edges:
                                break
                        else:
                            continue
                    elif not is_local_variable(dst, self.ir, self.mn):
                        continue

                    if is_push_param(self.recognizer, assignblk_node.label, assignblk_node.index):
                        # prevents FPs in weird code such as push [ebp+var_18]; call ...; add esp, 4
                        # where [ebp+var_18] is not param and it's just pushed
                        self.incorrect = True
                        return
                continue
            # Track IRDst in implicit mode only
            if dst == self._ircfg.IRDst and not self._implicit:
                continue
            assert dst not in node_resolved
            node_resolved.add(dst)
            dependencies = self._follow_apply_cb(src)

            state.link_element(dst, line_nb)
            state.link_dependencies(dst, line_nb,
                                    dependencies, future_pending)

        # Update pending nodes
        state.remove_pendings(node_resolved)
        state.add_pendings(future_pending)

    def get(self, loc_key, elements, line_nb, heads, done_cache_states=None, incorrect_cache_states=None):
        """Compute the dependencies of @elements at line number @line_nb in
        the block named @loc_key in the current IRCFG, before the execution of
        this line. Dependency check stops if one of @heads is reached. The difference
        with the Miasm implementation is that we just want to know whether there's a
        non-integer dependency and optimize the computation that way.
        @loc_key: LocKey instance
        @element: set of Expr instances
        @line_nb: int
        @heads: set of LocKey instances
        @done_cache_states: optional set of already-processed (memoized) states
        @incorrect_cache_states: optional set of states known to be incorrect
        Return an iterator on DiGraph(DependencyNode)
        """
        # Init the algorithm
        if done_cache_states is None:
            done_cache_states = set()
        if incorrect_cache_states is None:
            incorrect_cache_states = set()
        inputs = {element: set() for element in elements}
        initial_state = depgraph.DependencyState(loc_key, inputs, line_nb)
        todo = {initial_state}
        dpResultcls = depgraph.DependencyResultImplicit if self._implicit else depgraph.DependencyResult
        self.incorrect = False
        new_cache_states = set()
        self.new_cache_states = new_cache_states

        while todo:
            state = todo.pop()
            self._compute_intrablock(state)
            # The caller inspects self.incorrect / self.cached on each yield;
            # both flags are reset immediately after yielding.
            if self.incorrect:
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                self.incorrect = False
                continue
            done_state = state.get_done_state()
            if done_state in incorrect_cache_states:
                self.incorrect = True
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                self.incorrect = False
                continue
            if done_state in done_cache_states | new_cache_states:
                self.cached = True
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                self.cached = False
                continue
            new_cache_states.add(done_state)
            if state.loc_key in heads or not state.pending:
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                continue

            if self._implicit:
                # Force IRDst to be tracked, except in the input block
                state.pending[self._ircfg.IRDst] = set()

            state.pending_links.add(done_state)
            # Propagate state to parents
            for pred in self._ircfg.predecessors_iter(state.loc_key):
                todo.add(state.extend(pred))
        # Only commit the freshly-computed states once the walk finished.
        done_cache_states.update(new_cache_states)
-------------------------------------------------------------------------------- /stadeo/cff/cff_recognizer.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 -*- 2 | # 3 | # Copyright (c) 2020 ESET spol. s r.o. 4 | # Author: Vladislav Hrčka 5 | # See LICENSE file for redistribution. 6 | 7 | from collections import namedtuple 8 | from hashlib import md5 9 | from miasm.analysis.data_flow import ReachingDefinitions, DiGraphDefUse, AssignblkNode 10 | import miasm.analysis.depgraph as depgraph 11 | from miasm.arch.x86 import regs 12 | from miasm.arch.x86.arch import expr_simp 13 | from miasm.core.locationdb import LocationDB 14 | from miasm.expression.expression import * 15 | from miasm.ir.ir import AssignBlock, IRBlock 16 | import logging 17 | 18 | from miasm.ir.symbexec import SymbolicExecutionEngine 19 | from sortedcontainers import SortedSet 20 | 21 | from stadeo.cff.arcg_cache_depgraph import ArgCacheDependencyGraph, contains_local_variable 22 | from stadeo.utils.extended_asmcfg import ExtendedAsmCFG, is_bad_expr, remove_redundant_and_unpin_blocks 23 | 24 | logger = logging.getLogger('CFFrecognizer') 25 | logger.setLevel(logging.WARNING) 26 | 27 | 28 | # logger.basicConfig(stream=sys.stderr, level=logger.DEBUG) 29 | # logger.basicConfig(filename="solver.log", level=logger.DEBUG) 30 | 31 | 32 | class FlatteningLoop(object): 33 | def __init__(self, head_vars: list, primary_loc_keys: set, affected_lines: dict, affected_exprs: dict 34 | , loc_key: LocKey): 35 | # TODO replace loc_key with seq IDs 36 | self.affected_exprs = affected_exprs 37 | self.loc_key = loc_key 38 | self.head_vars = head_vars 39 | self.affected_lines = affected_lines 40 | self.primary_loc_keys = primary_loc_keys 41 | self.is_default = False 42 | self._seq = 0 43 | 44 | def get_affected_hash(self, symb_exec, block_loc_key, flat_loop, source_hash_value): 45 | hash_list = [block_loc_key, source_hash_value] 46 | for head_var in self.head_vars: 
47 | hash_list.append((head_var, symb_exec.eval_expr(head_var))) 48 | for affected_expr in flat_loop.affected_exprs[block_loc_key]: 49 | hash_list.append((affected_expr, symb_exec.eval_expr(affected_expr))) 50 | seq = False 51 | if not flat_loop.affected_exprs[block_loc_key]: 52 | hash_list.append(self._seq) 53 | self._seq += 1 54 | seq = True 55 | new_hash = int(md5(bytes(str(hash_list), 'ascii')).hexdigest(), 16) 56 | return new_hash, seq 57 | 58 | 59 | class FlatteningLoops(object): 60 | def __init__(self): 61 | self._loc_key_to_loop = {} 62 | self.loc_db = LocationDB() 63 | self.loops = [] 64 | # for blocks outside of any loop 65 | self._outside_of_scope = FlatteningLoop([], set(), {}, {}, self.loc_db.add_location()) 66 | self._outside_of_scope.is_default = True 67 | self._address = None 68 | 69 | def get_block(self, block_loc_key, symb_exec, source_flat_block=None): 70 | flat_loop = self[block_loc_key] 71 | flat_hash = source_hash_value = source_loop_loc_key = None 72 | if flat_loop.is_default: 73 | if source_flat_block: 74 | source_loop_loc_key = source_flat_block.source_loop_loc_key or source_flat_block.block_loc_key 75 | source_flat_loop = self[source_loop_loc_key] 76 | source_hash_value = source_flat_block.source_hash_value or source_flat_block.control_hash_value 77 | if block_loc_key in source_flat_loop.affected_lines: 78 | flat_hash, no_affected_expr = \ 79 | flat_loop.get_affected_hash(symb_exec, block_loc_key, source_flat_loop, None) 80 | source_hash_value = None 81 | else: 82 | flat_hash, _ = flat_loop.get_affected_hash(symb_exec, block_loc_key, flat_loop, None) 83 | # TODO check init block too to prevent initial duplicity in case of loops(eliminated by the decompiler) 84 | flat_block = FlatteningBlock(flat_loop.loc_key, source_loop_loc_key, block_loc_key, flat_hash, 85 | source_hash_value) 86 | return flat_block 87 | 88 | def create(self, head_vars, affected_lines, primary_loc_keys, ircfg, address): 89 | self._address = hex(address) if address else 
"None" 90 | affected_exprs = {} 91 | dp = depgraph.DependencyGraph(ircfg, True) 92 | for block_loc_key in affected_lines: 93 | block = ircfg.blocks[block_loc_key] 94 | cur_affected_exprs = SortedSet(key=lambda x: str(x)) 95 | for line_nb in affected_lines[block_loc_key]: 96 | affected_assignments = block.assignblks[line_nb] 97 | for ind, (dst, src) in enumerate(affected_assignments.items()): 98 | if type(src) not in [ExprInt, ExprMem]: 99 | res = next(dp.get(block_loc_key, {dst}, ind, {block_loc_key})) 100 | cur_affected_exprs.update(filter(lambda x: not is_bad_expr(x), res.pending.keys())) 101 | affected_exprs[block_loc_key] = cur_affected_exprs 102 | loop = FlatteningLoop(list(head_vars), primary_loc_keys, affected_lines, affected_exprs, 103 | self.loc_db.add_location()) 104 | upd = {} 105 | for i in loop.primary_loc_keys: 106 | if i in self._loc_key_to_loop: 107 | raise RuntimeError("Overlap of primary blocks of the flattening loops") 108 | upd[i] = loop 109 | self._loc_key_to_loop.update(upd) 110 | self.loops.append(loop) 111 | return loop 112 | 113 | def __getitem__(self, loc_key): 114 | """ 115 | Retrieves particular flattening loop by ID of the block 116 | :param loc_key: 117 | :return: 118 | """ 119 | return self._loc_key_to_loop.get(loc_key, self._outside_of_scope) 120 | 121 | def __contains__(self, loc_key): 122 | return loc_key in self._loc_key_to_loop 123 | 124 | def __len__(self): 125 | return len(self.loops) 126 | 127 | 128 | FlattenState = namedtuple('FlattenState', 'flat_block, symbols') 129 | 130 | 131 | class ConfirmedMergeFunc(object): 132 | def __init__(self, recognizer, vals): 133 | self.recognizer = recognizer 134 | self.vals = vals 135 | 136 | 137 | class FlatteningBlock(object): 138 | """ 139 | We don't need any what flattening loop the block belongs to since they are all disjunct. 
class Analyses(object):
    """Pre-computed CFG analyses: def-use edges, dominators and back edges."""

    def __init__(self, ircfg, asmcfg):
        self.defuse_edges = {}
        self.reaching_defs = ReachingDefinitions(ircfg)
        defuse = DiGraphDefUse(self.reaching_defs, deref_mem=False, apply_simp=True)
        heads = asmcfg.heads()
        self.dominators = asmcfg.compute_dominators(heads[0])
        self.immediate_dominators = asmcfg.compute_immediate_dominators(heads[0])

        self.back_edges = []
        self.rev_back_edges = {}
        for node in asmcfg.walk_depth_first_forward(heads[0]):
            for successor in asmcfg.successors_iter(node):
                # an edge to a dominator is a back edge
                if successor in self.dominators[node]:
                    self.rev_back_edges.setdefault(successor, set()).add(node)
                    self.back_edges.append((node, successor))

        for src, dst in defuse.edges():
            self.defuse_edges.setdefault(src, []).append(dst)


class CFFRecognizer(object):
    """Recognizes control-flow-flattening loops in one function via IDA (rpyc) + miasm."""

    def __init__(self, file_path, func_address, machine, conn):
        self.ir_arch = None
        self.func_address = func_address
        self.asmcfg = None
        self.file_path = file_path
        self.all_affected_lines = {}
        self.flat_loops = FlatteningLoops()
        self.machine = machine
        self.mn = machine.mn
        self._merging_var_candidates = None
        self.merging_var = None
        self.possible_merge_funcs = set()
        self.conn = conn
        # addresses of all functions known to IDA
        self.func_addresses = set(conn.modules.idautils.Functions())
        self.ircfg = None
        self.pad = False
        self.analyses = None

    @staticmethod
    def _resize_top_expr(expr, size):
        """Rebuild expr with the given bit size, or return None when not applicable."""
        cls, state = expr.__reduce__()
        if expr.is_slice():
            return ExprSlice(expr.arg, 0, size)
        elif isinstance(state[-1], int):
            # the trailing int must be the size since all the other args are Expr instances
            return cls(*state[:-1], size)
        elif expr.is_op() and expr.op.startswith("zeroExt"):
            return ExprOp("zeroExt_" + str(size), *expr.args)
        return None

    def _normalize_ircfg(self, conn):
        """Unalias stack/base-pointer accesses and simplify compose expressions in the IR CFG.

        See miasm.re/blog/2017/02/03/data_flow_analysis_depgraph.html; base pointer is
        handled manually here.
        TODO remove manual *BP propagation and use standard miasm propagation when fixed.
        """
        bp = {}
        prev_offset = None
        for irb_loc_key in self.ircfg.walk_breadth_first_forward(LocKey(0)):
            irs = []
            if irb_loc_key not in self.ircfg.blocks:
                continue
            irb = self.ircfg.blocks[irb_loc_key]
            if irb.dst.is_cond() and irb.dst.cond.is_op() and irb.dst.cond.op == 'CC_EQ':
                # TODO propagate cmp ..., arb_int too
                # propagate known zeroes to process test eax, eax; jnz ...; lea edi, [eax+4]
                symb_exec = SymbolicExecutionEngine(self.ir_arch)
                dst = symb_exec.eval_updt_irblock(irb)
                if dst.is_cond() and dst.cond.is_id() and not is_bad_expr(dst.cond) and \
                        symb_exec.eval_expr(dst.cond) == dst.cond:
                    # add explicit mov ID, 0 to the fall-through target
                    target_loc = dst.src2
                    if target_loc.is_int():
                        target_loc = self.asmcfg.loc_db.get_offset_location(int(target_loc))
                    elif target_loc.is_loc():
                        target_loc = target_loc.loc_key
                    else:
                        continue
                    if len(self.ircfg.predecessors(target_loc)) > 1:
                        continue
                    target_irb = self.ircfg.blocks[target_loc]
                    asign_blk = AssignBlock([ExprAssign(dst.cond, ExprInt(0, dst.cond.size))])
                    assignblks = tuple([asign_blk, *target_irb.assignblks])
                    self.ircfg.blocks[target_loc] = IRBlock(target_loc, assignblks)
            fix_dct = {}
            for assignblk in irb:
                offset = prev_offset
                if assignblk.instr and assignblk.instr.offset:
                    offset = assignblk.instr.offset
                    prev_offset = offset
                spd = conn.modules.idc.get_spd(offset)
                if spd is not None:
                    stk_high = ExprInt(spd, self.ir_arch.sp.size)
                    fix_dct = {self.ir_arch.sp: self.mn.regs.regs_init[self.ir_arch.sp] + stk_high}
                    fix_dct.update(bp)
                else:
                    logger.warning("Couldn't acquire stack depth at 0x%x" % (offset or 0x0BADF00D))

                new_assignblk = {}
                for dst, src in assignblk.items():
                    if src.is_compose():
                        # drop bigger-to-smaller composes; they are not important for us
                        slc_arg = None
                        arg = None
                        for tmp_arg in src.args:
                            if not tmp_arg.is_slice():
                                arg = tmp_arg
                            else:
                                slc_arg = tmp_arg
                        if slc_arg and arg and len(arg.get_r()) == 1:
                            top_to_bottom_visitor = ExprVisitorCallbackTopToBottom(
                                lambda x: self._resize_top_expr(x, src.size))
                            src = top_to_bottom_visitor.visit(arg)
                    if dst == src:
                        # special compiler anomalies such as lea esp, [esp+0]
                        continue
                    if src == self.ir_arch.sp:
                        src = expr_simp(src.replace_expr(fix_dct))
                        if bp and src not in bp.values() and irb_loc_key != LocKey(0):
                            raise RuntimeError("Ambiguous base pointer")
                        bp.update({dst: src})
                        fix_dct.update(bp)
                    else:
                        src = expr_simp(src.replace_expr(fix_dct))
                        # NOTE(review): reconstructed nesting — dst rewrite assumed to apply
                        # only on the non-SP branch; confirm against upstream stadeo
                        if dst != self.ir_arch.sp and dst not in bp.keys():
                            dst = dst.replace_expr(fix_dct)
                    dst, src = expr_simp(dst), expr_simp(src)
                    new_assignblk[dst] = src
                irs.append(AssignBlock(new_assignblk, instr=assignblk.instr))
            self.ircfg.blocks[irb.loc_key] = IRBlock(irb.loc_key, irs)

    def _recog_init(self, merging_var_candidates):
        """Build and normalize the IR CFG; optionally pad block 0 with candidate self-assignments.

        Returns the original block 0 when padding was applied (so it can be restored), else None.
        """
        self._merging_var_candidates = merging_var_candidates
        self.ircfg = self.ir_arch.new_ircfg_from_asmcfg(self.asmcfg)
        self.asmcfg.rebuild_edges()

        # TODO put constant propagation here when fixed in Miasm
        # (SSA-based do_propagate_expressions / propagate_cst_expr experiments removed)

        self._normalize_ircfg(self.conn)
        irb_bak = None
        if merging_var_candidates:
            self.pad = True
            new_line = AssignBlock([ExprAssign(k, k) for k in merging_var_candidates])
            irb_bak = self.ircfg.blocks[LocKey(0)]
            new_irb = IRBlock(LocKey(0), tuple([new_line, *self.ircfg.blocks[LocKey(0)].assignblks]))
            self.ircfg.blocks[LocKey(0)] = new_irb

        self.analyses = Analyses(self.ircfg, self.asmcfg)
        return irb_bak

    def clear_cache(self):
        """Drop all per-function analysis state."""
        # TODO save to disk and recover when needed
        self.asmcfg = None
        self.ircfg = None
        self.analyses = None
        self.ir_arch = None
        self.all_affected_lines = {}
        self.flat_loops = FlatteningLoops()
        self.possible_merge_funcs = set()
        self._merging_var_candidates = None

    def _recognize(self, max_loop_num):
        """Symbolically walk the IR CFG, detecting flattening loops and merge candidates."""
        symb_engine = SymbolicExecutionEngine(self.ir_arch, regs.regs_init)
        todo = [(LocKey(0), symb_engine.get_state())]
        done_loc = set()
        if not max_loop_num:
            max_loop_num = float('inf')
        found_loops_num = 0
        while todo:
            loc_key, symb_state = todo.pop()
            if loc_key in done_loc or loc_key not in self.ircfg.blocks:
                continue
            done_loc.add(loc_key)
            ir_block = self.ircfg.blocks[loc_key]
            symb_engine.set_state(symb_state)
            for ind, assignblk in enumerate(ir_block.assignblks):
                for dst, src in assignblk.items():
                    if max_loop_num < found_loops_num:
                        return
                    if src.is_int() and int(src) in self.func_addresses:
                        assignblk_node = AssignblkNode(ir_block.loc_key, ind, dst)
                        # a function address with no uses: possible virtual table initialization
                        if assignblk_node not in self.analyses.defuse_edges or not \
                                self.analyses.defuse_edges[assignblk_node]:
                            self.possible_merge_funcs.add((int(src), frozenset(), loc_key))
                    elif src.is_op("call_func_stack"):
                        self._process_call(src, dst, symb_engine, assignblk, loc_key)
                    elif (expr_simp(src).is_int() and not is_bad_expr(dst)) \
                            or (ir_block.loc_key == LocKey(0) and dst == src and
                                (not self._merging_var_candidates or dst in self._merging_var_candidates)):
                        if self._process_assignment(ir_block, ind, dst):
                            self._merging_var_candidates = None
                            found_loops_num += 1
                symb_engine.eval_updt_assignblk(assignblk)

            for succ in self.ircfg.successors(loc_key):
                todo.append((succ, symb_engine.get_state()))

    def recognize(self, max_loop_num=False, merging_var_candidates=None):
        """Disassemble (once) and recognize flattening loops.

        :param max_loop_num: maximum number of loops to reveal; falsy means unlimited
        :param merging_var_candidates: optional candidate merging variables
        """
        if not merging_var_candidates:
            merging_var_candidates = None
        if not self.asmcfg:
            self.asmcfg = ExtendedAsmCFG(self.file_path, self.conn)
            self.asmcfg.disassemble(self.func_address, self.conn)
            remove_redundant_and_unpin_blocks(self.asmcfg, LocKey(0), self.asmcfg.mode, unpin=False)
            block_nb = len(self.asmcfg.blocks)
            if block_nb > 4250:
                self.clear_cache()
                logger.critical("Function is too big")
                raise RuntimeError("Function is too big")
            self.ir_arch = self.machine.ira(self.asmcfg.loc_db)
            # setting merging vars
            if self.merging_var:
                merging_var_candidates = {self.merging_var}
        else:
            # NOTE(review): reconstructed nesting — assumed an already-built asmcfg means
            # recognition was done before, so bail out; confirm against upstream stadeo
            return
        try:
            irb_bak = self._recog_init(merging_var_candidates)
        except RuntimeError:
            logger.warning("Exotic stack operations, skipping")
            return
        self._recognize(max_loop_num)
        if merging_var_candidates:
            self.ircfg.blocks[LocKey(0)] = irb_bak

    def _process_assignment(self, ir_block, ind, dst):
        """Try to grow flattening loops from one constant assignment; True on success."""
        assignblk_node = AssignblkNode(ir_block.loc_key, ind, dst)
        # loop id 0 is the default
        logger.debug("Processing %s" %
                     hex(self.asmcfg.loc_db.get_location_offset(ir_block.loc_key) or 0))
        local_affected_lines = {}
        affected_irdsts, possible_nodes = self._get_affected_ir_destinations(assignblk_node, local_affected_lines)
        result = False
        for node in self.asmcfg.walk_breadth_first_forward(LocKey(0)):
            if node in possible_nodes:
                filtered_irdsts = self._filter_sequential_loc_keys(node, affected_irdsts)
                affected_lines = {}
                result |= self._create_flattening_loop(node, filtered_irdsts, affected_lines)
        return result

    def _process_call(self, src, dst, symb_engine, assignblk, loc_key):
        """Record a called known function plus its merging-variable candidates."""
        addr = src.args[0]
        if addr.is_mem():
            addr = addr.ptr
        if addr.is_loc():
            addr = self.asmcfg.loc_db.get_location_offset(addr.loc_key)
        if isinstance(addr, int) or addr.is_int():
            addr = int(addr)
            if addr in self.func_addresses:
                candidates = self._get_merging_var_candidates(symb_engine, assignblk, dst)
                self.possible_merge_funcs.add((addr, frozenset(candidates), loc_key))

    def _get_merging_var_candidates(self, symb_engine, assignblk, dst):
        """Collect (expr, value) pairs that could act as merging variables at a call site."""
        stk_high = ExprInt(self.conn.modules.idc.get_spd(assignblk.instr.offset),
                           self.ir_arch.sp.size)
        init_sp = self.mn.regs.regs_init[self.ir_arch.sp]
        fix_dct = {init_sp: - stk_high + init_sp + ExprInt(dst.size // 8, dst.size)}
        candidates = set()  # values are tuples (key, val)
        for key, val in symb_engine.modified(regs.regs_init):
            if not val.is_int() or not val.size > 1 or type(key) not in [ExprId, ExprMem] \
                    or key.is_id() and key.name in ["RIP", "EIP", self.ircfg.IRDst.name]:
                continue
            if not key.is_id():
                # rebase memory keys to relative stack depth
                key = expr_simp(key.replace_expr(fix_dct))
            candidates.add((key, val))
        return candidates

    def _get_affected_ir_destinations(self, assignblk_node, local_affected_lines):
        """Follow def-use edges from one assignment to every affected IRDst."""
        todo = [assignblk_node]
        processed = set()
        result = {}
        possible_nodes = {assignblk_node.label}
        while todo:
            target = todo.pop()
            if not self.flat_loops[target.label].is_default:
                logger.debug("Overlap at %s skipping" % hex(
                    self.ircfg.loc_db.get_location_offset(target.label) or 0xbadf0d))
                return set(), set()

            if target in processed:
                continue
            local_affected_lines.setdefault(target.label, set()).add(target.index)
            processed.add(target)
            if target.var == self.ircfg.IRDst:
                result[target.label] = target
                possible_nodes.add(target.label)
            for use in self.analyses.defuse_edges.get(target, []):
                todo.append(use)
        return result, possible_nodes

    def _filter_sequential_loc_keys(self, node, affected):
        """Keep only the affected IRDst nodes reachable from node without leaving the loop."""
        if node in self.all_affected_lines:
            return set()
        todo = [node]
        accessible = {}
        done = set()
        fst = False  # found the first comparison
        while todo:
            target = todo.pop()
            if target in accessible or target in done:
                continue
            done.add(target)
            succs = self.asmcfg.successors(target)
            irb = self.ircfg.blocks[target]
            if target in affected:
                fst = True
                accessible[target] = affected[target]
            elif len(succs) > 1 and fst:
                continue
            if irb.dst.is_cond() and irb.dst.cond.is_op("CC_EQ"):
                succs = [self.ircfg.get_loc_key(irb.dst.src2.loc_key)]
            todo += succs
        return set(accessible.values())

    def _add_2_control_vars(self, primary_loc_keys, affected_lines, merging_var, done, head_vars):
        """Handle cff loops with 2 control variables introduced by compiler "optimization"
        (range(0,x,1) and range(0,x,1*y)); raises RuntimeError when no back-edge is found.
        """
        found = False
        for disp_loc, last_cff_locs in self.analyses.rev_back_edges.items():
            if disp_loc not in primary_loc_keys:
                continue
            for last_cff_loc in last_cff_locs:
                preds = self.asmcfg.predecessors(last_cff_loc)
                succs = self.asmcfg.successors(last_cff_loc)
                if not len(succs) > 1 and len(preds) == 1 and last_cff_loc not in affected_lines:
                    last_cff_loc = preds[0]
                    succs = self.asmcfg.successors(last_cff_loc)
                if last_cff_loc not in primary_loc_keys:
                    if len(succs) > 1:
                        primary_loc_keys.add(last_cff_loc)
                        opti_node = AssignblkNode(last_cff_loc,
                                                  len(self.ircfg.blocks[last_cff_loc].assignblks) - 1,
                                                  self.ir_arch.IRDst)
                        self._process_affected_irdsts({opti_node}, affected_lines, primary_loc_keys, merging_var,
                                                      done, head_vars)
                        if last_cff_loc in primary_loc_keys:
                            # otherwise last_cff_loc couldn't be determined and was removed from primaries
                            found = True
                    if last_cff_loc in affected_lines:
                        found = True
                else:
                    found = True
        if not found:
            raise RuntimeError("There must be a back-edge")

    def _cff_loop_sanity_check(self, primary_loc_keys, node, affected_lines):
        """Validate loop structure; raises RuntimeError when the shape is implausible."""
        wanted_jump_tables = \
            {i for i in self.asmcfg.jmp_table_loc_keys
             if set(self.ircfg.predecessors(i)).issubset(primary_loc_keys)}
        wanted = affected_lines.keys() | wanted_jump_tables
        tolerance = self._merging_var_candidates is not None
        for i in primary_loc_keys:
            if i not in self.asmcfg.heads() | self.analyses.rev_back_edges.keys() and \
                    (self.analyses.immediate_dominators[i] not in wanted and
                     (set(self.ircfg.predecessors(i)).issubset(wanted) or
                      i in self.analyses.rev_back_edges.values())):
                # immediate dominators must contain affected blocks, except the first one, jump
                # table blocks and, for a merged function, the initial range check
                if tolerance:
                    # initial range check check
                    if all(i in self.analyses.dominators[j] or i == j for j in primary_loc_keys):
                        tolerance = False
                        continue
                raise RuntimeError()

        if not (len(primary_loc_keys) > 1 and
                all(node in self.analyses.dominators[i] for i in primary_loc_keys)):
            # the assignment has to dominate all the primary loc_keys
            raise RuntimeError()

    def _create_flattening_loop(self, node, assign_blocks, affected_lines):
        """Try to register a flattening loop rooted at node; assign_blocks are affected IRDsts."""
        if not len(assign_blocks) > 1:
            return False

        primary_loc_keys = {i.label for i in assign_blocks}
        done = set()
        head_vars = set()
        merging_var = self._process_affected_irdsts(assign_blocks, affected_lines, primary_loc_keys,
                                                    self.merging_var, done, head_vars)
        try:
            self._add_2_control_vars(primary_loc_keys, affected_lines, merging_var, done, head_vars)
            self._cff_loop_sanity_check(primary_loc_keys, node, affected_lines)
        except RuntimeError:
            return False
        if merging_var:
            self.merging_var = merging_var
            logger.debug("setting merging var %s" % merging_var)
        try:
            self.flat_loops.create(head_vars, affected_lines, primary_loc_keys, self.ircfg,
                                   self.ircfg.loc_db.get_location_offset(node))
        except RuntimeError:
            return False
        logger.debug("adding")
        for tmp_loc_key, lines in affected_lines.items():
            self.all_affected_lines.setdefault(tmp_loc_key, SortedSet()).update(lines)
        return True

    def _process_affected_irdsts(self, assign_blocks, affected_lines, primary_loc_keys, merging_var, done, head_vars):
        """Run dependency analysis for each affected IRDst, pruning undecidable blocks.

        Returns the (possibly updated) merging variable.
        """
        dg = ArgCacheDependencyGraph(self, self.ircfg, implicit=False, follow_mem=False, follow_call=False)
        possible_merging_var = merging_var
        incorrect_cache = set()
        for ind, assign_block in enumerate(assign_blocks):
            if self.asmcfg.loc_db.get_location_offset(assign_block.label):
                logger.debug("processing assign_block %d out of %d at %s" %
                             (ind + 1, len(assign_blocks),
                              hex(self.asmcfg.loc_db.get_location_offset(assign_block.label) or 0)))
            base_expr_ids = self.ircfg.blocks[assign_block.label][assign_block.index][assign_block.var].get_r()
            local_done = set(done)
            dr = dg.get(assign_block.label, base_expr_ids, assign_block.index, self.asmcfg.heads(), local_done,
                        incorrect_cache)
            local_affected_lines = {}
            local_head_vars = set()
            for sol in dr:
                if not dg.cached:
                    if not dg.incorrect:
                        possible_merging_vars = set()
                        for pnd in sol.pending:
                            if not contains_local_variable(pnd, self.ir_arch, self.mn):
                                if pnd != self.mn.regs.regs_init[self.ir_arch.sp]:
                                    possible_merging_vars.add(pnd)
                                elif sol.loc_key == LocKey(0):
                                    local_head_vars.add(pnd)

                        pmv_len = len(possible_merging_vars)
                        if self._merging_var_candidates and possible_merging_vars:
                            possible_merging_var = possible_merging_vars.pop()
                    if dg.incorrect or pmv_len > (self._merging_var_candidates is not None) \
                            or possible_merging_var and self._merging_var_candidates and \
                            possible_merging_var not in self._merging_var_candidates or \
                            (merging_var and possible_merging_var != merging_var):
                        incorrect_cache.update(sol.pending_links)
                        primary_loc_keys.remove(assign_block.label)
                        if self.asmcfg.loc_db.get_location_offset(assign_block.label):
                            logger.debug("%s cannot be determined" %
                                         hex(self.asmcfg.loc_db.get_location_offset(assign_block.label) or 0))
                        break
                    else:
                        merging_var = possible_merging_var
                        self._add_relevant_nodes(sol.relevant_nodes, local_affected_lines)
            else:
                # no break: every solution was acceptable, commit the local results
                done.update(local_done)
                head_vars.update(local_head_vars)
                affected_lines.setdefault(assign_block.label, SortedSet()).add(assign_block.index)
                for loc_key, lines in local_affected_lines.items():
                    affected_lines.setdefault(loc_key, SortedSet()).update(lines)
        return merging_var

    @staticmethod
    def _add_relevant_nodes(relevant_nodes, affected_lines):
        for node in relevant_nodes:
            affected_lines.setdefault(node.loc_key, SortedSet()).add(node.line_nb)


# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.
import logging

from miasm.analysis.depgraph import DependencyGraph
from miasm.arch.x86.arch import instruction_x86, additional_info
from miasm.core.asmblock import AsmConstraintTo, AsmBlock, AsmConstraint, AsmCFG
from miasm.core.locationdb import LocationDB
from miasm.expression.expression import *
from miasm.ir.ir import AssignBlock, IRBlock
from miasm.ir.symbexec import SymbolicExecutionEngine

from stadeo.utils.extended_asmcfg import create_jump_instruction
from stadeo.cff.cff_recognizer import FlattenState

logger = logging.getLogger('CFFsolver')
logger.setLevel(logging.WARNING)


class CFFSolver(object):
    """Rebuilds a deobfuscated AsmCFG from the flattening loops found by a CFFRecognizer."""

    def __init__(self, recognizer):
        self.ircfg = recognizer.ircfg
        self.asmcfg = recognizer.asmcfg
        self.flat_loops = recognizer.flat_loops
        self.all_affected_lines = recognizer.all_affected_lines
        self.ir_arch = recognizer.ir_arch
        loc_db = LocationDB()
        loc_db.merge(recognizer.asmcfg.loc_db)
        self.out_asmcfg = AsmCFG(loc_db)
        self.merging_var = recognizer.merging_var
        self.pad = recognizer.pad
        self.possible_merge_funcs = recognizer.possible_merge_funcs
        self.relevant_nodes = set()

    def process(self, pending, merging_val, reached_funcs):
        """Deobfuscate the recognized loops.

        :param pending: dict collecting merging-variable values per reached function
        :param merging_val: concrete value of the merging variable, if any
        :param reached_funcs: set collecting addresses of reached functions
        :return: loc_key of the new function head, or None when nothing was flattened
        """
        if len(self.flat_loops) == 0:
            # nothing to solve: just propagate all reached functions
            for func_addr, possible_merge_vars, loc_key in self.possible_merge_funcs:
                reached_funcs.add(func_addr)
                for expr, val in possible_merge_vars:
                    pending.setdefault(func_addr, {}).setdefault(expr, set()).add(val)
            return None

        assert len(self.asmcfg.heads()) == 1

        # add merging var to the ircfg
        if self.pad:
            initial_block_bak = self.ircfg.blocks[LocKey(0)]
            if merging_val and self.merging_var:
                asgn_blk = AssignBlock([ExprAssign(self.merging_var, merging_val)])
            else:
                asgn_blk = AssignBlock()
            assignblks = tuple([asgn_blk, *self.ircfg.blocks[LocKey(0)].assignblks])
            self.ircfg.blocks[LocKey(0)] = IRBlock(LocKey(0), assignblks)

        head = self.asmcfg.heads()[0]
        head_block = self.asmcfg.loc_key_to_block(head)
        new_head = self._deobfuscate_cff_loops(head_block, self.asmcfg.machine.mn.regs.regs_init)

        if self.pad:
            self.ircfg.blocks[LocKey(0)] = initial_block_bak
            if merging_val and self.merging_var:
                # materialize the merging-variable value as a MOV in the output head block
                mode = self.asmcfg.mode
                fix_dct = {self.asmcfg.machine.mn.regs.regs_init[self.ir_arch.sp]: self.ir_arch.sp}
                mov = instruction_x86("MOV", mode, [self.merging_var.replace_expr(fix_dct), merging_val])
                mov.additional_info = additional_info()
                mov.additional_info.g1.value = 0
                self.out_asmcfg.loc_key_to_block(LocKey(0)).lines.insert(0, mov)

        loc_keys = self.relevant_nodes
        for func_addr, possible_merge_vars, loc_key in self.possible_merge_funcs:
            if loc_key in loc_keys:
                reached_funcs.add(func_addr)
                for expr, val in possible_merge_vars:
                    pending.setdefault(func_addr, {}).setdefault(expr, set()).add(val)

        return new_head

    def _insert_flat_block(self, source_flat_block, symb_exec, flat_block_to_loc_key):
        """
        Copies source_flat_block and sets its successors according to flat_block_to_loc_key
        :param flat_block_to_loc_key: dictionary mapping flat_blocks to respective loc_keys
        :param symb_exec: instance of current symbolic execution engine
        :param source_flat_block: flat_block to be inserted
        :return: set of newly discovered flat blocks
        """
        # we're not using redirect_successors after copying to avoid executing the same loops multiple times
        source_block = self.asmcfg.loc_key_to_block(source_flat_block.block_loc_key)
        tobe_processed = {}
        new_flat_blocks = set()
        new_block_loc_key = flat_block_to_loc_key[source_flat_block]
        if self.out_asmcfg.loc_key_to_block(new_block_loc_key) is not None:
            raise Exception("Target loc_key is already associated to a block")
        new_block = AsmBlock(new_block_loc_key)

        # copy instructions
        for ln in source_block.lines:
            tmp_ln = instruction_x86(ln.name, ln.mode, [i.copy() for i in ln.args], ln.additional_info)
            tmp_ln.b = ln.b
            tmp_ln.l = ln.l
            tmp_ln.offset = ln.offset
            new_block.addline(tmp_ln)

        constraints = source_block.bto
        # try to simplify the destination if it's a primary flattening block
        if not self.flat_loops[source_block.loc_key].is_default:
            logger.debug("current block is a part of primary loc_keys")
            simplified_target = symb_exec.eval_expr(self.ircfg.IRDst)
            if isinstance(simplified_target, ExprInt):
                simplified_target = self.asmcfg.loc_db.get_offset_location(int(simplified_target))
            elif isinstance(simplified_target, ExprLoc):
                simplified_target = simplified_target.loc_key
            else:
                # there's probably a(n) (series of) unknown instruction(s) causing an implicit conditional
                # assignment such as CMOV or SBB->AND->ADD, prepend comparison + cond jump if it happens to be
                # common, or add it to ExtendedAsmCFG.extended_discovery and split flow on the final instruction

                # it's also possible that it's not related to any cff loop at all
                addr = self.asmcfg.loc_db.get_location_offset(source_flat_block.block_loc_key)
                addr = hex(addr) if addr else addr
                logger.warning("Couldn't simplify loc_key %s at %s, continuing" %
                               (str(source_flat_block.block_loc_key), addr))
                logger.warning("the simplified target is %s of instance %s" %
                               (simplified_target, type(simplified_target)))
                simplified_target = None
            if simplified_target:
                constraints = {AsmConstraintTo(simplified_target)}
                mode = self.asmcfg.mode

                # remove redundant comparison
                dp = DependencyGraph(self.ircfg, True)
                block_loc_key = source_block.loc_key
                res = next(dp.get(block_loc_key, {self.ircfg.IRDst}, None, {block_loc_key}))
                for depnode in res.relevant_nodes:
                    ind = depnode.line_nb
                    ind -= (len(self.ircfg.blocks[block_loc_key]) - len(new_block.lines))
                    if new_block.lines[ind].name == "CMP":
                        new_block.lines.pop(ind)

                new_block.lines[-1] = create_jump_instruction(mode, ExprLoc(simplified_target, mode))

        # copy constraints
        new_bto = set()
        for constraint in constraints:
            if not self.asmcfg.loc_key_to_block(constraint.loc_key):
                logger.debug("Skipping bad constraint %s" % constraint.loc_key)
                continue
            flat_block = self.flat_loops.get_block(constraint.loc_key, symb_exec, source_flat_block)
            if flat_block not in flat_block_to_loc_key:
                new_flat_blocks.add(flat_block)
                new_loc_key = self.out_asmcfg.loc_db.add_location()
                tobe_processed[constraint.loc_key] = (new_loc_key, flat_block)
                flat_block_to_loc_key[flat_block] = new_loc_key
            else:
                new_loc_key = flat_block_to_loc_key[flat_block]
            new_bto.add(AsmConstraint(new_loc_key, constraint.c_t))
        new_block.bto = new_bto
        new_block.alignment = source_block.alignment

        # change jmp targets
        if new_block.lines:
            for ind, arg in enumerate(list(new_block.lines[-1].args)):
                if isinstance(arg, ExprLoc):
                    if not self.asmcfg.loc_key_to_block(arg.loc_key):
                        logger.debug("Skipping bad constraint %s" % arg.loc_key)
                        continue
                    new_target, flat_block = tobe_processed.get(arg.loc_key, (None, None))
                    if not new_target:
                        flat_block = self.flat_loops.get_block(arg.loc_key, symb_exec, source_flat_block)
                        new_target = flat_block_to_loc_key.get(flat_block)
                        # None in case of irrelevant calls
                    logger.debug("new target: %s" % new_target)
                    if new_target:
                        new_block.lines[-1].args[ind] = ExprLoc(new_target, arg.size)

        self.out_asmcfg.add_block(new_block)
        return new_flat_blocks

    def _deobfuscate_cff_loops(self, source_block, symbols):
        """
        :param symbols: initial symbols of symbolic execution engine to be created
        :param source_block: head of the graph to be deobfuscated
        :return: loc_key of the new head block
        """
        symb_exec = SymbolicExecutionEngine(self.ir_arch)
        flat_block = self.flat_loops.get_block(source_block.loc_key, symb_exec, None)
        # maps flattening blocks to their respective loc_keys
        new_head = LocKey(0)
        flat_block_to_loc_key = {flat_block: new_head}
        todo = [FlattenState(flat_block, symbols)]
        counter = {}
        while todo:
            state = todo.pop()
            block_loc_key = state.flat_block.block_loc_key
            self.relevant_nodes.add(block_loc_key)
            counter[block_loc_key] = counter.get(block_loc_key, 0) + 1
            logger.debug("Processing block at 0x%x as %s; in all affected: %d; loops_id: %s; the jtc_vars are:" %
                         (self.asmcfg.loc_db.get_location_offset(block_loc_key) or 0xBAD, str(block_loc_key),
                          block_loc_key in self.all_affected_lines,
                          self.flat_loops[block_loc_key].loc_key))
            if counter[block_loc_key] > 500:
                raise Exception("Couldn't deobfuscate cff loop, either fell into an infinite loop or processing very "
                                "big function")
            symb_exec.set_state(state.symbols)
            # evaluate all affected lines
            self._eval_updt_lines(symb_exec, block_loc_key)
            for flat_block in self._insert_flat_block(state.flat_block, symb_exec, flat_block_to_loc_key):
                todo.append(FlattenState(flat_block, symb_exec.get_state()))
        return new_head

    def _eval_updt_lines(self, symb_exec, loc_key):
        """Symbolically evaluate all affected assignment lines of the given block."""
        logger.debug("[DBG} block to eval: %s" % self.ircfg.blocks[loc_key])
        if loc_key not in self.all_affected_lines:
            return
        logger.debug("[DBG} lines to eval: %s" % str(self.all_affected_lines[loc_key]))
        for line_nb in self.all_affected_lines[loc_key]:
            assign_blk = self.ircfg.blocks[loc_key].assignblks[line_nb]
            symb_exec.eval_updt_assignblk(assign_blk)
223 | symb_exec.eval_updt_assignblk(assign_blk) 224 | -------------------------------------------------------------------------------- /stadeo/cff/cff_strategies.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 -*- 2 | # 3 | # Copyright (c) 2020 ESET spol. s r.o. 4 | # Author: Vladislav Hrčka 5 | # See LICENSE file for redistribution. 6 | 7 | import logging 8 | from pprint import pformat 9 | from unittest import mock 10 | 11 | import rpyc 12 | from miasm.analysis.machine import Machine 13 | from miasm.expression.expression import LocKey 14 | 15 | from stadeo.cff.cff_recognizer import CFFRecognizer, ConfirmedMergeFunc 16 | from stadeo.cff.cff_solver import CFFSolver 17 | from stadeo.utils.extended_asmcfg import write_patches_to_file 18 | from collections.abc import Iterable 19 | 20 | 21 | logger = logging.getLogger('CFFstrategies') 22 | logger.setLevel(logging.WARNING) 23 | 24 | 25 | class CFFStrategies(object): 26 | def __init__(self, arch): 27 | """ 28 | 29 | :param arch: Either 32 or 64 bit architecture 30 | """ 31 | self._pending = {} 32 | self._reached_funcs = set() 33 | if arch not in [32, 64]: 34 | raise ValueError 35 | self._machine = Machine("x86_" + str(arch)) 36 | 37 | def solve_loop(self, func_address, empty_address, context=None, ip='localhost', port=4455, conn=None, 38 | only_one=False): 39 | """ 40 | Deobfuscates single loop 41 | :param func_address: address of the function to be deobfuscated 42 | :param empty_address: address of the resulting deobfuscated function 43 | :param context: optional, dictionary assigning merging variable to its value using Miasm expressions 44 | :param ip: optional, IP of the computer running rpyc server in IDA 45 | :param port: optional, port of the computer running rpyc server in IDA 46 | :param conn: optional, already estabilished connection to running rpyc server in IDA 47 | :param only_one: do not attempt to reveal more than 1 CFF loop 48 | :return: True on 
successful deobfuscation, otherwise False 49 | """ 50 | close_conn = False 51 | if not conn: 52 | close_conn = True 53 | conn = rpyc.classic.connect(ip, port) 54 | 55 | with mock.patch("builtins.open", conn.builtins.open): 56 | if context is None: 57 | context = {} 58 | 59 | file_path = conn.modules.idaapi.get_input_file_path() 60 | recognizer = CFFRecognizer(file_path, func_address, self._machine, conn) 61 | try: 62 | recognizer.recognize(only_one, context) 63 | except: 64 | return False 65 | val = None 66 | if context: 67 | val = set(context.values()).pop() 68 | new_empty_address = self._solve_loop(empty_address, recognizer, file_path, conn, val) 69 | 70 | if close_conn: 71 | conn.close() 72 | 73 | return new_empty_address == empty_address 74 | 75 | def _solve_loop(self, empty_address, recognizer, file_path, conn=None, val=None): 76 | func = recognizer.asmcfg 77 | deflattener = CFFSolver(recognizer) 78 | new_head = deflattener.process(self._pending, val, self._reached_funcs) 79 | if val: 80 | val = int(val) 81 | if not new_head: 82 | local_mapping = "skipping 0x%08x with val %s: %s\n" % (func.func_addr, recognizer.merging_var, 83 | hex(val or 0x0BADF00D)) 84 | print("%s" % local_mapping, end="") 85 | return empty_address 86 | local_mapping = "0x%08x -> 0x%08x with val %s: %s\n" % (func.func_addr, empty_address, recognizer.merging_var, 87 | hex(val or 0x0BADF00D)) 88 | print("mapping: %s" % local_mapping, end="") 89 | deflattener.out_asmcfg.loc_db.set_location_offset(LocKey(0), empty_address, True) 90 | new_addr = write_patches_to_file(deflattener.out_asmcfg, func.exectbl, empty_address, file_path, func.mode, 91 | 2 ** 64 - 1, new_head) 92 | if conn: 93 | conn.modules.idaapi.reload_file(conn.modules.idaapi.get_input_file_path(), 0) 94 | conn.modules.ida_funcs.add_func(new_addr) 95 | return new_addr 96 | 97 | def process_all(self, empty_address, ip='localhost', port=4455, conn=None): 98 | """ 99 | Tries to deobfuscate all functions recognized by IDA 100 | :param 
empty_address: address where to put all the deobfuscated functions 101 | :param ip: optional, IP of the computer running rpyc server in IDA 102 | :param port: optional, port of the computer running rpyc server in IDA 103 | :param conn: optional, already estabilished connection to running rpyc server in IDA 104 | :return: dictionary assigning each processed function address either to None in case of failure or to the 105 | respective @ConfirmedMergeFunc instance 106 | """ 107 | close_conn = False 108 | if not conn: 109 | close_conn = True 110 | conn = rpyc.classic.connect(ip, port) 111 | 112 | recognized_funcs = {} 113 | with mock.patch("builtins.open", conn.builtins.open): 114 | file_path = conn.modules.idaapi.get_input_file_path() 115 | for func_addr in conn.modules.idautils.Functions(): 116 | recognizer = CFFRecognizer(file_path, func_addr, self._machine, conn) 117 | try: 118 | recognizer.recognize(True) 119 | except: 120 | recognized_funcs[func_addr] = None 121 | continue 122 | recognized_funcs[func_addr] = ConfirmedMergeFunc(recognizer, empty_address) 123 | empty_address = self._solve_loop(empty_address, recognizer, file_path, conn) 124 | if recognized_funcs[func_addr].vals == empty_address: 125 | recognized_funcs[func_addr] = None 126 | recognizer.clear_cache() 127 | 128 | if close_conn: 129 | conn.close() 130 | 131 | return recognized_funcs 132 | 133 | @staticmethod 134 | def _clear_cache(recognized_funcs): 135 | for merge_func in recognized_funcs.values(): 136 | if not merge_func: 137 | continue 138 | merge_func.recognizer.clear_cache() 139 | 140 | def process_merging(self, func_addresses, empty_address, ip='localhost', port=4455, conn=None, 141 | recognized_funcs=None): 142 | """ 143 | Tries to discover and deobfuscate reachable functions 144 | :param func_addresses: initial function address or addresses 145 | :param empty_address: address where to put all the deobfuscated functions 146 | :param ip: optional, IP of the computer running rpyc server in IDA 
    def process_merging(self, func_addresses, empty_address, ip='localhost', port=4455, conn=None,
                        recognized_funcs=None):
        """
        Tries to discover and deobfuscate reachable functions
        :param func_addresses: initial function address or addresses
        :param empty_address: address where to put all the deobfuscated functions
        :param ip: optional, IP of the computer running rpyc server in IDA
        :param port: optional, port of the computer running rpyc server in IDA
        :param conn: optional, already estabilished connection to running rpyc server in IDA
        :param recognized_funcs: optional, dictionary assigning each already processed function address either to None
        in case of failure or to the respective @ConfirmedMergeFunc instance; used after repeated execution with
        different initial address
        :return: dictionary assigning each processed function address either to None in case of failure or to the
        respective @ConfirmedMergeFunc instance
        """
        if recognized_funcs is None:
            recognized_funcs = {}  # ConfirmedMergeFunc
        if not isinstance(func_addresses, Iterable):
            # in case of only one function
            func_addresses = {func_addresses}
        self._reached_funcs.update(func_addresses)

        processed_blocks = 0
        close_conn = False
        if not conn:
            close_conn = True
            conn = rpyc.classic.connect(ip, port)

        # route open() through the remote IDA so the input binary is read there
        with mock.patch("builtins.open", conn.builtins.open):
            file_path = conn.modules.idaapi.get_input_file_path()
            # worklist loop: _solve_loop may add newly reached callees/pending vals
            while self._reached_funcs:
                func_addr = self._reached_funcs.pop()
                logger.debug("Processing func at 0x%x" % func_addr)
                logger.debug("Reached funcs: %s" % {hex(i) for i in self._reached_funcs})
                logger.debug("Pending func_addr: %s" % (pformat(self._pending[func_addr]) if func_addr in self._pending
                                                        else "None"))

                if processed_blocks > 800:
                    # clear cached recognizers to avoid running out of memory
                    processed_blocks = 0
                    self._clear_cache(recognized_funcs)

                if func_addr not in recognized_funcs:
                    # recognize a new func for the first time
                    ida_func = conn.modules.idaapi.get_func(func_addr)
                    if ida_func and ida_func.flags & conn.modules.idaapi.FUNC_LIB:
                        local_mapping = "skipping 0x%08x (library func)\n" % func_addr
                        print("%s" % local_mapping, end="")
                        recognized_funcs[func_addr] = None
                        if func_addr in self._pending:
                            del self._pending[func_addr]
                        continue
                    recognizer = CFFRecognizer(file_path, func_addr, self._machine, conn)
                    merging_var_candidates = self._pending.get(func_addr, {})
                    try:
                        recognizer.recognize(False, merging_var_candidates)
                        recognized_funcs[func_addr] = ConfirmedMergeFunc(recognizer, {})
                        processed_blocks += len(recognizer.asmcfg.blocks)
                    except (TypeError, OSError, RuntimeError):
                        recognized_funcs[func_addr] = None
                        logger.warning("Skipping exotic func at 0x%x" % func_addr)
                        if func_addr in self._pending:
                            del self._pending[func_addr]
                        continue
                elif not self._pending.get(func_addr, None) or \
                        not (recognized_funcs[func_addr] and recognized_funcs[func_addr].recognizer.merging_var):
                    # already processed and nothing pending (or it previously
                    # failed / has no merging var) — nothing more to do
                    if func_addr in self._pending:
                        del self._pending[func_addr]
                    continue

                recognizer = recognized_funcs[func_addr].recognizer
                merging_var = recognizer.merging_var

                if not recognizer.asmcfg:
                    # cache has been cleared
                    recognizer.recognize()
                    processed_blocks += len(recognizer.asmcfg.blocks)

                if not merging_var:
                    # just added non merging; even non-cff funcs can reach this point since they don't have any merging
                    # var, they are to be considered as processed from now on
                    recognized_funcs[func_addr].vals = empty_address
                    empty_address = self._solve_loop(empty_address, recognizer, file_path, conn)
                    if recognized_funcs[func_addr].vals == empty_address:
                        recognized_funcs[func_addr].vals = None
                    continue
                if merging_var not in self._pending[func_addr]:
                    logger.warning("Function 0x%x isn't merging, ignore its previous results" % func_addr)
                    recognizer.flat_loops.loops.pop(0)  # the first loop is merging
                    recognizer.merging_var = None
                    empty_address = self._solve_loop(empty_address, recognizer, file_path, conn)
                    del self._pending[func_addr]
                    continue
                current_vals = self._pending[func_addr][merging_var]
                del self._pending[func_addr]
                for val in current_vals:
                    if val in recognized_funcs[func_addr].vals:
                        continue
                    recognized_funcs[func_addr].vals[val] = empty_address
                    empty_address = self._solve_loop(empty_address, recognizer, file_path, conn, val)
                    if recognized_funcs[func_addr].vals[val] == empty_address:
                        # failed
                        # NOTE(review): on failure the whole entry becomes None;
                        # if another val follows in this loop, the next iteration
                        # dereferences recognized_funcs[func_addr].vals on None and
                        # raises AttributeError — confirm multiple pending vals
                        # cannot follow a failure here
                        recognized_funcs[func_addr] = None

        if close_conn:
            conn.close()

        return recognized_funcs
7 | 8 | import logging 9 | import re 10 | from unittest import mock 11 | 12 | import rpyc 13 | from miasm.analysis.dse import DSEEngine 14 | from miasm.expression.simplifications import expr_simp 15 | from miasm.analysis.sandbox import Sandbox_Win_x86_32, Sandbox_Win_x86_64 16 | from sortedcontainers import SortedList 17 | 18 | from stadeo.string.string_symb_stubs import * 19 | from stadeo.utils.extended_asmcfg import ExtendedAsmCFG 20 | 21 | logger = logging.getLogger('StringRevealer') 22 | logger.setLevel(logging.WARNING) 23 | 24 | 25 | class StringRevealer(object): 26 | # patterns borrowed from https://github.com/fireeye/flare-floss/blob/master/floss/strings.py 27 | ASCII_BYTE = br"!\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[" \ 28 | br"\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t " 29 | ASCII_RE = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 3)) 30 | UNICODE_RE = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 3)) 31 | 32 | def __init__(self, attrib): 33 | self.sandbox = Sandbox_Win_x86_32 34 | if attrib == 64: 35 | raise Exception("Not supported") 36 | # 64 bit string revealer doesn't work due to https://github.com/cea-sec/miasm/issues/647 37 | self.sandbox = Sandbox_Win_x86_64 38 | parser = self.sandbox.parser() 39 | self.options = parser.parse_args() 40 | self.options.use_windows_structs = True 41 | self.options.usesegm = True 42 | self.options.mimic_env = True 43 | self.options.jitter = "llvm" 44 | self.sb = None 45 | 46 | @staticmethod 47 | def _exec_callback(dse, func, occurances, jitter, strings, get_strings_from_dse): 48 | occurances[jitter.pc] = occurances.get(jitter.pc, 0) + 1 49 | if occurances[jitter.pc] > 500: 50 | return False 51 | # extracts strings more often, but is naturally slower, not needed for Stantinko, one could use it elsewhere: 52 | # if func.loc_db.get_offset_location(jitter.pc): 53 | # # snap = dse.take_snapshot() 54 | # # dse.update_state_from_concrete() 55 | # strings.update(get_strings_from_dse(dse)) 56 | # # 
dse.restore_snapshot(snap) 57 | dse.callback(jitter) 58 | return True 59 | 60 | def process_all(self, ip='localhost', port=4455, conn=None): 61 | """ 62 | Reveals strings in all functions recognized by IDA 63 | :param ip: optional, IP of the computer running rpyc server in IDA 64 | :param port: optional, port of the computer running rpyc server in IDA 65 | :param conn: optional, already estabilished connection to running rpyc server in IDA 66 | :return: dictionary mapping each processed function address to the respective revealed strings 67 | """ 68 | close_conn = False 69 | if not conn: 70 | close_conn = True 71 | conn = rpyc.classic.connect(ip, port) 72 | 73 | strings = {} 74 | file_path = conn.modules.idaapi.get_input_file_path() 75 | with mock.patch("builtins.open", conn.builtins.open): 76 | self.sb = self.sandbox(file_path, self.options, globals()) 77 | # put some mem above initial SP 78 | sp = self.sb.jitter.arch.getsp(self.sb.jitter.attrib) 79 | setattr(self.sb.jitter.cpu, sp.name, self.sb.jitter.stack_base + self.sb.jitter.stack_size - 0x8 * 80) 80 | for func_addr in conn.modules.idautils.Functions(): 81 | func = ExtendedAsmCFG(file_path) 82 | func.disassemble(func_addr, conn) 83 | strings[func_addr] = self._process_func(func) 84 | 85 | if close_conn: 86 | conn.close() 87 | 88 | return strings 89 | 90 | def process_funcs(self, func_addresses, ip='localhost', port=4455, conn=None): 91 | """ 92 | Reveals strings in all supplied function addresses 93 | :param func_addresses: function addresses to process 94 | :param ip: optional, IP of the computer running rpyc server in IDA 95 | :param port: optional, port of the computer running rpyc server in IDA 96 | :param conn: optional, already estabilished connection to running rpyc server in IDA 97 | :return: dictionary mapping each processed function address to the respective revealed strings 98 | """ 99 | close_conn = False 100 | if not conn: 101 | close_conn = True 102 | conn = rpyc.classic.connect(ip, port) 103 | 
104 | strings = {} 105 | file_path = conn.modules.idaapi.get_input_file_path() 106 | with mock.patch("builtins.open", conn.builtins.open): 107 | self.sb = self.sandbox(file_path, self.options, globals()) 108 | # put some mem above initial SP 109 | sp = self.sb.jitter.arch.getsp(self.sb.jitter.attrib) 110 | setattr(self.sb.jitter.cpu, sp.name, self.sb.jitter.stack_base + self.sb.jitter.stack_size - 0x8 * 80) 111 | 112 | # self.sb.jitter.jit.log_regs = True 113 | # self.sb.jitter.jit.log_mn = True 114 | for func_address in func_addresses: 115 | with mock.patch("builtins.open", conn.builtins.open): 116 | func = ExtendedAsmCFG(file_path) 117 | func.disassemble(func_address, conn) 118 | strings[func_address] = self._process_func(func) 119 | 120 | if close_conn: 121 | conn.close() 122 | 123 | return strings 124 | 125 | @staticmethod 126 | def _wipe_dse_errors(dse): 127 | dse.symb.reset_modified() 128 | dse.jitter.vm.set_exception(0) 129 | dse.jitter.cpu.set_exception(0) 130 | dse.jitter.bs._atomic_mode = False 131 | 132 | def _process_func(self, func): 133 | dse = DSEEngine(self.sb.machine) 134 | dse.attach(self.sb.jitter) # needs to be attached before setting exec_cb to overwrite it with ours 135 | bak_snap = dse.take_snapshot() 136 | dse.add_lib_handler(self.sb.libs, globals()) 137 | occurances = {} 138 | addr = func.loc_db.get_location_offset(LocKey(0)) 139 | asmb = func.loc_key_to_block(LocKey(0)) 140 | strings = set() 141 | self.sb.jitter.exec_cb = lambda x: self._exec_callback(dse, func, occurances, x, strings, 142 | self._get_strings_from_dse) 143 | self.sb.jitter.init_run(addr) 144 | try: 145 | self.sb.jitter.run_until(asmb.lines[-1].offset) 146 | except: 147 | pass 148 | dse.update_state_from_concrete() 149 | initial_snap = dse.take_snapshot() # prepared initial context 150 | strings.update(self._get_strings_from_dse(dse)) 151 | dse.restore_snapshot(initial_snap) 152 | for loc_key in func.walk_breadth_first_forward(LocKey(0)): 153 | addr = 
func.loc_db.get_location_offset(loc_key) 154 | if not addr: 155 | continue 156 | occurances.clear() 157 | self._emul_address(dse, addr) 158 | dse.update_state_from_concrete() 159 | strings.update(self._get_strings_from_dse(dse)) 160 | dse.restore_snapshot(initial_snap) 161 | 162 | dse.restore_snapshot(bak_snap) 163 | strings = self._get_top_level_strings(strings) 164 | return strings 165 | 166 | def _emul_address(self, dse, addr): 167 | self.sb.jitter.init_run(addr) 168 | crashed = set() 169 | while 1: 170 | self._wipe_dse_errors(dse) 171 | try: 172 | self.sb.jitter.continue_run() 173 | except Exception as e: 174 | if isinstance(e, RuntimeError) and \ 175 | e.args and e.args[0] == "Cannot find address" and \ 176 | self.sb.jitter.pc not in crashed: 177 | instr = self.sb.jitter.jit.mdis.dis_instr(self.sb.jitter.pc) 178 | crashed.add(self.sb.jitter.pc) 179 | if instr: 180 | next_addr = self.sb.jitter.pc + instr.l 181 | self.sb.jitter.init_run(next_addr) 182 | continue 183 | break 184 | 185 | @staticmethod 186 | def _get_top_level_strings(strings): 187 | new_strings = set() 188 | while strings: 189 | string = strings.pop() 190 | for tmp_string in strings | new_strings: 191 | if string in tmp_string: 192 | break 193 | else: 194 | new_strings.add(string) 195 | return new_strings 196 | 197 | def _get_strings_from_dse(self, dse): 198 | modified_mem = SortedList(key=lambda x: int(x[0])) 199 | for key, val in dse.symb.modified(ids=False, mems=True): 200 | try: 201 | val = dse.eval_expr(key) 202 | key = dse.eval_expr(key.ptr) 203 | except RuntimeError: 204 | continue 205 | if not key.is_int() or not val.is_int(): 206 | continue 207 | modified_mem.add((key, val)) 208 | following_address = None 209 | current_sequence = b"" 210 | strings = set() 211 | for address, value in modified_mem: 212 | if following_address == address: 213 | current_sequence += int(value).to_bytes(value.size // 8, "little") 214 | else: 215 | self._update_strings_from_sequence(current_sequence, strings) 216 
def get_win_str_data_w(jitter, ad_str, max_char=None):
    """
    Read the raw bytes of a NUL-terminated UTF-16LE string from emulated memory
    (terminator excluded).

    :param jitter: miasm jitter whose vm provides get_mem
    :param ad_str: address of the wide string
    :param max_char: optional upper bound on the amount read
    :return: bytes of the string without the two-byte terminator
    """
    # NOTE(review): max_char is compared against a byte count here (the counter
    # grows by 2 per character), while get_win_str_data_a treats it as a
    # character count — confirm the byte semantics is intended.
    length = 0
    cursor = ad_str
    while (max_char is None or length < max_char) and jitter.vm.get_mem(cursor, 2) != b"\x00\x00":
        cursor += 2
        length += 2
    return jitter.vm.get_mem(ad_str, length)
def kernel32_lstrcpy(dse, get_win_str_data, zero_pad):
    """
    Symbolic stub shared by the lstrcpyA/lstrcpyW handlers: copies the source
    string (second stdcall argument) into the destination buffer and sets the
    return register to the destination pointer.

    :param dse: DSEEngine instance positioned at the API call
    :param get_win_str_data: reader returning the raw, unterminated string bytes
    :param zero_pad: terminator size in bytes (1 for ANSI, 2 for wide)
    """
    arg_ptr2 = dse.jitter.get_arg_n_stdcall(2)
    s2 = get_win_str_data(dse.jitter, arg_ptr2)
    # total width in bits: string bytes plus the zero terminator
    real_len = len(s2) * 8 + zero_pad * 8
    # little-endian byte string -> integer value of the whole string
    value = int(hexlify(s2[::-1]), 16)
    # string bits composed with the zero terminator on top
    rhs = ExprCompose(ExprInt(value, len(s2) * 8), ExprInt(0, zero_pad * 8))
    if dse.jitter.ir_arch.attrib == 32:
        # 32-bit stdcall: destination pointer is the first stack argument
        stack_ptr = ExprMem(dse.jitter.ir_arch.sp + ExprInt(4, 32), dse.jitter.ir_arch.attrib)
        evaluated_stack_ptr = dse.eval_expr(stack_ptr)
        lhs = ExprMem(evaluated_stack_ptr, real_len)
        upd = {lhs: rhs,
               ExprId("EAX", 32): lhs.ptr}
    else:
        # 64-bit: destination pointer is taken from RCX, result goes to RAX
        lhs = ExprMem(ExprId("RCX", 64), real_len)
        upd = {lhs: rhs,
               ExprId("RAX", 64): lhs.ptr}
    dse.update_state(upd)
    # apply ret effects
    # NOTE(review): sp is advanced by 12 in both branches — presumably return
    # address plus two 4-byte stdcall arguments on 32-bit; confirm this offset
    # is also correct for the 64-bit path
    rhs = dse.eval_expr(dse.jitter.ir_arch.sp + ExprInt(12, dse.jitter.ir_arch.sp.size))
    dse.update_state({dse.jitter.ir_arch.sp: rhs})
def is_bad_expr(expr):
    """
    Return True for identifier expressions that must not be tracked: the
    instruction pointers, the x86 status flags and miasm's IRDst pseudo-register.

    :param expr: miasm Expr instance
    :return: bool
    """
    if not expr.is_id():
        return False
    return expr.name in ("RIP", "EIP", "zf", "nf", "pf", "of", "cf", "af", "df", "IRDst")
def custom_get_range(self):
    """
    Returns the offset hull of an AsmBlock: (first offset, end of last line).
    Falls back to (0, 0) when the block is empty or any offset/length is unset.
    """
    try:
        first, last = self.lines[0], self.lines[-1]
        span = (first.offset, last.offset + last.l)
    except (IndexError, TypeError):
        # empty block, or offset/length is None (manually injected lines)
        return 0, 0
    if None in span:
        return 0, 0
    return span
def custom_split_flow(self):
    """
    Replacement for instruction_x86.splitflow: additionally treats LOOP*, INT*,
    SYS*, CMOV*, SBB* and CALL as flow-splitting instructions.

    :return: True when the instruction splits control flow
    """
    if self.name in conditional_branch:
        return True
    if self.name in unconditional_branch:
        return False
    # str.startswith accepts a tuple of prefixes — one call instead of the
    # original five chained startswith tests
    if self.name.startswith(("LOOP", "INT", "SYS", "CMOV", "SBB")):
        return True
    return self.name == "CALL"
def custom_add_asmblock_to_ircfg(self, block, ircfg, gen_pc_updt=False, asmcfg=None):
    """
    Add a native block to the current IR
    @block: native assembly block
    @ircfg: IRCFG instance
    @gen_pc_updt: insert PC update effects between instructions

    Extended version: when a block has to be split mid-way (see
    custom_get_next_loc_key, which records self.split_offset), the tail is cut
    off into a new AsmBlock queued on self.new_blocks (created by
    custom_new_ircfg_from_asmcfg) and translated in a later pass.
    """

    loc_key = block.loc_key
    ir_blocks_all = []

    assignments = []
    # current block is exposed so custom_get_next_loc_key can inspect its bto
    self.asm_block = block
    for instr in block.lines:
        if loc_key is None:
            # previous instruction split the flow — start a fresh IR block
            assignments = []
            loc_key = self.get_loc_key_for_instr(instr)
        split = self.add_instr_to_current_state(
            instr, block, assignments,
            ir_blocks_all, gen_pc_updt
        )
        if split:
            ir_blocks_all.append(IRBlock(loc_key, assignments))
            loc_key = None
            if len(assignments) != len(block.lines) and asmcfg:
                # not all lines consumed: split the asm block at split_offset
                # and defer the remainder to the caller's worklist
                new_block = block.split(asmcfg.loc_db, self.split_offset)
                self.new_blocks.append(new_block)
                break
            assignments = []
    if loc_key is not None:
        ir_blocks_all.append(IRBlock(loc_key, assignments))

    new_ir_blocks_all = self.post_add_asmblock_to_ircfg(block, ircfg, ir_blocks_all)
    for irblock in new_ir_blocks_all:
        ircfg.add_irblock(irblock)
    return new_ir_blocks_all
def _make_x86_instruction(name, mode, args):
    """Build an instruction_x86 with default additional_info (prefix group 1 cleared).

    Shared helper extracted from the four factories below, which all repeated
    the same three-line additional_info boilerplate.
    """
    inst = instruction_x86(name, mode, args)
    inst.additional_info = additional_info()
    inst.additional_info.g1.value = 0
    return inst


def create_mov_instruction(mode, dst, src):
    """
    :param mode: 32 or 64, depends on architecture
    :param dst: destination Expr
    :param src: source Expr
    :return: created MOV instruction
    """
    return _make_x86_instruction("MOV", mode, [dst, src])


def create_cond_branch_instruction(mode, name, target):
    """
    :param mode: 32 or 64, depends on architecture
    :param name: mnemonic of the conditional branch (e.g. "JZ")
    :param target: Expr to jump to
    :return: created instruction
    """
    return _make_x86_instruction(name, mode, [target])


def create_cmp_j_instructions(mode, expr, val, target, kind):
    """
    :param mode: 32 or 64, depends on architecture
    :param expr: Expr compared against @val
    :param val: comparison value Expr
    :param target: Expr to jump to
    :param kind: jump mnemonic (e.g. "JZ")
    :return: list [CMP instruction, conditional jump instruction]
    """
    return [_make_x86_instruction("CMP", mode, [expr, val]),
            _make_x86_instruction(kind, mode, [target])]


def create_nop(mode):
    """
    :param mode: 32 or 64, depends on architecture
    :return: created NOP instruction (used to fill otherwise empty blocks)
    """
    return _make_x86_instruction("NOP", mode, [])
def remove_redundant_and_unpin_blocks(asmcfg, head, mode, unpin=True):
    """
    To unpin a block means to unset associated address. New one can be calculated then.
    Drops every block unreachable from @head; for reachable blocks it rewrites
    RIP-relative operands to stay correct after relocation and clears their
    pinned offsets so the assembler can place them anew.
    :return:
    """
    reachable_loc_keys = list(asmcfg.reachable_sons(head))
    blocks_to_be_removed = []
    rip = ExprId("RIP", 64)
    # placeholder location standing in for the block's future (unknown) address
    new_next_addr_card = ExprLoc(asmcfg.loc_db.get_or_create_name_location('_'), 64)
    for block in asmcfg.blocks:
        if block.loc_key not in reachable_loc_keys:
            blocks_to_be_removed.append(block)
        elif unpin:
            for instr in block.lines:
                for ind in range(len(instr.args)):
                    if rip in instr.args[ind]:
                        # RIP-relative operand: re-express it relative to the
                        # yet-unknown new end-of-instruction address so the
                        # displacement is fixed up at assembly time
                        next_addr = ExprInt(instr.offset + instr.l, 64)
                        fix_dict = {rip: rip + next_addr - new_next_addr_card}
                        instr.args[ind] = instr.args[ind].replace_expr(fix_dict)

        # blocks queued for removal also pass through here; harmless since
        # they are deleted below
        if not block.lines:
            block.lines = [create_nop(mode)]
        if unpin and asmcfg.loc_db.get_location_offset(block.loc_key):
            asmcfg.loc_db.unset_location_offset(block.loc_key)

    for block in blocks_to_be_removed:
        asmcfg.del_block(block)
def fix_multiple_next_constraints(asmcfg, mode):
    """
    When there are multiple blocks proceeding another block with no jump, add one.
    Each extra fall-through predecessor is rerouted through a fresh block holding
    a single JMP to the original destination.
    :return:
    """
    # Hoisted out of the node loop (fix): this dict does not depend on loc_key
    # and edges2constraint is not modified inside the loop, so recomputing it
    # for every node made the function accidentally O(V*E).
    next_edges = {edge: constraint for edge, constraint in asmcfg.edges2constraint.items() if
                  constraint == AsmConstraint.c_next}
    blocks_to_be_added = []
    for loc_key in asmcfg.nodes():
        pred_next = list(ploc_key for (ploc_key, dloc_key) in next_edges if dloc_key == loc_key)
        if len(pred_next) > 1:
            # keep the first fall-through, detour every other one via a JMP block
            for index in range(1, len(pred_next)):
                inst = create_jump_instruction(mode, ExprLoc(loc_key, mode))

                new_block_loc_key = asmcfg.loc_db.add_location()
                new_block = AsmBlock(new_block_loc_key)
                new_block.addline(inst)
                new_block.bto = {AsmConstraintTo(loc_key)}

                asmcfg.loc_key_to_block(pred_next[index]).bto = {AsmConstraintNext(new_block_loc_key)}
                blocks_to_be_added.append(new_block)
    # one while might be sufficient, depends on type of _nodes
    for block in blocks_to_be_added:
        asmcfg.add_block(block)
next_empty_address 333 | 334 | with open(out_file_name + ".bak", 'wb') as bak: 335 | with open(out_file_name, 'rb') as fl: 336 | bak.write(fl.read()) 337 | with open(out_file_name, 'wb') as fl: 338 | fl.write(bytes(exectbl)) 339 | return last_empty_address 340 | 341 | 342 | class MySymbolicExecutionEngine(SymbolicExecutionEngine): 343 | def __init__(self, pool_bin, jtc_var, *args, **kwargs): 344 | super(MySymbolicExecutionEngine, self).__init__(*args, **kwargs) 345 | self.pool_bin = pool_bin 346 | self.jtc_var = jtc_var 347 | 348 | def mem_read(self, expr_mem): 349 | """Memory read wrapper for symbolic execution 350 | @expr_mem: ExprMem""" 351 | if not expr_mem.ptr.is_int() or self.jtc_var == expr_mem: 352 | return super(MySymbolicExecutionEngine, self).mem_read(expr_mem) 353 | addr = expr_mem.ptr.arg.arg 354 | size = expr_mem.size // 8 355 | value = self.pool_bin.getbytes(addr, size) 356 | final = ExprInt(int(hexlify(value[::-1]), 16), expr_mem.size) 357 | return final 358 | 359 | 360 | class JTCVariableDependencyGraph(DependencyGraph): 361 | def __init__(self, loc_key, *args, **kwargs): 362 | super(JTCVariableDependencyGraph, self).__init__(*args, **kwargs) 363 | self.jtc_var = None 364 | self.done = False 365 | self.loc_key = loc_key 366 | 367 | def _track_exprs(self, state, assignblk, line_nb): 368 | """Track pending expression in an assignblock""" 369 | if self.done: 370 | return 371 | future_pending = {} 372 | node_resolved = set() 373 | for dst, src in assignblk.items(): 374 | # Only track pending 375 | if dst not in state.pending: 376 | continue 377 | # Track IRDst in implicit mode only 378 | if dst == self._ircfg.IRDst and not self._implicit: 379 | continue 380 | assert dst not in node_resolved 381 | node_resolved.add(dst) 382 | dependencies = self._follow_apply_cb(src) 383 | 384 | state.link_element(dst, line_nb) 385 | state.link_dependencies(dst, line_nb, 386 | dependencies, future_pending) 387 | 388 | # Update pending nodes 389 | if not self.jtc_var and 
class ExtendedAsmCFG(AsmCFG):
    """AsmCFG subclass that recovers obfuscated control flow during
    disassembly: rewrites CMOV/SBB/ADC-based conditionals into explicit
    branches and reconstructs jump tables into compare-and-jump chains."""

    def __init__(self, file_name, conn=None, cont=None, exectbl=None, *args, **kwargs):
        """
        :param file_name: path of the binary to analyze
        :param conn: optional rpyc connection; when set, the file is opened remotely
        :param cont: optional pre-parsed miasm Container (skips re-parsing)
        :param exectbl: optional pre-parsed PE image (skips re-parsing)
        """
        super(ExtendedAsmCFG, self).__init__(loc_db=LocationDB(), *args, **kwargs)
        self.file_name = file_name
        if not cont:
            if conn:
                stream = conn.builtins.open(file_name, 'rb')
            else:
                stream = open(file_name, 'rb')
            cont = Container.from_stream(stream)
        self.cont = cont
        # arch string ends with the bitness, e.g. "x86_64" -> 64
        self.mode = int(cont.arch[-2:])
        self.address_size = self.mode // 8
        self.pck = pck32
        self.upck = upck32
        self.machine = Machine(cont.arch)
        self.disassembler = self.machine.dis_engine
        if self.mode == 64:
            self.pck = pck64
            self.upck = upck64
        self._exectbl = exectbl
        if not exectbl:
            self._exectbl = pe_init.PE(cont.executable)
        self._dis_engine = None
        self.func_addr = None
        self.jmp_table_loc_keys = set()

    def _process_cmov(self, cur_bloc, last_instruction):
        """Rewrite a trailing CMOVcc into an explicit Jcc + MOV block pair."""
        assignment_block = AsmBlock(self.loc_db.add_location())
        cond_block = AsmBlock(self.loc_db.add_location())
        dst = last_instruction.args[0]
        src = last_instruction.args[1]
        assignment_block.lines.append(create_mov_instruction(self.mode, dst, src))
        branch_target = next(iter(cur_bloc.bto)).loc_key
        assignment_block.lines.append(create_jump_instruction(self.mode, ExprLoc(branch_target, self.mode)))
        # CMOVcc -> Jcc with the same condition suffix
        branch_name = "J" + last_instruction.name[len("CMOV"):]
        cur_bloc.lines.pop()
        if not cur_bloc.lines:
            cur_bloc.lines = [create_nop(self.mode)]
        cond_block.lines.append(create_cond_branch_instruction(self.mode, branch_name,
                                                               ExprLoc(assignment_block.loc_key, self.mode)))
        assignment_block.bto = {AsmConstraintTo(branch_target)}
        cond_block.bto = {AsmConstraintNext(branch_target), AsmConstraintTo(assignment_block.loc_key)}
        cur_bloc.bto = {AsmConstraintNext(cond_block.loc_key)}
        self.add_block(assignment_block)
        self.add_block(cond_block)

    def _process_sbb(self, cur_bloc, last_instruction):
        """Rewrite a trailing 'SBB reg, reg' (reg := CF ? -1 : 0) into explicit
        conditional branches assigning -1 or 0."""
        assignment_block = AsmBlock(self.loc_db.add_location())
        cond_block = AsmBlock(self.loc_db.add_location())
        reg = last_instruction.args[0]
        assignment_block.lines.append(create_mov_instruction(self.mode, reg, ExprInt(-1, reg.size)))
        branch_target = next(iter(cur_bloc.bto)).loc_key
        assignment_block.lines.append(create_jump_instruction(self.mode, ExprLoc(branch_target, self.mode)))
        branch_name = "JB"  # JC is not implemented in miasm, using alias
        cur_bloc.lines.pop()
        pre_branch_block = AsmBlock(self.loc_db.add_location())
        pre_branch_block.lines = [create_mov_instruction(self.mode, reg, ExprInt(0, reg.size))]
        cond_block.lines.append(create_cond_branch_instruction(self.mode, branch_name,
                                                               ExprLoc(assignment_block.loc_key, self.mode)))
        if not cur_bloc.lines:
            cur_bloc.lines = [create_nop(self.mode)]
        assignment_block.bto = {AsmConstraintTo(branch_target)}
        cur_bloc.bto = {AsmConstraintNext(cond_block.loc_key)}
        cond_block.bto = {AsmConstraintNext(pre_branch_block.loc_key), AsmConstraintTo(assignment_block.loc_key)}
        pre_branch_block.bto = {AsmConstraintNext(branch_target)}
        self.add_block(assignment_block)
        self.add_block(cond_block)
        self.add_block(pre_branch_block)

    def _process_adc(self, cur_bloc, last_instruction):
        """Rewrite a trailing ADC-based conditional into explicit MOV + Jcc."""
        assignment_block = AsmBlock(self.loc_db.add_location())
        reg = last_instruction.args[0]
        assignment_block.lines.append(create_mov_instruction(self.mode, reg, ExprInt(-1, reg.size)))
        branch_target = next(iter(cur_bloc.bto)).loc_key
        assignment_block.lines.append(create_jump_instruction(self.mode, ExprLoc(branch_target, self.mode)))
        branch_name = "JB"  # JC is not implemented in miasm, using alias
        cur_bloc.lines.pop()
        cur_bloc.lines.append(create_mov_instruction(self.mode, reg, ExprInt(0, reg.size)))
        cur_bloc.lines.append(create_cond_branch_instruction(self.mode, branch_name,
                                                             ExprLoc(assignment_block.loc_key, self.mode)))
        self.add_block(assignment_block)
        assignment_block.bto = {AsmConstraintTo(branch_target)}
        cur_bloc.bto.add(AsmConstraintTo(assignment_block.loc_key))

    @staticmethod
    def _eliminate_jtc_var_slice_cb(expr, sizes, target):
        """Expression-visitor callback collecting the slice sizes applied to
        the jump-table control variable *target* into *sizes* (used to bound
        the SAT search space)."""
        if expr.is_compose():
            if expr.args[0].is_slice() and expr.args[0].arg.is_id() and expr.args[0].arg == target:
                size = expr.args[0].size
                sizes.add(size)
            if expr.args[0].is_id() and expr.args[0] == target:
                size = expr.size
                sizes.add(size)
        elif expr.is_slice() and expr.arg.is_id() and expr.arg == target:
            size = expr.size
            sizes.add(size)

    def _process_jmp_table(self, cur_bloc, mn, attrib, loc_db, pool_bin, offsets_to_dis):
        """Recover an indirect 'JMP [mem]/JMP reg' jump table: enumerate all
        destinations with z3 and replace the indirect jump by a chain of
        compare-and-jump blocks, feeding the targets back to the disassembler
        via *offsets_to_dis*."""
        # TODO add support for jump tables with "AND cntrl_var, range" boundary check; such jmp tables were present
        # only in library functions in Stantinko samples
        # add current block to the asmcfg to make it accessible in the ircfg edges, add_block is called anyway right
        # after this callback, it will notice that the block has been already added
        self.add_block(cur_bloc)
        dst_address = loc_db.get_location_offset(cur_bloc.loc_key)

        logger.info("Possible jump table addr: 0x%x" % dst_address)

        ira = get_ira(mn, attrib)

        ir_arch = ira(loc_db)

        ircfg = ir_arch.new_ircfg_from_asmcfg(self)

        # the previous blocks should have exactly 1 predecessor dictating range
        predecessors = self.predecessors(cur_bloc.loc_key)
        if len(predecessors) != 1:
            logger.info("Expected exactly one predecessor")
            return
        predecessor = ircfg.blocks[predecessors.pop()]

        # one asm block may have been split into several IR blocks; walk the IR
        # successors until we reach the block whose IRDst is the memory read
        irdst_block = ircfg.blocks[cur_bloc.loc_key]
        if len(irdst_block.assignblks) != len(cur_bloc.lines):
            processed = set()
            todo = {irdst_block.loc_key}
            while not irdst_block.dst.is_mem():
                loc_key = todo.pop()
                if loc_key in processed:
                    continue
                processed.add(loc_key)
                irdst_block = ircfg.blocks[loc_key]
                todo.update(ircfg.successors(loc_key))

        # we shouldn't stumble upon crashing segm and call operators even thought implicit is required to process
        # initial IRDst(mentioned operators cause crashes of the engine behind implicit) since we operate only on
        # the 2 crucial basic blocks. The predecessor contains range of the jump table, we use it to determine
        # constructs of the jump table and track back base code segment address assignment to target the msvc
        # compiler and x64 architecture, other compilers use directly RIP related addressing to get the address.

        # get real predecessor
        asm_block = self.loc_key_to_block(predecessor.loc_key)
        if len(predecessor.assignblks) != len(asm_block.lines):
            processed = set()
            todo = {predecessor.loc_key}
            while cur_bloc.loc_key not in ircfg.successors(predecessor.loc_key):
                loc_key = todo.pop()
                if loc_key in processed:
                    continue
                processed.add(loc_key)
                predecessor = ircfg.blocks[loc_key]
                todo.update(ircfg.successors(loc_key))

        # get jump_table_control_variable from predecessor
        dg = DependencyGraph(ircfg, implicit=True, apply_simp=True, follow_mem=True, follow_call=False)
        jtcdg = JTCVariableDependencyGraph(predecessor.loc_key,
                                           ircfg, implicit=True, apply_simp=True, follow_mem=False,
                                           follow_call=False)

        dependency_result_iter = iter(jtcdg.get(irdst_block.loc_key, {ircfg.IRDst}, len(predecessor.assignblks),
                                                {predecessor.loc_key}))
        solution_predecessor = next(dependency_result_iter)
        # jump table control variable
        jtc_var = jtcdg.jtc_var
        if not jtc_var:
            logger.info("couldn't determine single jump table control variable")
            return
        # get symbolic execution engine to be used in both predecessor and jmp table block
        symb_exec_both = MySymbolicExecutionEngine(pool_bin, jtc_var, ir_arch)
        try:
            # symbolically evaluate lines influencing IRDst of the predecessor leading to jtc_var
            for line_nb in sorted({node.line_nb for node in solution_predecessor.relevant_nodes
                                   if node.loc_key == predecessor.loc_key}):
                assign_blk = predecessor.assignblks[line_nb]
                symb_exec_both.eval_updt_assignblk(assign_blk)
        except (KeyError, TypeError):
            logger.error(
                "Couldn't symbolically eval predecessor of 0x%x" % loc_db.get_location_offset(cur_bloc.loc_key))
            # stantinko contains illegal unreachable dereferences prior jmp tables, such as
            # xor eax, eax; movsx eax, byte ptr [eax]
            return
        # get symbolic execution engine supporting binary memory dereference
        # BUGFIX: jtc_var was missing from the argument list, shifting ir_arch
        # into the jtc_var slot and passing the symbols copy as the lifter
        symb_exec_minimal = MySymbolicExecutionEngine(pool_bin, jtc_var, ir_arch, symb_exec_both.symbols.copy())
        predecessor_irdst_equation = symb_exec_both.symbols[ircfg.IRDst]

        # get equation whose solutions solve the indirect jump
        irdst_block = ircfg.blocks[cur_bloc.loc_key]
        if len(irdst_block.assignblks) != len(cur_bloc.lines):
            processed = set()
            todo = {irdst_block.loc_key}
            while not irdst_block.dst.is_mem():
                symb_exec_both.eval_updt_irblock(irdst_block)
                loc_key = todo.pop()
                if loc_key in processed:
                    continue
                processed.add(loc_key)
                irdst_block = ircfg.blocks[loc_key]
                todo.update(ircfg.successors(loc_key))

        irdst_equation = symb_exec_both.eval_updt_irblock(irdst_block)
        sizes = set()
        # prevent mem processing via raw arrays by using var ID instead
        # we also want to set a maximum boundary so slices don't cause the sat solver generate a huge number of
        # results
        visitor = ExprVisitorCallbackTopToBottom(lambda x: self._eliminate_jtc_var_slice_cb(x, sizes, jtc_var))
        irdst_equation = visitor.visit(irdst_equation)
        predecessor_irdst_equation = visitor.visit(predecessor_irdst_equation)
        size_boundary = jtc_var.size
        sizes = sorted(filter(lambda x: x > 1, sizes))
        if sizes:
            size_boundary = sizes[0]
        jtc_var_id = ExprId("jtc_var", jtc_var.size)
        irdst_equation = irdst_equation.replace_expr({jtc_var: jtc_var_id})
        predecessor_irdst_equation = predecessor_irdst_equation.replace_expr({jtc_var: jtc_var_id})
        # track possible CS base address dependency, ignore control variable from predecessor
        eliminated_jtc_var_equation = irdst_equation.replace_expr({jtc_var_id: ExprInt(0, jtc_var_id.size)})
        evaluated_ejtc_var_equation = symb_exec_both.eval_expr(eliminated_jtc_var_equation)
        if not evaluated_ejtc_var_equation.is_int():
            # we need to determine code base
            dependencies = dg._follow_apply_cb(evaluated_ejtc_var_equation)
            expr_deps = {fexpr.element for fexpr in dependencies if fexpr.follow}
            dg_base = DependencyGraph(ircfg, implicit=False, apply_simp=True, follow_mem=True, follow_call=False)
            dependency_result_iter = iter(dg_base.get(cur_bloc.loc_key, expr_deps, len(cur_bloc.lines),
                                                      {self.heads()[0]}))
            solution = next(dependency_result_iter)
            code_base_dict = {expr: solution.emul(ir_arch)[expr] for expr in expr_deps}
            irdst_equation = irdst_equation.replace_expr(code_base_dict)
            predecessor_irdst_equation = predecessor_irdst_equation.replace_expr(code_base_dict)

        # we need backward slice of the jump table destination dependencies to retain the other independent
        # assignments during cmp chain assembling
        dependency_result = dg.get(cur_bloc.loc_key, {ircfg.IRDst}, len(cur_bloc.lines), {cur_bloc.loc_key})
        dependent_line_nbs = {}
        for solution in dependency_result:
            dependent_line_nbs.setdefault(solution.loc_key, set()).update(
                {dn.line_nb for dn in solution.relevant_nodes})
        cur_bloc_new_lines = []
        for loc_key, lines in dependent_line_nbs.items():
            for line_nb, assignblk in enumerate(ircfg.blocks[loc_key].assignblks):
                if line_nb not in lines:
                    # keep assignments independent of the jump-table slice
                    symb_exec_minimal.eval_assignblk(assignblk)
                    cur_bloc_new_lines.append(assignblk.instr)
        comparison_reg_id = None
        comparison_reg_value = None
        if jtc_var not in symb_exec_minimal.symbols.symbols_id:
            comparison_reg_id = jtc_var
            comparison_reg_value = jtc_var
        else:
            # find a register/memory holding a value derived from jtc_var that
            # becomes concrete once jtc_var is fixed
            for symbol, comparison_reg_value in symb_exec_minimal.symbols.symbols_id.items():
                if jtc_var in comparison_reg_value and (symbol.is_mem() or
                                                       (symbol.is_id() and symbol.name not in
                                                        ["RIP", "EIP", "zf", "nf", "pf", "of", "cf", "af", "df",
                                                         ircfg.IRDst.name])):
                    replaced_jtcv = comparison_reg_value.replace_expr({jtc_var: ExprInt(0, jtc_var.size)})
                    if isinstance(symb_exec_minimal.eval_expr(replaced_jtcv), ExprInt):
                        comparison_reg_id = symbol
                        break
        if not comparison_reg_id or not comparison_reg_value:
            logger.debug("Couldn't find any candidate for comparison register at 0x%x" %
                         loc_db.get_location_offset(cur_bloc.loc_key))
            return

        from miasm.ir.translators import Translator
        import z3
        translator = Translator.to_language("z3")
        solver = z3.Solver()

        logger.debug("predecessor_irdst_equation: %s" % str(predecessor_irdst_equation))
        logger.debug(("dst_address: 0x%x" % dst_address))
        logger.debug(("jump_table_control_variable: %s" % str(jtc_var)))
        solver.add(translator.from_expr(predecessor_irdst_equation) == dst_address)
        translated_jtc_var = translator.from_expr(jtc_var_id)
        solver.add(translated_jtc_var >= 0)
        solver.add(translated_jtc_var < 2 ** (size_boundary - 1) - 1)

        if solver.check() != z3.sat:
            logger.debug("Couldn't find at least one jump table control variable")
            return

        dbg_destinations = set()
        next_loc_key = new_block_loc_key = loc_db.add_location()

        logger.debug("comparison_reg_id: %s" % str(comparison_reg_id))
        dst_ranges = {}
        counter = 0
        # enumerate solutions of the control variable, mapping each concrete
        # destination to the interval of comparison-register values hitting it
        while counter < 500:
            val = solver.model()[translated_jtc_var].as_long()
            final_irdst_equation = irdst_equation.replace_expr({jtc_var_id: ExprInt(val, jtc_var_id.size)})
            final_dst = int(symb_exec_both.eval_expr(final_irdst_equation))
            cmp_reg_val = comparison_reg_value.replace_expr({jtc_var: ExprInt(val, jtc_var.size)})
            cmp_reg_val = int(symb_exec_minimal.eval_expr(cmp_reg_val))

            dst_ranges[final_dst] = dst_ranges.get(final_dst, interval()).union([(cmp_reg_val, cmp_reg_val)])
            dbg_destinations.add(final_dst)
            offsets_to_dis.add(final_dst)

            # exclude the found value and look for the next solution
            solver.add(translated_jtc_var != translator.from_expr(ExprInt(val, jtc_var_id.size)))
            if solver.check() != z3.sat:
                break
            counter += 1

        if counter == 500:
            raise RuntimeError("Interrupted; there might be a broken slice")

        # emit a chain of compare-and-jump blocks covering each destination's
        # value ranges
        for dst, interv in dst_ranges.items():
            cond_target_loc_key = loc_db.get_or_create_offset_location(dst)
            for lower, upper in interv:
                lower = ExprInt(lower, self.mode)
                upper = ExprInt(upper, self.mode)
                new_asm_block = AsmBlock(new_block_loc_key)
                new_block_loc_key = loc_db.add_location()
                if lower == upper:
                    new_asm_block.lines = create_cmp_j_instructions(self.mode, comparison_reg_id, lower,
                                                                    ExprLoc(cond_target_loc_key, self.mode), "JZ")
                    new_asm_block.add_cst(cond_target_loc_key, "c_to")
                    new_asm_block.add_cst(new_block_loc_key, "c_next")
                else:
                    upper_check_loc_key = loc_db.add_location()
                    # lower boundary check
                    new_asm_block.lines = create_cmp_j_instructions(self.mode, comparison_reg_id, lower,
                                                                    ExprLoc(new_block_loc_key, self.mode), "JB")
                    new_asm_block.add_cst(new_block_loc_key, "c_to")
                    new_asm_block.add_cst(upper_check_loc_key, "c_next")
                    # upper boundary check
                    upper_check_block = AsmBlock(upper_check_loc_key)
                    upper_check_block.lines = create_cmp_j_instructions(self.mode, comparison_reg_id, upper,
                                                                        ExprLoc(cond_target_loc_key, self.mode),
                                                                        "JBE")
                    upper_check_block.add_cst(cond_target_loc_key, "c_to")
                    upper_check_block.add_cst(new_block_loc_key, "c_next")
                    self.add_block(upper_check_block)
                self.add_block(new_asm_block)
        # trigger last jump unconditionally
        new_asm_block.bto = {AsmConstraintTo(cond_target_loc_key)}
        new_asm_block.lines = [create_jump_instruction(self.mode, ExprLoc(cond_target_loc_key, self.mode))]

        cur_bloc.lines = cur_bloc_new_lines
        cur_bloc.add_cst(next_loc_key, "c_next")
        if not cur_bloc.lines:
            cur_bloc.lines = [create_nop(self.mode)]
        self.jmp_table_loc_keys.add(cur_bloc.loc_key)
        logger.debug("destinations: %s" % pformat([hex(i or 0) for i in dbg_destinations]))
        logger.debug("blocks: %d" % counter)

    # noinspection PyUnusedLocal
    def _extended_discovery(self, dism_eng, cur_bloc, offsets_to_dis):
        """dis_block_callback: dispatch obfuscation-pattern handlers on the
        freshly disassembled block's last instruction."""
        mn = self.machine.mn
        attrib = self.mode
        pool_bin = dism_eng.bin_stream
        loc_db = dism_eng.loc_db
        if not cur_bloc.lines:
            return
        last_instruction = cur_bloc.lines[-1]
        if last_instruction.name.startswith("CMOV"):
            self._process_cmov(cur_bloc, last_instruction)
        elif last_instruction.name.startswith("SBB") and last_instruction.args[0] == last_instruction.args[1]:
            self._process_sbb(cur_bloc, last_instruction)
        elif last_instruction.name == 'JMP' and type(last_instruction.args[0]) in [ExprMem, ExprId]:
            self._process_jmp_table(cur_bloc, mn, attrib, loc_db, pool_bin, offsets_to_dis)
        elif last_instruction.name.startswith("INT"):
            offsets_to_dis = set()
        elif last_instruction.name == 'JMP' and last_instruction.args[0].is_loc(cur_bloc.loc_key) \
                and len(cur_bloc.lines) == 1:
            # prevent Miasm eb fe bug https://github.com/cea-sec/miasm/issues/1257
            cur_bloc.lines.insert(0, create_nop(self.mode))

    def disassemble(self, function_address, conn=None):
        """Disassemble the function at *function_address* into this CFG.

        :param conn: optional rpyc connection to IDA; when set, addresses past
                     IDA's known function chunks are excluded from disassembly
        """
        unreachable = []
        if conn:
            ea = conn.modules.idaapi.get_func(function_address)
            try:
                unreachable = [i.end_ea for i in ea.tails] + [ea.end_ea]
            except AttributeError:
                pass
        self.func_addr = function_address
        self.jmp_table_loc_keys = set()
        binary_stream = bin_stream_pe(self._exectbl)
        self._dis_engine = self.disassembler(binary_stream, loc_db=self.loc_db, dont_dis=unreachable)
        self._dis_engine.dis_block_callback = self._extended_discovery
        self._dis_engine.dis_multiblock(function_address, self)

    @property
    def exectbl(self):
        """Parsed PE image backing this CFG."""
        return self._exectbl
def compare_args(args, scn_args, conn):
    """
    Check that every expected call argument matches what IDA sees at the site.

    :param args: dict mapping argument index -> required concrete value
    :param scn_args: sequence of argument addresses (from idaapi.get_arg_addrs)
    :param conn: rpyc connection exposing IDA's idc module
    :return: True if every required argument matches one of the operand values
    """
    for ind, val in args.items():
        # accept a match on either operand of the argument-setting instruction
        possible_vals = (conn.modules.idc.get_operand_value(scn_args[ind], 0),
                         conn.modules.idc.get_operand_value(scn_args[ind], 1))
        if val not in possible_vals:
            return False
    return True


def patch_xref(instr_offset, patch_offset, mdis, mn, exectbl):
    """
    Re-target the call/jump at instr_offset to patch_offset, in place.

    The instruction is re-assembled with the new destination; only a same-size
    encoding is accepted so surrounding code is not shifted.

    :param instr_offset: virtual address of the instruction to patch
    :param patch_offset: new destination virtual address
    :param mdis: miasm disassembly engine
    :param mn: miasm mnemonic class used to re-assemble
    :param exectbl: parsed PE image receiving the patch
    """
    inst = mdis.dis_instr(instr_offset)
    # destination is encoded relative to the instruction address
    new_loc_key = mdis.loc_db.add_location(offset=patch_offset - instr_offset)
    inst.args[0] = ExprLoc(new_loc_key, 32)
    patch = [cand for cand in mn.asm(inst, mdis.loc_db) if len(cand) == inst.l]
    # explicit raise instead of assert: must not vanish under `python -O`
    if not patch:
        raise RuntimeError('Couldn\'t assemble instruction of the same size.')
    exectbl.img_rva[exectbl.virt2rva(instr_offset)] = patch[0]


def patch_xrefs(find_addr, patch_addr, args, ip='localhost', port=4455, conn=None):
    """
    Patches xrefs with certain arguments
    :param find_addr: address of function whose xrefs are to be replaced
    :param patch_addr: the new target of the xref call
    :param args: dictionary mapping number of argument to its required value
    :param ip: optional, IP of the computer running rpyc server in IDA
    :param port: optional, port of the computer running rpyc server in IDA
    :param conn: optional, already established connection to running rpyc server in IDA
    :return: None
    """
    close_conn = False
    if not conn:
        close_conn = True
        conn = rpyc.classic.connect(ip, port)

    file_name = conn.modules.idaapi.get_input_file_path()
    idautils = conn.root.getmodule("idautils")
    # open() is patched to IDA's remote open; close the remote handle once the
    # container has been parsed (it previously leaked)
    with mock.patch("builtins.open", conn.builtins.open):
        stream = open(file_name, 'rb')
        try:
            cont = Container.from_stream(stream)
        finally:
            stream.close()
    machine = Machine(cont.arch)
    mdis = machine.dis_engine(cont.bin_stream)
    exectbl = cont.executable

    for r in idautils.XrefsTo(find_addr):
        scn_args = conn.modules.idaapi.get_arg_addrs(r.frm)
        if scn_args is None and args:
            print("Couldn't find args of %x" % r.frm)
            continue
        if compare_args(args, scn_args, conn):
            patch_xref(r.frm, patch_addr, mdis, machine.mn, exectbl)

    with open(file_name, 'wb') as fl:
        fl.write(bytes(exectbl))

    if close_conn:
        conn.close()