├── LICENSE ├── README.md ├── doc ├── BHArsenal_slide_deck.pdf └── usage_examples.pdf ├── setup.py └── stadeo ├── __init__.py ├── cff ├── __init__.py ├── arcg_cache_depgraph.py ├── cff_recognizer.py ├── cff_solver.py └── cff_strategies.py ├── string ├── __init__.py ├── string_revealer.py └── string_symb_stubs.py └── utils ├── __init__.py ├── extended_asmcfg.py └── xref_patcher.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, ESET spol. s r.o. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | 28 | 29 | -------------------------------------------------------------------------------- 30 | Some regexes in stadeo/string/string_revealer.py are 31 | 32 | Copyright (C) 2017 FireEye, Inc 33 | All rights reserved. 34 | 35 | Apache License 36 | Version 2.0, January 2004 37 | http://www.apache.org/licenses/ 38 | 39 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 40 | 41 | 1. Definitions. 42 | 43 | "License" shall mean the terms and conditions for use, reproduction, 44 | and distribution as defined by Sections 1 through 9 of this document. 45 | 46 | "Licensor" shall mean the copyright owner or entity authorized by 47 | the copyright owner that is granting the License. 48 | 49 | "Legal Entity" shall mean the union of the acting entity and all 50 | other entities that control, are controlled by, or are under common 51 | control with that entity. For the purposes of this definition, 52 | "control" means (i) the power, direct or indirect, to cause the 53 | direction or management of such entity, whether by contract or 54 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 55 | outstanding shares, or (iii) beneficial ownership of such entity. 56 | 57 | "You" (or "Your") shall mean an individual or Legal Entity 58 | exercising permissions granted by this License. 59 | 60 | "Source" form shall mean the preferred form for making modifications, 61 | including but not limited to software source code, documentation 62 | source, and configuration files. 63 | 64 | "Object" form shall mean any form resulting from mechanical 65 | transformation or translation of a Source form, including but 66 | not limited to compiled object code, generated documentation, 67 | and conversions to other media types. 
68 | 69 | "Work" shall mean the work of authorship, whether in Source or 70 | Object form, made available under the License, as indicated by a 71 | copyright notice that is included in or attached to the work 72 | (an example is provided in the Appendix below). 73 | 74 | "Derivative Works" shall mean any work, whether in Source or Object 75 | form, that is based on (or derived from) the Work and for which the 76 | editorial revisions, annotations, elaborations, or other modifications 77 | represent, as a whole, an original work of authorship. For the purposes 78 | of this License, Derivative Works shall not include works that remain 79 | separable from, or merely link (or bind by name) to the interfaces of, 80 | the Work and Derivative Works thereof. 81 | 82 | "Contribution" shall mean any work of authorship, including 83 | the original version of the Work and any modifications or additions 84 | to that Work or Derivative Works thereof, that is intentionally 85 | submitted to Licensor for inclusion in the Work by the copyright owner 86 | or by an individual or Legal Entity authorized to submit on behalf of 87 | the copyright owner. For the purposes of this definition, "submitted" 88 | means any form of electronic, verbal, or written communication sent 89 | to the Licensor or its representatives, including but not limited to 90 | communication on electronic mailing lists, source code control systems, 91 | and issue tracking systems that are managed by, or on behalf of, the 92 | Licensor for the purpose of discussing and improving the Work, but 93 | excluding communication that is conspicuously marked or otherwise 94 | designated in writing by the copyright owner as "Not a Contribution." 95 | 96 | "Contributor" shall mean Licensor and any individual or Legal Entity 97 | on behalf of whom a Contribution has been received by Licensor and 98 | subsequently incorporated within the Work. 99 | 100 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 101 | this License, each Contributor hereby grants to You a perpetual, 102 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 103 | copyright license to reproduce, prepare Derivative Works of, 104 | publicly display, publicly perform, sublicense, and distribute the 105 | Work and such Derivative Works in Source or Object form. 106 | 107 | 3. Grant of Patent License. Subject to the terms and conditions of 108 | this License, each Contributor hereby grants to You a perpetual, 109 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 110 | (except as stated in this section) patent license to make, have made, 111 | use, offer to sell, sell, import, and otherwise transfer the Work, 112 | where such license applies only to those patent claims licensable 113 | by such Contributor that are necessarily infringed by their 114 | Contribution(s) alone or by combination of their Contribution(s) 115 | with the Work to which such Contribution(s) was submitted. If You 116 | institute patent litigation against any entity (including a 117 | cross-claim or counterclaim in a lawsuit) alleging that the Work 118 | or a Contribution incorporated within the Work constitutes direct 119 | or contributory patent infringement, then any patent licenses 120 | granted to You under this License for that Work shall terminate 121 | as of the date such litigation is filed. 122 | 123 | 4. Redistribution. 
You may reproduce and distribute copies of the 124 | Work or Derivative Works thereof in any medium, with or without 125 | modifications, and in Source or Object form, provided that You 126 | meet the following conditions: 127 | 128 | (a) You must give any other recipients of the Work or 129 | Derivative Works a copy of this License; and 130 | 131 | (b) You must cause any modified files to carry prominent notices 132 | stating that You changed the files; and 133 | 134 | (c) You must retain, in the Source form of any Derivative Works 135 | that You distribute, all copyright, patent, trademark, and 136 | attribution notices from the Source form of the Work, 137 | excluding those notices that do not pertain to any part of 138 | the Derivative Works; and 139 | 140 | (d) If the Work includes a "NOTICE" text file as part of its 141 | distribution, then any Derivative Works that You distribute must 142 | include a readable copy of the attribution notices contained 143 | within such NOTICE file, excluding those notices that do not 144 | pertain to any part of the Derivative Works, in at least one 145 | of the following places: within a NOTICE text file distributed 146 | as part of the Derivative Works; within the Source form or 147 | documentation, if provided along with the Derivative Works; or, 148 | within a display generated by the Derivative Works, if and 149 | wherever such third-party notices normally appear. The contents 150 | of the NOTICE file are for informational purposes only and 151 | do not modify the License. You may add Your own attribution 152 | notices within Derivative Works that You distribute, alongside 153 | or as an addendum to the NOTICE text from the Work, provided 154 | that such additional attribution notices cannot be construed 155 | as modifying the License. 
156 | 157 | You may add Your own copyright statement to Your modifications and 158 | may provide additional or different license terms and conditions 159 | for use, reproduction, or distribution of Your modifications, or 160 | for any such Derivative Works as a whole, provided Your use, 161 | reproduction, and distribution of the Work otherwise complies with 162 | the conditions stated in this License. 163 | 164 | 5. Submission of Contributions. Unless You explicitly state otherwise, 165 | any Contribution intentionally submitted for inclusion in the Work 166 | by You to the Licensor shall be under the terms and conditions of 167 | this License, without any additional terms or conditions. 168 | Notwithstanding the above, nothing herein shall supersede or modify 169 | the terms of any separate license agreement you may have executed 170 | with Licensor regarding such Contributions. 171 | 172 | 6. Trademarks. This License does not grant permission to use the trade 173 | names, trademarks, service marks, or product names of the Licensor, 174 | except as required for reasonable and customary use in describing the 175 | origin of the Work and reproducing the content of the NOTICE file. 176 | 177 | 7. Disclaimer of Warranty. Unless required by applicable law or 178 | agreed to in writing, Licensor provides the Work (and each 179 | Contributor provides its Contributions) on an "AS IS" BASIS, 180 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 181 | implied, including, without limitation, any warranties or conditions 182 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 183 | PARTICULAR PURPOSE. You are solely responsible for determining the 184 | appropriateness of using or redistributing the Work and assume any 185 | risks associated with Your exercise of permissions under this License. 186 | 187 | 8. Limitation of Liability. 
In no event and under no legal theory, 188 | whether in tort (including negligence), contract, or otherwise, 189 | unless required by applicable law (such as deliberate and grossly 190 | negligent acts) or agreed to in writing, shall any Contributor be 191 | liable to You for damages, including any direct, indirect, special, 192 | incidental, or consequential damages of any character arising as a 193 | result of this License or out of the use or inability to use the 194 | Work (including but not limited to damages for loss of goodwill, 195 | work stoppage, computer failure or malfunction, or any and all 196 | other commercial damages or losses), even if such Contributor 197 | has been advised of the possibility of such damages. 198 | 199 | 9. Accepting Warranty or Additional Liability. While redistributing 200 | the Work or Derivative Works thereof, You may choose to offer, 201 | and charge a fee for, acceptance of support, warranty, indemnity, 202 | or other liability obligations and/or rights consistent with this 203 | License. However, in accepting such obligations, You may act only 204 | on Your own behalf and on Your sole responsibility, not on behalf 205 | of any other Contributor, and only if You agree to indemnify, 206 | defend, and hold each Contributor harmless for any liability 207 | incurred by, or claims asserted against, such Contributor by reason 208 | of your accepting any such warranty or additional liability. 209 | 210 | END OF TERMS AND CONDITIONS 211 | 212 | APPENDIX: How to apply the Apache License to your work. 213 | 214 | To apply the Apache License to your work, attach the following 215 | boilerplate notice, with the fields enclosed by brackets "{}" 216 | replaced with your own identifying information. (Don't include 217 | the brackets!) The text should be enclosed in the appropriate 218 | comment syntax for the file format. 
We also recommend that a 219 | file or class name and description of purpose be included on the 220 | same "printed page" as the copyright notice for easier 221 | identification within third-party archives. 222 | 223 | Copyright {yyyy} {name of copyright owner} 224 | 225 | Licensed under the Apache License, Version 2.0 (the "License"); 226 | you may not use this file except in compliance with the License. 227 | You may obtain a copy of the License at 228 | 229 | http://www.apache.org/licenses/LICENSE-2.0 230 | 231 | Unless required by applicable law or agreed to in writing, software 232 | distributed under the License is distributed on an "AS IS" BASIS, 233 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 234 | See the License for the specific language governing permissions and 235 | limitations under the License. 236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Stadeo 2 | ====== 3 | 4 | Stadeo is a set of tools primarily developed to facilitate analysis of 5 | [Stantinko](https://www.welivesecurity.com/2017/07/20/stantinko-massive-adware-campaign-operating-covertly-since-2012/), 6 | which is a botnet performing click fraud, ad injection, social network 7 | fraud, password stealing attacks and 8 | [cryptomining](https://www.welivesecurity.com/2019/11/26/stantinko-botnet-adds-cryptomining-criminal-activities/). 9 | 10 | The scripts, written entirely in Python, deal with Stantinko's unique 11 | control-flow-flattening (CFF) and string obfuscation techniques 12 | described in our March 2020 13 | [blogpost](https://www.welivesecurity.com/2020/03/19/stantinko-new-cryptominer-unique-obfuscation-techniques/). 
14 | Additionally, they can be utilized for other purposes: for example, 15 | we’ve already extended our approach to support deobfuscating the CFF 16 | featured in Emotet – a trojan that steals banking credentials and that 17 | downloads additional payloads such as ransomware. 18 | 19 | Our deobfuscation methods use 20 | [IDA](https://www.hex-rays.com/products/ida/), which is a standard tool 21 | in the industry, and [Miasm](https://github.com/cea-sec/miasm) – an open 22 | source framework providing us with various data-flow analyses, a 23 | symbolic execution engine, a dynamic symbolic execution engine and the 24 | means to reassemble modified functions. 25 | -------------------------------------------------------------------------------- /doc/BHArsenal_slide_deck.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eset/stadeo/447842592f3aa6d78be5ba58e0ec9d8e407d7fb2/doc/BHArsenal_slide_deck.pdf -------------------------------------------------------------------------------- /doc/usage_examples.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eset/stadeo/447842592f3aa6d78be5ba58e0ec9d8e407d7fb2/doc/usage_examples.pdf -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 -*- 2 | # 3 | # Copyright (c) 2020 ESET spol. s r.o. 4 | # Author: Vladislav Hrčka 5 | # See LICENSE file for redistribution. 
# setup.py (continued) — package definition for the stadeo toolkit.

from setuptools import setup

setup(
    name='stadeo',
    version='0.0.1',
    packages=['stadeo', 'stadeo.cff', 'stadeo.utils', 'stadeo.string'],
    url='https://github.com/eset/stadeo',
    license='BSD',
    author='Vladislav Hrčka',
    author_email='vladislav.hrcka@eset.com',
    description='Stadeo is a set of tools for control-flow-flattening and string deobfuscation',
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
        "License :: OSI Approved :: BSD License",
        "Programming Language :: Python :: 3",
    ],
    install_requires=[
        # z3 is pinned to an exact release; miasm is pinned to an exact
        # upstream commit — the code relies on miasm internals that change
        # between revisions, so do not loosen these pins casually.
        'z3-solver==4.8.7.0',
        'sortedcontainers',
        'rpyc',
        'future',
        'miasm @ git+https://github.com/cea-sec/miasm@a01c29cd82f5a717e8dee622002e1ca3e189f420',
    ],
    keywords=[
        "reverse engineering",
        "symbolic execution",
        "deobfuscation",
        "control flow flattening",
        "string obfuscation",
        "Stantinko",
        "Emotet",
    ],
)
# ------------------------------------------------------------------------------
# stadeo/__init__.py — package marker, license header only.
# ------------------------------------------------------------------------------
# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.
# ------------------------------------------------------------------------------
# stadeo/cff/__init__.py — package marker, license header only.
# ------------------------------------------------------------------------------
# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.
# ------------------------------------------------------------------------------
# stadeo/cff/arcg_cache_depgraph.py
# ------------------------------------------------------------------------------
# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.

from miasm.analysis import depgraph
from miasm.analysis.data_flow import AssignblkNode
from miasm.expression.expression import *
from miasm.expression.simplifications import expr_simp


def is_local_variable(expr, ir_arch_a, mn):
    """Return True when @expr is a memory access into the local stack frame,
    None otherwise.

    @expr: candidate Expr
    @ir_arch_a: IR architecture (supplies the stack-pointer register via .sp)
    @mn: machine description (supplies regs_init, the initial register values)

    NOTE: returns True/None instead of True/False on purpose — the function
    doubles as an ExprWalk callback below, where None means "keep walking".
    """
    if not expr.is_mem():
        return None
    ptr = expr.ptr
    # Offset of the pointer relative to the initial stack pointer; a constant
    # signed-negative offset means the access is below the frame entry SP,
    # i.e. a local variable slot.
    diff = expr_simp(ptr - mn.regs.regs_init[ir_arch_a.sp])
    if diff.is_int() and int(expr_simp(expr_is_signed_lower(diff, ExprInt(0, diff.size)))):
        return True
    return None


def contains_local_variable(expr, ir_arch_a, mn):
    """Return a truthy value if any sub-expression of @expr is a local stack
    variable (see is_local_variable), else None."""
    visitor = ExprWalk(lambda x: is_local_variable(x, ir_arch_a, mn))
    return visitor.visit(expr)


def custom_init(self, ircfg, initial_state, state, inputs):
    """Replacement for depgraph.DependencyResult.__init__ (monkey-patched
    below). Mirrors the upstream constructor but additionally carries over
    the custom `pending_links` attribute from MyDependencyState when present.
    """
    super(depgraph.DependencyResult, self).__init__(state.loc_key, state.pending)
    self.initial_state = initial_state
    self.history = state.history
    self.pending = state.pending
    self.line_nb = state.line_nb
    self.inputs = inputs
    self.links = state.links
    self._ircfg = ircfg

    # Init lazy elements
    self._has_loop = None
    if hasattr(state, 'pending_links'):
        self.pending_links = state.pending_links


class MyDependencyState(depgraph.DependencyState):
    """DependencyState variant whose "done" identity is based on the pending
    expressions rather than upstream's representation, enabling the state
    memoization done by ArgCacheDependencyGraph.get below.
    """

    def __init__(self, *args, **kwargs):
        # Deliberate trick: `depgraph.DependencyState` is re-bound to THIS
        # class at module bottom, so at call time this super() resolves to the
        # original miasm DependencyState and runs its genuine __init__.
        super(depgraph.DependencyState, self).__init__(*args, **kwargs)
        self.pending_links = set()

    # state consisting of the pendings suits much better our needs
    def get_done_state(self):
        """Returns immutable object representing current state"""
        return self.loc_key, frozenset(self.pending)

    def extend(self, loc_key):
        """Return a copy of itself, with itself in history
        @loc_key: LocKey instance for the new DependencyState's loc_key
        """
        new_state = self.__class__(loc_key, self.pending)
        new_state.links = set(self.links)
        new_state.history = self.history + [loc_key]
        # Propagate the extra memoization bookkeeping into the copy.
        new_state.pending_links = set(self.pending_links)
        return new_state


def custom_visit_inner(self, expr, *args, **kwargs):
    """Replacement for depgraph.FilterExprSources.visit_inner (monkey-patched
    below): classify expressions into followed (ids, mems, calls) and
    not-followed (ints, locs) sources, then defer to the parent visit.
    """
    if expr.is_id():
        self.follow.add(expr)
    elif expr.is_int():
        self.nofollow.add(expr)
    elif expr.is_loc():
        self.nofollow.add(expr)
    elif expr.is_mem():
        self.follow.add(expr)
        # Record the mem itself but do not descend into its pointer unless
        # memory following is requested.
        if not self.follow_mem:
            return None
    elif expr.is_function_call():
        self.follow.add(expr)
        if not self.follow_call:
            return None

    ret = super(depgraph.FilterExprSources, self).visit(expr, *args, **kwargs)
    return ret


def is_push_param(recognizer, loc_key, index):
    """Heuristically decide whether the assignment at (@loc_key, @index) is a
    stack argument being pushed for a later call.

    Walks the IR CFG forward from the assignment. IDA (reached through the
    rpyc connection `recognizer.conn`) supplies idc.get_spd (stack-pointer
    delta at an address) and idaapi.get_arg_addrs (addresses of a call's
    argument setup instructions).

    Returns True if a reachable call lists the initial instruction among its
    argument-setup addresses before the stack slot is released.
    """
    initial_irb = recognizer.ircfg.blocks[loc_key]
    initial_assignblk = initial_irb[index]
    target_stack_ptr = recognizer.conn.modules.idc.get_spd(initial_assignblk.instr.offset)
    todo = [(loc_key, index + 1)]
    done = set()
    while todo:
        loc_key, index = todo.pop()
        if loc_key in done:
            continue
        done.add(loc_key)
        irb = recognizer.ircfg.blocks[loc_key]
        for assignblk in irb[index:]:
            if assignblk.instr and assignblk.instr.offset:
                stack_ptr = recognizer.conn.modules.idc.get_spd(assignblk.instr.offset)
                # Stack pointer back at/above the pushed slot: the slot is
                # dead, stop exploring this path.
                if stack_ptr and stack_ptr >= target_stack_ptr:
                    break
            for dst, src in assignblk.items():
                if src.is_function_call():
                    arg_addresses = recognizer.conn.modules.idaapi.get_arg_addrs(assignblk.instr.offset)
                    if arg_addresses and initial_assignblk.instr.offset in arg_addresses:
                        return True
                    break
        else:
            # Block exhausted without hitting a stop condition: keep scanning
            # in every successor, from its first line.
            for succ in recognizer.ircfg.successors(loc_key):
                todo.append((succ, 0))
    return False


# Install the replacements into miasm's depgraph module. Order matters for
# MyDependencyState.__init__'s super() trick (see comment there).
depgraph.FilterExprSources.visit_inner = custom_visit_inner
depgraph.DependencyState = MyDependencyState
depgraph.DependencyResult.__init__ = custom_init


class ArgCacheDependencyGraph(depgraph.DependencyGraph):
    """
    Since there's typically a number of sequential comparisons in cff loops, we take advantage of the fact and memoize
    already processed states. We can do this because the graph doesn't change. We also halt on mem stack arguments, they
    cannot be part of cff loops.
    """
    def __init__(self, recognizer, *args, **kwargs):
        super(ArgCacheDependencyGraph, self).__init__(*args, **kwargs)
        # Set to True whenever tracking hits a stack argument (see
        # _track_exprs); consumed and reset by get().
        self.incorrect = False
        self.ir = recognizer.ir_arch
        self.mn = recognizer.mn
        self.recognizer = recognizer
        self.defuse_edges = recognizer.analyses.defuse_edges
        # True when the last yielded result came from the memoization cache.
        self.cached = False
        self.new_cache_states = set()

    def _track_exprs(self, state, assignblk, line_nb):
        """Track pending expression in an assignblock"""
        if self.incorrect:
            return
        future_pending = {}
        node_resolved = set()
        for dst, src in assignblk.items():
            assignblk_node = AssignblkNode(state.loc_key, line_nb, dst)
            # Only track pending
            if dst not in state.pending:
                if type(src) in [ExprId, ExprOp, ExprCompose] and any(src in i for i in state.pending):
                    if assignblk_node in self.defuse_edges:
                        # targets function arguments such as lea eax, var; push eax since constant propagation doesn't
                        # work correctly in miasm; https://github.com/cea-sec/miasm/issues/1197;
                        # https://github.com/cea-sec/miasm/issues/1218; https://github.com/cea-sec/miasm/issues/1259;
                        # TODO when constant propagation is fixed, rework this; elaborate on 1259
                        for assignblk_node in self.defuse_edges[assignblk_node]:
                            if is_local_variable(assignblk_node.var, self.ir, self.mn) \
                                    and assignblk_node not in self.defuse_edges:
                                break
                        else:
                            continue
                    elif not is_local_variable(dst, self.ir, self.mn):
                        continue

                    if is_push_param(self.recognizer, assignblk_node.label, assignblk_node.index):
                        # prevents FPs in weird code such as push [ebp+var_18]; call ...; add esp, 4
                        # where [ebp+var_18] is not param and it's just pushed
                        self.incorrect = True
                        return
                continue
            # Track IRDst in implicit mode only
            if dst == self._ircfg.IRDst and not self._implicit:
                continue
            assert dst not in node_resolved
            node_resolved.add(dst)
            dependencies = self._follow_apply_cb(src)

            state.link_element(dst, line_nb)
            state.link_dependencies(dst, line_nb,
                                    dependencies, future_pending)

        # Update pending nodes
        state.remove_pendings(node_resolved)
        state.add_pendings(future_pending)

    def get(self, loc_key, elements, line_nb, heads, done_cache_states=None, incorrect_cache_states=None):
        """Compute the dependencies of @elements at line number @line_nb in
        the block named @loc_key in the current IRCFG, before the execution of
        this line. Dependency check stops if one of @heads is reached. The difference
        with the Miasm implementation is that we just want to know whether there's a
        non-integer dependency and optimize the computation that way.
        @loc_key: LocKey instance
        @element: set of Expr instances
        @line_nb: int
        @heads: set of LocKey instances
        @done_cache_states: optional set of already-processed (memoized) states
        @incorrect_cache_states: optional set of states known to be incorrect
        Return an iterator on DiGraph(DependencyNode)
        """
        # Init the algorithm
        if done_cache_states is None:
            done_cache_states = set()
        if incorrect_cache_states is None:
            incorrect_cache_states = set()
        inputs = {element: set() for element in elements}
        initial_state = depgraph.DependencyState(loc_key, inputs, line_nb)
        todo = {initial_state}
        dpResultcls = depgraph.DependencyResultImplicit if self._implicit else depgraph.DependencyResult
        self.incorrect = False
        new_cache_states = set()
        self.new_cache_states = new_cache_states

        while todo:
            state = todo.pop()
            self._compute_intrablock(state)
            # The caller inspects self.incorrect / self.cached on each yield;
            # both flags are reset immediately after yielding.
            if self.incorrect:
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                self.incorrect = False
                continue
            done_state = state.get_done_state()
            if done_state in incorrect_cache_states:
                self.incorrect = True
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                self.incorrect = False
                continue
            if done_state in done_cache_states | new_cache_states:
                self.cached = True
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                self.cached = False
                continue
            new_cache_states.add(done_state)
            if state.loc_key in heads or not state.pending:
                yield dpResultcls(self._ircfg, initial_state, state, elements)
                continue

            if self._implicit:
                # Force IRDst to be tracked, except in the input block
                state.pending[self._ircfg.IRDst] = set()

            state.pending_links.add(done_state)
            # Propagate state to parents
            for pred in self._ircfg.predecessors_iter(state.loc_key):
                todo.add(state.extend(pred))
        # Only commit the freshly-computed states once the walk finished.
        done_cache_states.update(new_cache_states)
-------------------------------------------------------------------------------- /stadeo/cff/cff_recognizer.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 -*- 2 | # 3 | # Copyright (c) 2020 ESET spol. s r.o. 4 | # Author: Vladislav Hrčka 5 | # See LICENSE file for redistribution. 6 | 7 | from collections import namedtuple 8 | from hashlib import md5 9 | from miasm.analysis.data_flow import ReachingDefinitions, DiGraphDefUse, AssignblkNode 10 | import miasm.analysis.depgraph as depgraph 11 | from miasm.arch.x86 import regs 12 | from miasm.arch.x86.arch import expr_simp 13 | from miasm.core.locationdb import LocationDB 14 | from miasm.expression.expression import * 15 | from miasm.ir.ir import AssignBlock, IRBlock 16 | import logging 17 | 18 | from miasm.ir.symbexec import SymbolicExecutionEngine 19 | from sortedcontainers import SortedSet 20 | 21 | from stadeo.cff.arcg_cache_depgraph import ArgCacheDependencyGraph, contains_local_variable 22 | from stadeo.utils.extended_asmcfg import ExtendedAsmCFG, is_bad_expr, remove_redundant_and_unpin_blocks 23 | 24 | logger = logging.getLogger('CFFrecognizer') 25 | logger.setLevel(logging.WARNING) 26 | 27 | 28 | # logger.basicConfig(stream=sys.stderr, level=logger.DEBUG) 29 | # logger.basicConfig(filename="solver.log", level=logger.DEBUG) 30 | 31 | 32 | class FlatteningLoop(object): 33 | def __init__(self, head_vars: list, primary_loc_keys: set, affected_lines: dict, affected_exprs: dict 34 | , loc_key: LocKey): 35 | # TODO replace loc_key with seq IDs 36 | self.affected_exprs = affected_exprs 37 | self.loc_key = loc_key 38 | self.head_vars = head_vars 39 | self.affected_lines = affected_lines 40 | self.primary_loc_keys = primary_loc_keys 41 | self.is_default = False 42 | self._seq = 0 43 | 44 | def get_affected_hash(self, symb_exec, block_loc_key, flat_loop, source_hash_value): 45 | hash_list = [block_loc_key, source_hash_value] 46 | for head_var in self.head_vars: 
47 | hash_list.append((head_var, symb_exec.eval_expr(head_var))) 48 | for affected_expr in flat_loop.affected_exprs[block_loc_key]: 49 | hash_list.append((affected_expr, symb_exec.eval_expr(affected_expr))) 50 | seq = False 51 | if not flat_loop.affected_exprs[block_loc_key]: 52 | hash_list.append(self._seq) 53 | self._seq += 1 54 | seq = True 55 | new_hash = int(md5(bytes(str(hash_list), 'ascii')).hexdigest(), 16) 56 | return new_hash, seq 57 | 58 | 59 | class FlatteningLoops(object): 60 | def __init__(self): 61 | self._loc_key_to_loop = {} 62 | self.loc_db = LocationDB() 63 | self.loops = [] 64 | # for blocks outside of any loop 65 | self._outside_of_scope = FlatteningLoop([], set(), {}, {}, self.loc_db.add_location()) 66 | self._outside_of_scope.is_default = True 67 | self._address = None 68 | 69 | def get_block(self, block_loc_key, symb_exec, source_flat_block=None): 70 | flat_loop = self[block_loc_key] 71 | flat_hash = source_hash_value = source_loop_loc_key = None 72 | if flat_loop.is_default: 73 | if source_flat_block: 74 | source_loop_loc_key = source_flat_block.source_loop_loc_key or source_flat_block.block_loc_key 75 | source_flat_loop = self[source_loop_loc_key] 76 | source_hash_value = source_flat_block.source_hash_value or source_flat_block.control_hash_value 77 | if block_loc_key in source_flat_loop.affected_lines: 78 | flat_hash, no_affected_expr = \ 79 | flat_loop.get_affected_hash(symb_exec, block_loc_key, source_flat_loop, None) 80 | source_hash_value = None 81 | else: 82 | flat_hash, _ = flat_loop.get_affected_hash(symb_exec, block_loc_key, flat_loop, None) 83 | # TODO check init block too to prevent initial duplicity in case of loops(eliminated by the decompiler) 84 | flat_block = FlatteningBlock(flat_loop.loc_key, source_loop_loc_key, block_loc_key, flat_hash, 85 | source_hash_value) 86 | return flat_block 87 | 88 | def create(self, head_vars, affected_lines, primary_loc_keys, ircfg, address): 89 | self._address = hex(address) if address else 
"None" 90 | affected_exprs = {} 91 | dp = depgraph.DependencyGraph(ircfg, True) 92 | for block_loc_key in affected_lines: 93 | block = ircfg.blocks[block_loc_key] 94 | cur_affected_exprs = SortedSet(key=lambda x: str(x)) 95 | for line_nb in affected_lines[block_loc_key]: 96 | affected_assignments = block.assignblks[line_nb] 97 | for ind, (dst, src) in enumerate(affected_assignments.items()): 98 | if type(src) not in [ExprInt, ExprMem]: 99 | res = next(dp.get(block_loc_key, {dst}, ind, {block_loc_key})) 100 | cur_affected_exprs.update(filter(lambda x: not is_bad_expr(x), res.pending.keys())) 101 | affected_exprs[block_loc_key] = cur_affected_exprs 102 | loop = FlatteningLoop(list(head_vars), primary_loc_keys, affected_lines, affected_exprs, 103 | self.loc_db.add_location()) 104 | upd = {} 105 | for i in loop.primary_loc_keys: 106 | if i in self._loc_key_to_loop: 107 | raise RuntimeError("Overlap of primary blocks of the flattening loops") 108 | upd[i] = loop 109 | self._loc_key_to_loop.update(upd) 110 | self.loops.append(loop) 111 | return loop 112 | 113 | def __getitem__(self, loc_key): 114 | """ 115 | Retrieves particular flattening loop by ID of the block 116 | :param loc_key: 117 | :return: 118 | """ 119 | return self._loc_key_to_loop.get(loc_key, self._outside_of_scope) 120 | 121 | def __contains__(self, loc_key): 122 | return loc_key in self._loc_key_to_loop 123 | 124 | def __len__(self): 125 | return len(self.loops) 126 | 127 | 128 | FlattenState = namedtuple('FlattenState', 'flat_block, symbols') 129 | 130 | 131 | class ConfirmedMergeFunc(object): 132 | def __init__(self, recognizer, vals): 133 | self.recognizer = recognizer 134 | self.vals = vals 135 | 136 | 137 | class FlatteningBlock(object): 138 | """ 139 | We don't need any what flattening loop the block belongs to since they are all disjunct. 
class Analyses(object):
    """Pre-computed CFG analyses: def-use edges, dominators and back edges."""

    def __init__(self, ircfg, asmcfg):
        self.defuse_edges = {}
        self.reaching_defs = ReachingDefinitions(ircfg)
        defuse = DiGraphDefUse(self.reaching_defs, deref_mem=False, apply_simp=True)
        heads = asmcfg.heads()
        self.dominators = asmcfg.compute_dominators(heads[0])
        self.immediate_dominators = asmcfg.compute_immediate_dominators(heads[0])

        self.back_edges = []
        self.rev_back_edges = {}
        for node in asmcfg.walk_depth_first_forward(heads[0]):
            for successor in asmcfg.successors_iter(node):
                # an edge to a dominator is a back edge
                if successor in self.dominators[node]:
                    self.rev_back_edges.setdefault(successor, set()).add(node)
                    self.back_edges.append((node, successor))

        for src, dst in defuse.edges():
            self.defuse_edges.setdefault(src, []).append(dst)


class CFFRecognizer(object):
    """Recognizes control-flow-flattening loops in one function via IDA (rpyc) + miasm."""

    def __init__(self, file_path, func_address, machine, conn):
        self.ir_arch = None
        self.func_address = func_address
        self.asmcfg = None
        self.file_path = file_path
        self.all_affected_lines = {}
        self.flat_loops = FlatteningLoops()
        self.machine = machine
        self.mn = machine.mn
        self._merging_var_candidates = None
        self.merging_var = None
        self.possible_merge_funcs = set()
        self.conn = conn
        # addresses of all functions known to IDA
        self.func_addresses = set(conn.modules.idautils.Functions())
        self.ircfg = None
        self.pad = False
        self.analyses = None

    @staticmethod
    def _resize_top_expr(expr, size):
        """Rebuild expr with the given bit size, or return None when not applicable."""
        cls, state = expr.__reduce__()
        if expr.is_slice():
            return ExprSlice(expr.arg, 0, size)
        elif isinstance(state[-1], int):
            # the trailing int must be the size since all the other args are Expr instances
            return cls(*state[:-1], size)
        elif expr.is_op() and expr.op.startswith("zeroExt"):
            return ExprOp("zeroExt_" + str(size), *expr.args)
        return None

    def _normalize_ircfg(self, conn):
        """Unalias stack/base-pointer accesses and simplify compose expressions in the IR CFG.

        See miasm.re/blog/2017/02/03/data_flow_analysis_depgraph.html; base pointer is
        handled manually here.
        TODO remove manual *BP propagation and use standard miasm propagation when fixed.
        """
        bp = {}
        prev_offset = None
        for irb_loc_key in self.ircfg.walk_breadth_first_forward(LocKey(0)):
            irs = []
            if irb_loc_key not in self.ircfg.blocks:
                continue
            irb = self.ircfg.blocks[irb_loc_key]
            if irb.dst.is_cond() and irb.dst.cond.is_op() and irb.dst.cond.op == 'CC_EQ':
                # TODO propagate cmp ..., arb_int too
                # propagate known zeroes to process test eax, eax; jnz ...; lea edi, [eax+4]
                symb_exec = SymbolicExecutionEngine(self.ir_arch)
                dst = symb_exec.eval_updt_irblock(irb)
                if dst.is_cond() and dst.cond.is_id() and not is_bad_expr(dst.cond) and \
                        symb_exec.eval_expr(dst.cond) == dst.cond:
                    # add explicit mov ID, 0 to the fall-through target
                    target_loc = dst.src2
                    if target_loc.is_int():
                        target_loc = self.asmcfg.loc_db.get_offset_location(int(target_loc))
                    elif target_loc.is_loc():
                        target_loc = target_loc.loc_key
                    else:
                        continue
                    if len(self.ircfg.predecessors(target_loc)) > 1:
                        continue
                    target_irb = self.ircfg.blocks[target_loc]
                    asign_blk = AssignBlock([ExprAssign(dst.cond, ExprInt(0, dst.cond.size))])
                    assignblks = tuple([asign_blk, *target_irb.assignblks])
                    self.ircfg.blocks[target_loc] = IRBlock(target_loc, assignblks)
            fix_dct = {}
            for assignblk in irb:
                offset = prev_offset
                if assignblk.instr and assignblk.instr.offset:
                    offset = assignblk.instr.offset
                    prev_offset = offset
                spd = conn.modules.idc.get_spd(offset)
                if spd is not None:
                    stk_high = ExprInt(spd, self.ir_arch.sp.size)
                    fix_dct = {self.ir_arch.sp: self.mn.regs.regs_init[self.ir_arch.sp] + stk_high}
                    fix_dct.update(bp)
                else:
                    logger.warning("Couldn't acquire stack depth at 0x%x" % (offset or 0x0BADF00D))

                new_assignblk = {}
                for dst, src in assignblk.items():
                    if src.is_compose():
                        # drop bigger-to-smaller composes; they are not important for us
                        slc_arg = None
                        arg = None
                        for tmp_arg in src.args:
                            if not tmp_arg.is_slice():
                                arg = tmp_arg
                            else:
                                slc_arg = tmp_arg
                        if slc_arg and arg and len(arg.get_r()) == 1:
                            top_to_bottom_visitor = ExprVisitorCallbackTopToBottom(
                                lambda x: self._resize_top_expr(x, src.size))
                            src = top_to_bottom_visitor.visit(arg)
                    if dst == src:
                        # special compiler anomalies such as lea esp, [esp+0]
                        continue
                    if src == self.ir_arch.sp:
                        src = expr_simp(src.replace_expr(fix_dct))
                        if bp and src not in bp.values() and irb_loc_key != LocKey(0):
                            raise RuntimeError("Ambiguous base pointer")
                        bp.update({dst: src})
                        fix_dct.update(bp)
                    else:
                        src = expr_simp(src.replace_expr(fix_dct))
                        # NOTE(review): reconstructed nesting — dst rewrite assumed to apply
                        # only on the non-SP branch; confirm against upstream stadeo
                        if dst != self.ir_arch.sp and dst not in bp.keys():
                            dst = dst.replace_expr(fix_dct)
                    dst, src = expr_simp(dst), expr_simp(src)
                    new_assignblk[dst] = src
                irs.append(AssignBlock(new_assignblk, instr=assignblk.instr))
            self.ircfg.blocks[irb.loc_key] = IRBlock(irb.loc_key, irs)

    def _recog_init(self, merging_var_candidates):
        """Build and normalize the IR CFG; optionally pad block 0 with candidate self-assignments.

        Returns the original block 0 when padding was applied (so it can be restored), else None.
        """
        self._merging_var_candidates = merging_var_candidates
        self.ircfg = self.ir_arch.new_ircfg_from_asmcfg(self.asmcfg)
        self.asmcfg.rebuild_edges()

        # TODO put constant propagation here when fixed in Miasm
        # (SSA-based do_propagate_expressions / propagate_cst_expr experiments removed)

        self._normalize_ircfg(self.conn)
        irb_bak = None
        if merging_var_candidates:
            self.pad = True
            new_line = AssignBlock([ExprAssign(k, k) for k in merging_var_candidates])
            irb_bak = self.ircfg.blocks[LocKey(0)]
            new_irb = IRBlock(LocKey(0), tuple([new_line, *self.ircfg.blocks[LocKey(0)].assignblks]))
            self.ircfg.blocks[LocKey(0)] = new_irb

        self.analyses = Analyses(self.ircfg, self.asmcfg)
        return irb_bak

    def clear_cache(self):
        """Drop all per-function analysis state."""
        # TODO save to disk and recover when needed
        self.asmcfg = None
        self.ircfg = None
        self.analyses = None
        self.ir_arch = None
        self.all_affected_lines = {}
        self.flat_loops = FlatteningLoops()
        self.possible_merge_funcs = set()
        self._merging_var_candidates = None

    def _recognize(self, max_loop_num):
        """Symbolically walk the IR CFG, detecting flattening loops and merge candidates."""
        symb_engine = SymbolicExecutionEngine(self.ir_arch, regs.regs_init)
        todo = [(LocKey(0), symb_engine.get_state())]
        done_loc = set()
        if not max_loop_num:
            max_loop_num = float('inf')
        found_loops_num = 0
        while todo:
            loc_key, symb_state = todo.pop()
            if loc_key in done_loc or loc_key not in self.ircfg.blocks:
                continue
            done_loc.add(loc_key)
            ir_block = self.ircfg.blocks[loc_key]
            symb_engine.set_state(symb_state)
            for ind, assignblk in enumerate(ir_block.assignblks):
                for dst, src in assignblk.items():
                    if max_loop_num < found_loops_num:
                        return
                    if src.is_int() and int(src) in self.func_addresses:
                        assignblk_node = AssignblkNode(ir_block.loc_key, ind, dst)
                        # a function address with no uses: possible virtual table initialization
                        if assignblk_node not in self.analyses.defuse_edges or not \
                                self.analyses.defuse_edges[assignblk_node]:
                            self.possible_merge_funcs.add((int(src), frozenset(), loc_key))
                    elif src.is_op("call_func_stack"):
                        self._process_call(src, dst, symb_engine, assignblk, loc_key)
                    elif (expr_simp(src).is_int() and not is_bad_expr(dst)) \
                            or (ir_block.loc_key == LocKey(0) and dst == src and
                                (not self._merging_var_candidates or dst in self._merging_var_candidates)):
                        if self._process_assignment(ir_block, ind, dst):
                            self._merging_var_candidates = None
                            found_loops_num += 1
                symb_engine.eval_updt_assignblk(assignblk)

            for succ in self.ircfg.successors(loc_key):
                todo.append((succ, symb_engine.get_state()))

    def recognize(self, max_loop_num=False, merging_var_candidates=None):
        """Disassemble (once) and recognize flattening loops.

        :param max_loop_num: maximum number of loops to reveal; falsy means unlimited
        :param merging_var_candidates: optional candidate merging variables
        """
        if not merging_var_candidates:
            merging_var_candidates = None
        if not self.asmcfg:
            self.asmcfg = ExtendedAsmCFG(self.file_path, self.conn)
            self.asmcfg.disassemble(self.func_address, self.conn)
            remove_redundant_and_unpin_blocks(self.asmcfg, LocKey(0), self.asmcfg.mode, unpin=False)
            block_nb = len(self.asmcfg.blocks)
            if block_nb > 4250:
                self.clear_cache()
                logger.critical("Function is too big")
                raise RuntimeError("Function is too big")
            self.ir_arch = self.machine.ira(self.asmcfg.loc_db)
            # setting merging vars
            if self.merging_var:
                merging_var_candidates = {self.merging_var}
        else:
            # NOTE(review): reconstructed nesting — assumed an already-built asmcfg means
            # recognition was done before, so bail out; confirm against upstream stadeo
            return
        try:
            irb_bak = self._recog_init(merging_var_candidates)
        except RuntimeError:
            logger.warning("Exotic stack operations, skipping")
            return
        self._recognize(max_loop_num)
        if merging_var_candidates:
            self.ircfg.blocks[LocKey(0)] = irb_bak

    def _process_assignment(self, ir_block, ind, dst):
        """Try to grow flattening loops from one constant assignment; True on success."""
        assignblk_node = AssignblkNode(ir_block.loc_key, ind, dst)
        # loop id 0 is the default
        logger.debug("Processing %s" %
                     hex(self.asmcfg.loc_db.get_location_offset(ir_block.loc_key) or 0))
        local_affected_lines = {}
        affected_irdsts, possible_nodes = self._get_affected_ir_destinations(assignblk_node, local_affected_lines)
        result = False
        for node in self.asmcfg.walk_breadth_first_forward(LocKey(0)):
            if node in possible_nodes:
                filtered_irdsts = self._filter_sequential_loc_keys(node, affected_irdsts)
                affected_lines = {}
                result |= self._create_flattening_loop(node, filtered_irdsts, affected_lines)
        return result

    def _process_call(self, src, dst, symb_engine, assignblk, loc_key):
        """Record a called known function plus its merging-variable candidates."""
        addr = src.args[0]
        if addr.is_mem():
            addr = addr.ptr
        if addr.is_loc():
            addr = self.asmcfg.loc_db.get_location_offset(addr.loc_key)
        if isinstance(addr, int) or addr.is_int():
            addr = int(addr)
            if addr in self.func_addresses:
                candidates = self._get_merging_var_candidates(symb_engine, assignblk, dst)
                self.possible_merge_funcs.add((addr, frozenset(candidates), loc_key))

    def _get_merging_var_candidates(self, symb_engine, assignblk, dst):
        """Collect (expr, value) pairs that could act as merging variables at a call site."""
        stk_high = ExprInt(self.conn.modules.idc.get_spd(assignblk.instr.offset),
                           self.ir_arch.sp.size)
        init_sp = self.mn.regs.regs_init[self.ir_arch.sp]
        fix_dct = {init_sp: - stk_high + init_sp + ExprInt(dst.size // 8, dst.size)}
        candidates = set()  # values are tuples (key, val)
        for key, val in symb_engine.modified(regs.regs_init):
            if not val.is_int() or not val.size > 1 or type(key) not in [ExprId, ExprMem] \
                    or key.is_id() and key.name in ["RIP", "EIP", self.ircfg.IRDst.name]:
                continue
            if not key.is_id():
                # rebase memory keys to relative stack depth
                key = expr_simp(key.replace_expr(fix_dct))
            candidates.add((key, val))
        return candidates

    def _get_affected_ir_destinations(self, assignblk_node, local_affected_lines):
        """Follow def-use edges from one assignment to every affected IRDst."""
        todo = [assignblk_node]
        processed = set()
        result = {}
        possible_nodes = {assignblk_node.label}
        while todo:
            target = todo.pop()
            if not self.flat_loops[target.label].is_default:
                logger.debug("Overlap at %s skipping" % hex(
                    self.ircfg.loc_db.get_location_offset(target.label) or 0xbadf0d))
                return set(), set()

            if target in processed:
                continue
            local_affected_lines.setdefault(target.label, set()).add(target.index)
            processed.add(target)
            if target.var == self.ircfg.IRDst:
                result[target.label] = target
                possible_nodes.add(target.label)
            for use in self.analyses.defuse_edges.get(target, []):
                todo.append(use)
        return result, possible_nodes

    def _filter_sequential_loc_keys(self, node, affected):
        """Keep only the affected IRDst nodes reachable from node without leaving the loop."""
        if node in self.all_affected_lines:
            return set()
        todo = [node]
        accessible = {}
        done = set()
        fst = False  # found the first comparison
        while todo:
            target = todo.pop()
            if target in accessible or target in done:
                continue
            done.add(target)
            succs = self.asmcfg.successors(target)
            irb = self.ircfg.blocks[target]
            if target in affected:
                fst = True
                accessible[target] = affected[target]
            elif len(succs) > 1 and fst:
                continue
            if irb.dst.is_cond() and irb.dst.cond.is_op("CC_EQ"):
                succs = [self.ircfg.get_loc_key(irb.dst.src2.loc_key)]
            todo += succs
        return set(accessible.values())

    def _add_2_control_vars(self, primary_loc_keys, affected_lines, merging_var, done, head_vars):
        """Handle cff loops with 2 control variables introduced by compiler "optimization"
        (range(0,x,1) and range(0,x,1*y)); raises RuntimeError when no back-edge is found.
        """
        found = False
        for disp_loc, last_cff_locs in self.analyses.rev_back_edges.items():
            if disp_loc not in primary_loc_keys:
                continue
            for last_cff_loc in last_cff_locs:
                preds = self.asmcfg.predecessors(last_cff_loc)
                succs = self.asmcfg.successors(last_cff_loc)
                if not len(succs) > 1 and len(preds) == 1 and last_cff_loc not in affected_lines:
                    last_cff_loc = preds[0]
                    succs = self.asmcfg.successors(last_cff_loc)
                if last_cff_loc not in primary_loc_keys:
                    if len(succs) > 1:
                        primary_loc_keys.add(last_cff_loc)
                        opti_node = AssignblkNode(last_cff_loc,
                                                  len(self.ircfg.blocks[last_cff_loc].assignblks) - 1,
                                                  self.ir_arch.IRDst)
                        self._process_affected_irdsts({opti_node}, affected_lines, primary_loc_keys, merging_var,
                                                      done, head_vars)
                        if last_cff_loc in primary_loc_keys:
                            # otherwise last_cff_loc couldn't be determined and was removed from primaries
                            found = True
                    if last_cff_loc in affected_lines:
                        found = True
                else:
                    found = True
        if not found:
            raise RuntimeError("There must be a back-edge")

    def _cff_loop_sanity_check(self, primary_loc_keys, node, affected_lines):
        """Validate loop structure; raises RuntimeError when the shape is implausible."""
        wanted_jump_tables = \
            {i for i in self.asmcfg.jmp_table_loc_keys
             if set(self.ircfg.predecessors(i)).issubset(primary_loc_keys)}
        wanted = affected_lines.keys() | wanted_jump_tables
        tolerance = self._merging_var_candidates is not None
        for i in primary_loc_keys:
            if i not in self.asmcfg.heads() | self.analyses.rev_back_edges.keys() and \
                    (self.analyses.immediate_dominators[i] not in wanted and
                     (set(self.ircfg.predecessors(i)).issubset(wanted) or
                      i in self.analyses.rev_back_edges.values())):
                # immediate dominators must contain affected blocks, except the first one, jump
                # table blocks and, for a merged function, the initial range check
                if tolerance:
                    # initial range check check
                    if all(i in self.analyses.dominators[j] or i == j for j in primary_loc_keys):
                        tolerance = False
                        continue
                raise RuntimeError()

        if not (len(primary_loc_keys) > 1 and
                all(node in self.analyses.dominators[i] for i in primary_loc_keys)):
            # the assignment has to dominate all the primary loc_keys
            raise RuntimeError()

    def _create_flattening_loop(self, node, assign_blocks, affected_lines):
        """Try to register a flattening loop rooted at node; assign_blocks are affected IRDsts."""
        if not len(assign_blocks) > 1:
            return False

        primary_loc_keys = {i.label for i in assign_blocks}
        done = set()
        head_vars = set()
        merging_var = self._process_affected_irdsts(assign_blocks, affected_lines, primary_loc_keys,
                                                    self.merging_var, done, head_vars)
        try:
            self._add_2_control_vars(primary_loc_keys, affected_lines, merging_var, done, head_vars)
            self._cff_loop_sanity_check(primary_loc_keys, node, affected_lines)
        except RuntimeError:
            return False
        if merging_var:
            self.merging_var = merging_var
            logger.debug("setting merging var %s" % merging_var)
        try:
            self.flat_loops.create(head_vars, affected_lines, primary_loc_keys, self.ircfg,
                                   self.ircfg.loc_db.get_location_offset(node))
        except RuntimeError:
            return False
        logger.debug("adding")
        for tmp_loc_key, lines in affected_lines.items():
            self.all_affected_lines.setdefault(tmp_loc_key, SortedSet()).update(lines)
        return True

    def _process_affected_irdsts(self, assign_blocks, affected_lines, primary_loc_keys, merging_var, done, head_vars):
        """Run dependency analysis for each affected IRDst, pruning undecidable blocks.

        Returns the (possibly updated) merging variable.
        """
        dg = ArgCacheDependencyGraph(self, self.ircfg, implicit=False, follow_mem=False, follow_call=False)
        possible_merging_var = merging_var
        incorrect_cache = set()
        for ind, assign_block in enumerate(assign_blocks):
            if self.asmcfg.loc_db.get_location_offset(assign_block.label):
                logger.debug("processing assign_block %d out of %d at %s" %
                             (ind + 1, len(assign_blocks),
                              hex(self.asmcfg.loc_db.get_location_offset(assign_block.label) or 0)))
            base_expr_ids = self.ircfg.blocks[assign_block.label][assign_block.index][assign_block.var].get_r()
            local_done = set(done)
            dr = dg.get(assign_block.label, base_expr_ids, assign_block.index, self.asmcfg.heads(), local_done,
                        incorrect_cache)
            local_affected_lines = {}
            local_head_vars = set()
            for sol in dr:
                if not dg.cached:
                    if not dg.incorrect:
                        possible_merging_vars = set()
                        for pnd in sol.pending:
                            if not contains_local_variable(pnd, self.ir_arch, self.mn):
                                if pnd != self.mn.regs.regs_init[self.ir_arch.sp]:
                                    possible_merging_vars.add(pnd)
                                elif sol.loc_key == LocKey(0):
                                    local_head_vars.add(pnd)

                        pmv_len = len(possible_merging_vars)
                        if self._merging_var_candidates and possible_merging_vars:
                            possible_merging_var = possible_merging_vars.pop()
                    if dg.incorrect or pmv_len > (self._merging_var_candidates is not None) \
                            or possible_merging_var and self._merging_var_candidates and \
                            possible_merging_var not in self._merging_var_candidates or \
                            (merging_var and possible_merging_var != merging_var):
                        incorrect_cache.update(sol.pending_links)
                        primary_loc_keys.remove(assign_block.label)
                        if self.asmcfg.loc_db.get_location_offset(assign_block.label):
                            logger.debug("%s cannot be determined" %
                                         hex(self.asmcfg.loc_db.get_location_offset(assign_block.label) or 0))
                        break
                    else:
                        merging_var = possible_merging_var
                        self._add_relevant_nodes(sol.relevant_nodes, local_affected_lines)
            else:
                # no break: every solution was acceptable, commit the local results
                done.update(local_done)
                head_vars.update(local_head_vars)
                affected_lines.setdefault(assign_block.label, SortedSet()).add(assign_block.index)
                for loc_key, lines in local_affected_lines.items():
                    affected_lines.setdefault(loc_key, SortedSet()).update(lines)
        return merging_var

    @staticmethod
    def _add_relevant_nodes(relevant_nodes, affected_lines):
        for node in relevant_nodes:
            affected_lines.setdefault(node.loc_key, SortedSet()).add(node.line_nb)


# -*- encoding: utf8 -*-
#
# Copyright (c) 2020 ESET spol. s r.o.
# Author: Vladislav Hrčka
# See LICENSE file for redistribution.
import logging

from miasm.analysis.depgraph import DependencyGraph
from miasm.arch.x86.arch import instruction_x86, additional_info
from miasm.core.asmblock import AsmConstraintTo, AsmBlock, AsmConstraint, AsmCFG
from miasm.core.locationdb import LocationDB
from miasm.expression.expression import *
from miasm.ir.ir import AssignBlock, IRBlock
from miasm.ir.symbexec import SymbolicExecutionEngine

from stadeo.utils.extended_asmcfg import create_jump_instruction
from stadeo.cff.cff_recognizer import FlattenState

logger = logging.getLogger('CFFsolver')
logger.setLevel(logging.WARNING)


class CFFSolver(object):
    """Rebuilds a deobfuscated AsmCFG from the flattening loops found by a CFFRecognizer."""

    def __init__(self, recognizer):
        self.ircfg = recognizer.ircfg
        self.asmcfg = recognizer.asmcfg
        self.flat_loops = recognizer.flat_loops
        self.all_affected_lines = recognizer.all_affected_lines
        self.ir_arch = recognizer.ir_arch
        loc_db = LocationDB()
        loc_db.merge(recognizer.asmcfg.loc_db)
        self.out_asmcfg = AsmCFG(loc_db)
        self.merging_var = recognizer.merging_var
        self.pad = recognizer.pad
        self.possible_merge_funcs = recognizer.possible_merge_funcs
        self.relevant_nodes = set()

    def process(self, pending, merging_val, reached_funcs):
        """Deobfuscate the recognized loops.

        :param pending: dict collecting merging-variable values per reached function
        :param merging_val: concrete value of the merging variable, if any
        :param reached_funcs: set collecting addresses of reached functions
        :return: loc_key of the new function head, or None when nothing was flattened
        """
        if len(self.flat_loops) == 0:
            # nothing to solve: just propagate all reached functions
            for func_addr, possible_merge_vars, loc_key in self.possible_merge_funcs:
                reached_funcs.add(func_addr)
                for expr, val in possible_merge_vars:
                    pending.setdefault(func_addr, {}).setdefault(expr, set()).add(val)
            return None

        assert len(self.asmcfg.heads()) == 1

        # add merging var to the ircfg
        if self.pad:
            initial_block_bak = self.ircfg.blocks[LocKey(0)]
            if merging_val and self.merging_var:
                asgn_blk = AssignBlock([ExprAssign(self.merging_var, merging_val)])
            else:
                asgn_blk = AssignBlock()
            assignblks = tuple([asgn_blk, *self.ircfg.blocks[LocKey(0)].assignblks])
            self.ircfg.blocks[LocKey(0)] = IRBlock(LocKey(0), assignblks)

        head = self.asmcfg.heads()[0]
        head_block = self.asmcfg.loc_key_to_block(head)
        new_head = self._deobfuscate_cff_loops(head_block, self.asmcfg.machine.mn.regs.regs_init)

        if self.pad:
            self.ircfg.blocks[LocKey(0)] = initial_block_bak
            if merging_val and self.merging_var:
                # materialize the merging-variable value as a MOV in the output head block
                mode = self.asmcfg.mode
                fix_dct = {self.asmcfg.machine.mn.regs.regs_init[self.ir_arch.sp]: self.ir_arch.sp}
                mov = instruction_x86("MOV", mode, [self.merging_var.replace_expr(fix_dct), merging_val])
                mov.additional_info = additional_info()
                mov.additional_info.g1.value = 0
                self.out_asmcfg.loc_key_to_block(LocKey(0)).lines.insert(0, mov)

        loc_keys = self.relevant_nodes
        for func_addr, possible_merge_vars, loc_key in self.possible_merge_funcs:
            if loc_key in loc_keys:
                reached_funcs.add(func_addr)
                for expr, val in possible_merge_vars:
                    pending.setdefault(func_addr, {}).setdefault(expr, set()).add(val)

        return new_head

    def _insert_flat_block(self, source_flat_block, symb_exec, flat_block_to_loc_key):
        """
        Copies source_flat_block and sets its successors according to flat_block_to_loc_key
        :param flat_block_to_loc_key: dictionary mapping flat_blocks to respective loc_keys
        :param symb_exec: instance of current symbolic execution engine
        :param source_flat_block: flat_block to be inserted
        :return: set of newly discovered flat blocks
        """
        # we're not using redirect_successors after copying to avoid executing the same loops multiple times
        source_block = self.asmcfg.loc_key_to_block(source_flat_block.block_loc_key)
        tobe_processed = {}
        new_flat_blocks = set()
        new_block_loc_key = flat_block_to_loc_key[source_flat_block]
        if self.out_asmcfg.loc_key_to_block(new_block_loc_key) is not None:
            raise Exception("Target loc_key is already associated to a block")
        new_block = AsmBlock(new_block_loc_key)

        # copy instructions
        for ln in source_block.lines:
            tmp_ln = instruction_x86(ln.name, ln.mode, [i.copy() for i in ln.args], ln.additional_info)
            tmp_ln.b = ln.b
            tmp_ln.l = ln.l
            tmp_ln.offset = ln.offset
            new_block.addline(tmp_ln)

        constraints = source_block.bto
        # try to simplify the destination if it's a primary flattening block
        if not self.flat_loops[source_block.loc_key].is_default:
            logger.debug("current block is a part of primary loc_keys")
            simplified_target = symb_exec.eval_expr(self.ircfg.IRDst)
            if isinstance(simplified_target, ExprInt):
                simplified_target = self.asmcfg.loc_db.get_offset_location(int(simplified_target))
            elif isinstance(simplified_target, ExprLoc):
                simplified_target = simplified_target.loc_key
            else:
                # there's probably a(n) (series of) unknown instruction(s) causing an implicit conditional
                # assignment such as CMOV or SBB->AND->ADD, prepend comparison + cond jump if it happens to be
                # common, or add it to ExtendedAsmCFG.extended_discovery and split flow on the final instruction

                # it's also possible that it's not related to any cff loop at all
                addr = self.asmcfg.loc_db.get_location_offset(source_flat_block.block_loc_key)
                addr = hex(addr) if addr else addr
                logger.warning("Couldn't simplify loc_key %s at %s, continuing" %
                               (str(source_flat_block.block_loc_key), addr))
                logger.warning("the simplified target is %s of instance %s" %
                               (simplified_target, type(simplified_target)))
                simplified_target = None
            if simplified_target:
                constraints = {AsmConstraintTo(simplified_target)}
                mode = self.asmcfg.mode

                # remove redundant comparison
                dp = DependencyGraph(self.ircfg, True)
                block_loc_key = source_block.loc_key
                res = next(dp.get(block_loc_key, {self.ircfg.IRDst}, None, {block_loc_key}))
                for depnode in res.relevant_nodes:
                    ind = depnode.line_nb
                    ind -= (len(self.ircfg.blocks[block_loc_key]) - len(new_block.lines))
                    if new_block.lines[ind].name == "CMP":
                        new_block.lines.pop(ind)

                new_block.lines[-1] = create_jump_instruction(mode, ExprLoc(simplified_target, mode))

        # copy constraints
        new_bto = set()
        for constraint in constraints:
            if not self.asmcfg.loc_key_to_block(constraint.loc_key):
                logger.debug("Skipping bad constraint %s" % constraint.loc_key)
                continue
            flat_block = self.flat_loops.get_block(constraint.loc_key, symb_exec, source_flat_block)
            if flat_block not in flat_block_to_loc_key:
                new_flat_blocks.add(flat_block)
                new_loc_key = self.out_asmcfg.loc_db.add_location()
                tobe_processed[constraint.loc_key] = (new_loc_key, flat_block)
                flat_block_to_loc_key[flat_block] = new_loc_key
            else:
                new_loc_key = flat_block_to_loc_key[flat_block]
            new_bto.add(AsmConstraint(new_loc_key, constraint.c_t))
        new_block.bto = new_bto
        new_block.alignment = source_block.alignment

        # change jmp targets
        if new_block.lines:
            for ind, arg in enumerate(list(new_block.lines[-1].args)):
                if isinstance(arg, ExprLoc):
                    if not self.asmcfg.loc_key_to_block(arg.loc_key):
                        logger.debug("Skipping bad constraint %s" % arg.loc_key)
                        continue
                    new_target, flat_block = tobe_processed.get(arg.loc_key, (None, None))
                    if not new_target:
                        flat_block = self.flat_loops.get_block(arg.loc_key, symb_exec, source_flat_block)
                        new_target = flat_block_to_loc_key.get(flat_block)
                        # None in case of irrelevant calls
                    logger.debug("new target: %s" % new_target)
                    if new_target:
                        new_block.lines[-1].args[ind] = ExprLoc(new_target, arg.size)

        self.out_asmcfg.add_block(new_block)
        return new_flat_blocks

    def _deobfuscate_cff_loops(self, source_block, symbols):
        """
        :param symbols: initial symbols of symbolic execution engine to be created
        :param source_block: head of the graph to be deobfuscated
        :return: loc_key of the new head block
        """
        symb_exec = SymbolicExecutionEngine(self.ir_arch)
        flat_block = self.flat_loops.get_block(source_block.loc_key, symb_exec, None)
        # maps flattening blocks to their respective loc_keys
        new_head = LocKey(0)
        flat_block_to_loc_key = {flat_block: new_head}
        todo = [FlattenState(flat_block, symbols)]
        counter = {}
        while todo:
            state = todo.pop()
            block_loc_key = state.flat_block.block_loc_key
            self.relevant_nodes.add(block_loc_key)
            counter[block_loc_key] = counter.get(block_loc_key, 0) + 1
            logger.debug("Processing block at 0x%x as %s; in all affected: %d; loops_id: %s; the jtc_vars are:" %
                         (self.asmcfg.loc_db.get_location_offset(block_loc_key) or 0xBAD, str(block_loc_key),
                          block_loc_key in self.all_affected_lines,
                          self.flat_loops[block_loc_key].loc_key))
            if counter[block_loc_key] > 500:
                raise Exception("Couldn't deobfuscate cff loop, either fell into an infinite loop or processing very "
                                "big function")
            symb_exec.set_state(state.symbols)
            # evaluate all affected lines
            self._eval_updt_lines(symb_exec, block_loc_key)
            for flat_block in self._insert_flat_block(state.flat_block, symb_exec, flat_block_to_loc_key):
                todo.append(FlattenState(flat_block, symb_exec.get_state()))
        return new_head

    def _eval_updt_lines(self, symb_exec, loc_key):
        """Symbolically evaluate all affected assignment lines of the given block."""
        logger.debug("[DBG} block to eval: %s" % self.ircfg.blocks[loc_key])
        if loc_key not in self.all_affected_lines:
            return
        logger.debug("[DBG} lines to eval: %s" % str(self.all_affected_lines[loc_key]))
        for line_nb in self.all_affected_lines[loc_key]:
            assign_blk = self.ircfg.blocks[loc_key].assignblks[line_nb]
            symb_exec.eval_updt_assignblk(assign_blk)
223 | symb_exec.eval_updt_assignblk(assign_blk) 224 | -------------------------------------------------------------------------------- /stadeo/cff/cff_strategies.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 -*- 2 | # 3 | # Copyright (c) 2020 ESET spol. s r.o. 4 | # Author: Vladislav Hrčka 5 | # See LICENSE file for redistribution. 6 | 7 | import logging 8 | from pprint import pformat 9 | from unittest import mock 10 | 11 | import rpyc 12 | from miasm.analysis.machine import Machine 13 | from miasm.expression.expression import LocKey 14 | 15 | from stadeo.cff.cff_recognizer import CFFRecognizer, ConfirmedMergeFunc 16 | from stadeo.cff.cff_solver import CFFSolver 17 | from stadeo.utils.extended_asmcfg import write_patches_to_file 18 | from collections.abc import Iterable 19 | 20 | 21 | logger = logging.getLogger('CFFstrategies') 22 | logger.setLevel(logging.WARNING) 23 | 24 | 25 | class CFFStrategies(object): 26 | def __init__(self, arch): 27 | """ 28 | 29 | :param arch: Either 32 or 64 bit architecture 30 | """ 31 | self._pending = {} 32 | self._reached_funcs = set() 33 | if arch not in [32, 64]: 34 | raise ValueError 35 | self._machine = Machine("x86_" + str(arch)) 36 | 37 | def solve_loop(self, func_address, empty_address, context=None, ip='localhost', port=4455, conn=None, 38 | only_one=False): 39 | """ 40 | Deobfuscates single loop 41 | :param func_address: address of the function to be deobfuscated 42 | :param empty_address: address of the resulting deobfuscated function 43 | :param context: optional, dictionary assigning merging variable to its value using Miasm expressions 44 | :param ip: optional, IP of the computer running rpyc server in IDA 45 | :param port: optional, port of the computer running rpyc server in IDA 46 | :param conn: optional, already estabilished connection to running rpyc server in IDA 47 | :param only_one: do not attempt to reveal more than 1 CFF loop 48 | :return: True on 
successful deobfuscation, otherwise False 49 | """ 50 | close_conn = False 51 | if not conn: 52 | close_conn = True 53 | conn = rpyc.classic.connect(ip, port) 54 | 55 | with mock.patch("builtins.open", conn.builtins.open): 56 | if context is None: 57 | context = {} 58 | 59 | file_path = conn.modules.idaapi.get_input_file_path() 60 | recognizer = CFFRecognizer(file_path, func_address, self._machine, conn) 61 | try: 62 | recognizer.recognize(only_one, context) 63 | except: 64 | return False 65 | val = None 66 | if context: 67 | val = set(context.values()).pop() 68 | new_empty_address = self._solve_loop(empty_address, recognizer, file_path, conn, val) 69 | 70 | if close_conn: 71 | conn.close() 72 | 73 | return new_empty_address == empty_address 74 | 75 | def _solve_loop(self, empty_address, recognizer, file_path, conn=None, val=None): 76 | func = recognizer.asmcfg 77 | deflattener = CFFSolver(recognizer) 78 | new_head = deflattener.process(self._pending, val, self._reached_funcs) 79 | if val: 80 | val = int(val) 81 | if not new_head: 82 | local_mapping = "skipping 0x%08x with val %s: %s\n" % (func.func_addr, recognizer.merging_var, 83 | hex(val or 0x0BADF00D)) 84 | print("%s" % local_mapping, end="") 85 | return empty_address 86 | local_mapping = "0x%08x -> 0x%08x with val %s: %s\n" % (func.func_addr, empty_address, recognizer.merging_var, 87 | hex(val or 0x0BADF00D)) 88 | print("mapping: %s" % local_mapping, end="") 89 | deflattener.out_asmcfg.loc_db.set_location_offset(LocKey(0), empty_address, True) 90 | new_addr = write_patches_to_file(deflattener.out_asmcfg, func.exectbl, empty_address, file_path, func.mode, 91 | 2 ** 64 - 1, new_head) 92 | if conn: 93 | conn.modules.idaapi.reload_file(conn.modules.idaapi.get_input_file_path(), 0) 94 | conn.modules.ida_funcs.add_func(new_addr) 95 | return new_addr 96 | 97 | def process_all(self, empty_address, ip='localhost', port=4455, conn=None): 98 | """ 99 | Tries to deobfuscate all functions recognized by IDA 100 | :param 
empty_address: address where to put all the deobfuscated functions 101 | :param ip: optional, IP of the computer running rpyc server in IDA 102 | :param port: optional, port of the computer running rpyc server in IDA 103 | :param conn: optional, already estabilished connection to running rpyc server in IDA 104 | :return: dictionary assigning each processed function address either to None in case of failure or to the 105 | respective @ConfirmedMergeFunc instance 106 | """ 107 | close_conn = False 108 | if not conn: 109 | close_conn = True 110 | conn = rpyc.classic.connect(ip, port) 111 | 112 | recognized_funcs = {} 113 | with mock.patch("builtins.open", conn.builtins.open): 114 | file_path = conn.modules.idaapi.get_input_file_path() 115 | for func_addr in conn.modules.idautils.Functions(): 116 | recognizer = CFFRecognizer(file_path, func_addr, self._machine, conn) 117 | try: 118 | recognizer.recognize(True) 119 | except: 120 | recognized_funcs[func_addr] = None 121 | continue 122 | recognized_funcs[func_addr] = ConfirmedMergeFunc(recognizer, empty_address) 123 | empty_address = self._solve_loop(empty_address, recognizer, file_path, conn) 124 | if recognized_funcs[func_addr].vals == empty_address: 125 | recognized_funcs[func_addr] = None 126 | recognizer.clear_cache() 127 | 128 | if close_conn: 129 | conn.close() 130 | 131 | return recognized_funcs 132 | 133 | @staticmethod 134 | def _clear_cache(recognized_funcs): 135 | for merge_func in recognized_funcs.values(): 136 | if not merge_func: 137 | continue 138 | merge_func.recognizer.clear_cache() 139 | 140 | def process_merging(self, func_addresses, empty_address, ip='localhost', port=4455, conn=None, 141 | recognized_funcs=None): 142 | """ 143 | Tries to discover and deobfuscate reachable functions 144 | :param func_addresses: initial function address or addresses 145 | :param empty_address: address where to put all the deobfuscated functions 146 | :param ip: optional, IP of the computer running rpyc server in IDA 
    def process_merging(self, func_addresses, empty_address, ip='localhost', port=4455, conn=None,
                        recognized_funcs=None):
        """
        Tries to discover and deobfuscate reachable functions
        :param func_addresses: initial function address or addresses
        :param empty_address: address where to put all the deobfuscated functions
        :param ip: optional, IP of the computer running rpyc server in IDA
        :param port: optional, port of the computer running rpyc server in IDA
        :param conn: optional, already estabilished connection to running rpyc server in IDA
        :param recognized_funcs: optional, dictionary assigning each already processed function address either to None
        in case of failure or to the respective @ConfirmedMergeFunc instance; used after repeated execution with
        different initial address
        :return: dictionary assigning each processed function address either to None in case of failure or to the
        respective @ConfirmedMergeFunc instance
        """
        if recognized_funcs is None:
            recognized_funcs = {}  # ConfirmedMergeFunc
        if not isinstance(func_addresses, Iterable):
            # in case of only one function
            func_addresses = {func_addresses}
        self._reached_funcs.update(func_addresses)

        processed_blocks = 0
        close_conn = False
        if not conn:
            close_conn = True
            conn = rpyc.classic.connect(ip, port)

        # route open() through the remote IDA so the input binary is read there
        with mock.patch("builtins.open", conn.builtins.open):
            file_path = conn.modules.idaapi.get_input_file_path()
            # worklist loop: _solve_loop may add newly reached callees/pending vals
            while self._reached_funcs:
                func_addr = self._reached_funcs.pop()
                logger.debug("Processing func at 0x%x" % func_addr)
                logger.debug("Reached funcs: %s" % {hex(i) for i in self._reached_funcs})
                logger.debug("Pending func_addr: %s" % (pformat(self._pending[func_addr]) if func_addr in self._pending
                                                        else "None"))

                if processed_blocks > 800:
                    # clear cached recognizers to avoid running out of memory
                    processed_blocks = 0
                    self._clear_cache(recognized_funcs)

                if func_addr not in recognized_funcs:
                    # recognize a new func for the first time
                    ida_func = conn.modules.idaapi.get_func(func_addr)
                    if ida_func and ida_func.flags & conn.modules.idaapi.FUNC_LIB:
                        local_mapping = "skipping 0x%08x (library func)\n" % func_addr
                        print("%s" % local_mapping, end="")
                        recognized_funcs[func_addr] = None
                        if func_addr in self._pending:
                            del self._pending[func_addr]
                        continue
                    recognizer = CFFRecognizer(file_path, func_addr, self._machine, conn)
                    merging_var_candidates = self._pending.get(func_addr, {})
                    try:
                        recognizer.recognize(False, merging_var_candidates)
                        recognized_funcs[func_addr] = ConfirmedMergeFunc(recognizer, {})
                        processed_blocks += len(recognizer.asmcfg.blocks)
                    except (TypeError, OSError, RuntimeError):
                        recognized_funcs[func_addr] = None
                        logger.warning("Skipping exotic func at 0x%x" % func_addr)
                        if func_addr in self._pending:
                            del self._pending[func_addr]
                        continue
                elif not self._pending.get(func_addr, None) or \
                        not (recognized_funcs[func_addr] and recognized_funcs[func_addr].recognizer.merging_var):
                    # already processed and nothing pending (or it previously
                    # failed / has no merging var) — nothing more to do
                    if func_addr in self._pending:
                        del self._pending[func_addr]
                    continue

                recognizer = recognized_funcs[func_addr].recognizer
                merging_var = recognizer.merging_var

                if not recognizer.asmcfg:
                    # cache has been cleared
                    recognizer.recognize()
                    processed_blocks += len(recognizer.asmcfg.blocks)

                if not merging_var:
                    # just added non merging; even non-cff funcs can reach this point since they don't have any merging
                    # var, they are to be considered as processed from now on
                    recognized_funcs[func_addr].vals = empty_address
                    empty_address = self._solve_loop(empty_address, recognizer, file_path, conn)
                    if recognized_funcs[func_addr].vals == empty_address:
                        recognized_funcs[func_addr].vals = None
                    continue
                if merging_var not in self._pending[func_addr]:
                    logger.warning("Function 0x%x isn't merging, ignore its previous results" % func_addr)
                    recognizer.flat_loops.loops.pop(0)  # the first loop is merging
                    recognizer.merging_var = None
                    empty_address = self._solve_loop(empty_address, recognizer, file_path, conn)
                    del self._pending[func_addr]
                    continue
                current_vals = self._pending[func_addr][merging_var]
                del self._pending[func_addr]
                for val in current_vals:
                    if val in recognized_funcs[func_addr].vals:
                        continue
                    recognized_funcs[func_addr].vals[val] = empty_address
                    empty_address = self._solve_loop(empty_address, recognizer, file_path, conn, val)
                    if recognized_funcs[func_addr].vals[val] == empty_address:
                        # failed
                        # NOTE(review): on failure the whole entry becomes None;
                        # if another val follows in this loop, the next iteration
                        # dereferences recognized_funcs[func_addr].vals on None and
                        # raises AttributeError — confirm multiple pending vals
                        # cannot follow a failure here
                        recognized_funcs[func_addr] = None

        if close_conn:
            conn.close()

        return recognized_funcs
7 | 8 | import logging 9 | import re 10 | from unittest import mock 11 | 12 | import rpyc 13 | from miasm.analysis.dse import DSEEngine 14 | from miasm.expression.simplifications import expr_simp 15 | from miasm.analysis.sandbox import Sandbox_Win_x86_32, Sandbox_Win_x86_64 16 | from sortedcontainers import SortedList 17 | 18 | from stadeo.string.string_symb_stubs import * 19 | from stadeo.utils.extended_asmcfg import ExtendedAsmCFG 20 | 21 | logger = logging.getLogger('StringRevealer') 22 | logger.setLevel(logging.WARNING) 23 | 24 | 25 | class StringRevealer(object): 26 | # patterns borrowed from https://github.com/fireeye/flare-floss/blob/master/floss/strings.py 27 | ASCII_BYTE = br"!\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[" \ 28 | br"\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t " 29 | ASCII_RE = re.compile(b"([%s]{%d,})" % (ASCII_BYTE, 3)) 30 | UNICODE_RE = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 3)) 31 | 32 | def __init__(self, attrib): 33 | self.sandbox = Sandbox_Win_x86_32 34 | if attrib == 64: 35 | raise Exception("Not supported") 36 | # 64 bit string revealer doesn't work due to https://github.com/cea-sec/miasm/issues/647 37 | self.sandbox = Sandbox_Win_x86_64 38 | parser = self.sandbox.parser() 39 | self.options = parser.parse_args() 40 | self.options.use_windows_structs = True 41 | self.options.usesegm = True 42 | self.options.mimic_env = True 43 | self.options.jitter = "llvm" 44 | self.sb = None 45 | 46 | @staticmethod 47 | def _exec_callback(dse, func, occurances, jitter, strings, get_strings_from_dse): 48 | occurances[jitter.pc] = occurances.get(jitter.pc, 0) + 1 49 | if occurances[jitter.pc] > 500: 50 | return False 51 | # extracts strings more often, but is naturally slower, not needed for Stantinko, one could use it elsewhere: 52 | # if func.loc_db.get_offset_location(jitter.pc): 53 | # # snap = dse.take_snapshot() 54 | # # dse.update_state_from_concrete() 55 | # strings.update(get_strings_from_dse(dse)) 56 | # # 
dse.restore_snapshot(snap) 57 | dse.callback(jitter) 58 | return True 59 | 60 | def process_all(self, ip='localhost', port=4455, conn=None): 61 | """ 62 | Reveals strings in all functions recognized by IDA 63 | :param ip: optional, IP of the computer running rpyc server in IDA 64 | :param port: optional, port of the computer running rpyc server in IDA 65 | :param conn: optional, already estabilished connection to running rpyc server in IDA 66 | :return: dictionary mapping each processed function address to the respective revealed strings 67 | """ 68 | close_conn = False 69 | if not conn: 70 | close_conn = True 71 | conn = rpyc.classic.connect(ip, port) 72 | 73 | strings = {} 74 | file_path = conn.modules.idaapi.get_input_file_path() 75 | with mock.patch("builtins.open", conn.builtins.open): 76 | self.sb = self.sandbox(file_path, self.options, globals()) 77 | # put some mem above initial SP 78 | sp = self.sb.jitter.arch.getsp(self.sb.jitter.attrib) 79 | setattr(self.sb.jitter.cpu, sp.name, self.sb.jitter.stack_base + self.sb.jitter.stack_size - 0x8 * 80) 80 | for func_addr in conn.modules.idautils.Functions(): 81 | func = ExtendedAsmCFG(file_path) 82 | func.disassemble(func_addr, conn) 83 | strings[func_addr] = self._process_func(func) 84 | 85 | if close_conn: 86 | conn.close() 87 | 88 | return strings 89 | 90 | def process_funcs(self, func_addresses, ip='localhost', port=4455, conn=None): 91 | """ 92 | Reveals strings in all supplied function addresses 93 | :param func_addresses: function addresses to process 94 | :param ip: optional, IP of the computer running rpyc server in IDA 95 | :param port: optional, port of the computer running rpyc server in IDA 96 | :param conn: optional, already estabilished connection to running rpyc server in IDA 97 | :return: dictionary mapping each processed function address to the respective revealed strings 98 | """ 99 | close_conn = False 100 | if not conn: 101 | close_conn = True 102 | conn = rpyc.classic.connect(ip, port) 103 | 
104 | strings = {} 105 | file_path = conn.modules.idaapi.get_input_file_path() 106 | with mock.patch("builtins.open", conn.builtins.open): 107 | self.sb = self.sandbox(file_path, self.options, globals()) 108 | # put some mem above initial SP 109 | sp = self.sb.jitter.arch.getsp(self.sb.jitter.attrib) 110 | setattr(self.sb.jitter.cpu, sp.name, self.sb.jitter.stack_base + self.sb.jitter.stack_size - 0x8 * 80) 111 | 112 | # self.sb.jitter.jit.log_regs = True 113 | # self.sb.jitter.jit.log_mn = True 114 | for func_address in func_addresses: 115 | with mock.patch("builtins.open", conn.builtins.open): 116 | func = ExtendedAsmCFG(file_path) 117 | func.disassemble(func_address, conn) 118 | strings[func_address] = self._process_func(func) 119 | 120 | if close_conn: 121 | conn.close() 122 | 123 | return strings 124 | 125 | @staticmethod 126 | def _wipe_dse_errors(dse): 127 | dse.symb.reset_modified() 128 | dse.jitter.vm.set_exception(0) 129 | dse.jitter.cpu.set_exception(0) 130 | dse.jitter.bs._atomic_mode = False 131 | 132 | def _process_func(self, func): 133 | dse = DSEEngine(self.sb.machine) 134 | dse.attach(self.sb.jitter) # needs to be attached before setting exec_cb to overwrite it with ours 135 | bak_snap = dse.take_snapshot() 136 | dse.add_lib_handler(self.sb.libs, globals()) 137 | occurances = {} 138 | addr = func.loc_db.get_location_offset(LocKey(0)) 139 | asmb = func.loc_key_to_block(LocKey(0)) 140 | strings = set() 141 | self.sb.jitter.exec_cb = lambda x: self._exec_callback(dse, func, occurances, x, strings, 142 | self._get_strings_from_dse) 143 | self.sb.jitter.init_run(addr) 144 | try: 145 | self.sb.jitter.run_until(asmb.lines[-1].offset) 146 | except: 147 | pass 148 | dse.update_state_from_concrete() 149 | initial_snap = dse.take_snapshot() # prepared initial context 150 | strings.update(self._get_strings_from_dse(dse)) 151 | dse.restore_snapshot(initial_snap) 152 | for loc_key in func.walk_breadth_first_forward(LocKey(0)): 153 | addr = 
func.loc_db.get_location_offset(loc_key) 154 | if not addr: 155 | continue 156 | occurances.clear() 157 | self._emul_address(dse, addr) 158 | dse.update_state_from_concrete() 159 | strings.update(self._get_strings_from_dse(dse)) 160 | dse.restore_snapshot(initial_snap) 161 | 162 | dse.restore_snapshot(bak_snap) 163 | strings = self._get_top_level_strings(strings) 164 | return strings 165 | 166 | def _emul_address(self, dse, addr): 167 | self.sb.jitter.init_run(addr) 168 | crashed = set() 169 | while 1: 170 | self._wipe_dse_errors(dse) 171 | try: 172 | self.sb.jitter.continue_run() 173 | except Exception as e: 174 | if isinstance(e, RuntimeError) and \ 175 | e.args and e.args[0] == "Cannot find address" and \ 176 | self.sb.jitter.pc not in crashed: 177 | instr = self.sb.jitter.jit.mdis.dis_instr(self.sb.jitter.pc) 178 | crashed.add(self.sb.jitter.pc) 179 | if instr: 180 | next_addr = self.sb.jitter.pc + instr.l 181 | self.sb.jitter.init_run(next_addr) 182 | continue 183 | break 184 | 185 | @staticmethod 186 | def _get_top_level_strings(strings): 187 | new_strings = set() 188 | while strings: 189 | string = strings.pop() 190 | for tmp_string in strings | new_strings: 191 | if string in tmp_string: 192 | break 193 | else: 194 | new_strings.add(string) 195 | return new_strings 196 | 197 | def _get_strings_from_dse(self, dse): 198 | modified_mem = SortedList(key=lambda x: int(x[0])) 199 | for key, val in dse.symb.modified(ids=False, mems=True): 200 | try: 201 | val = dse.eval_expr(key) 202 | key = dse.eval_expr(key.ptr) 203 | except RuntimeError: 204 | continue 205 | if not key.is_int() or not val.is_int(): 206 | continue 207 | modified_mem.add((key, val)) 208 | following_address = None 209 | current_sequence = b"" 210 | strings = set() 211 | for address, value in modified_mem: 212 | if following_address == address: 213 | current_sequence += int(value).to_bytes(value.size // 8, "little") 214 | else: 215 | self._update_strings_from_sequence(current_sequence, strings) 216 
def get_win_str_data_w(jitter, ad_str, max_char=None):
    """
    Read the raw bytes of a NUL-terminated UTF-16LE string from emulated memory
    (terminator excluded).

    :param jitter: miasm jitter whose vm provides get_mem
    :param ad_str: address of the wide string
    :param max_char: optional upper bound on the amount read
    :return: bytes of the string without the two-byte terminator
    """
    # NOTE(review): max_char is compared against a byte count here (the counter
    # grows by 2 per character), while get_win_str_data_a treats it as a
    # character count — confirm the byte semantics is intended.
    length = 0
    cursor = ad_str
    while (max_char is None or length < max_char) and jitter.vm.get_mem(cursor, 2) != b"\x00\x00":
        cursor += 2
        length += 2
    return jitter.vm.get_mem(ad_str, length)
def kernel32_lstrcpy(dse, get_win_str_data, zero_pad):
    """
    Symbolic stub shared by the lstrcpyA/lstrcpyW handlers: copies the source
    string (second stdcall argument) into the destination buffer and sets the
    return register to the destination pointer.

    :param dse: DSEEngine instance positioned at the API call
    :param get_win_str_data: reader returning the raw, unterminated string bytes
    :param zero_pad: terminator size in bytes (1 for ANSI, 2 for wide)
    """
    arg_ptr2 = dse.jitter.get_arg_n_stdcall(2)
    s2 = get_win_str_data(dse.jitter, arg_ptr2)
    # total width in bits: string bytes plus the zero terminator
    real_len = len(s2) * 8 + zero_pad * 8
    # little-endian byte string -> integer value of the whole string
    value = int(hexlify(s2[::-1]), 16)
    # string bits composed with the zero terminator on top
    rhs = ExprCompose(ExprInt(value, len(s2) * 8), ExprInt(0, zero_pad * 8))
    if dse.jitter.ir_arch.attrib == 32:
        # 32-bit stdcall: destination pointer is the first stack argument
        stack_ptr = ExprMem(dse.jitter.ir_arch.sp + ExprInt(4, 32), dse.jitter.ir_arch.attrib)
        evaluated_stack_ptr = dse.eval_expr(stack_ptr)
        lhs = ExprMem(evaluated_stack_ptr, real_len)
        upd = {lhs: rhs,
               ExprId("EAX", 32): lhs.ptr}
    else:
        # 64-bit: destination pointer is taken from RCX, result goes to RAX
        lhs = ExprMem(ExprId("RCX", 64), real_len)
        upd = {lhs: rhs,
               ExprId("RAX", 64): lhs.ptr}
    dse.update_state(upd)
    # apply ret effects
    # NOTE(review): sp is advanced by 12 in both branches — presumably return
    # address plus two 4-byte stdcall arguments on 32-bit; confirm this offset
    # is also correct for the 64-bit path
    rhs = dse.eval_expr(dse.jitter.ir_arch.sp + ExprInt(12, dse.jitter.ir_arch.sp.size))
    dse.update_state({dse.jitter.ir_arch.sp: rhs})
def is_bad_expr(expr):
    """
    Return True for identifier expressions that must not be tracked: the
    instruction pointers, the x86 status flags and miasm's IRDst pseudo-register.

    :param expr: miasm Expr instance
    :return: bool
    """
    if not expr.is_id():
        return False
    return expr.name in ("RIP", "EIP", "zf", "nf", "pf", "of", "cf", "af", "df", "IRDst")
def custom_get_range(self):
    """
    Returns the offset hull of an AsmBlock: (first offset, end of last line).
    Falls back to (0, 0) when the block is empty or any offset/length is unset.
    """
    try:
        first, last = self.lines[0], self.lines[-1]
        span = (first.offset, last.offset + last.l)
    except (IndexError, TypeError):
        # empty block, or offset/length is None (manually injected lines)
        return 0, 0
    if None in span:
        return 0, 0
    return span
def custom_split_flow(self):
    """
    Replacement for instruction_x86.splitflow: additionally treats LOOP*, INT*,
    SYS*, CMOV*, SBB* and CALL as flow-splitting instructions.

    :return: True when the instruction splits control flow
    """
    if self.name in conditional_branch:
        return True
    if self.name in unconditional_branch:
        return False
    # str.startswith accepts a tuple of prefixes — one call instead of the
    # original five chained startswith tests
    if self.name.startswith(("LOOP", "INT", "SYS", "CMOV", "SBB")):
        return True
    return self.name == "CALL"
def custom_add_asmblock_to_ircfg(self, block, ircfg, gen_pc_updt=False, asmcfg=None):
    """
    Add a native block to the current IR
    @block: native assembly block
    @ircfg: IRCFG instance
    @gen_pc_updt: insert PC update effects between instructions

    Extended version: when a block has to be split mid-way (see
    custom_get_next_loc_key, which records self.split_offset), the tail is cut
    off into a new AsmBlock queued on self.new_blocks (created by
    custom_new_ircfg_from_asmcfg) and translated in a later pass.
    """

    loc_key = block.loc_key
    ir_blocks_all = []

    assignments = []
    # current block is exposed so custom_get_next_loc_key can inspect its bto
    self.asm_block = block
    for instr in block.lines:
        if loc_key is None:
            # previous instruction split the flow — start a fresh IR block
            assignments = []
            loc_key = self.get_loc_key_for_instr(instr)
        split = self.add_instr_to_current_state(
            instr, block, assignments,
            ir_blocks_all, gen_pc_updt
        )
        if split:
            ir_blocks_all.append(IRBlock(loc_key, assignments))
            loc_key = None
            if len(assignments) != len(block.lines) and asmcfg:
                # not all lines consumed: split the asm block at split_offset
                # and defer the remainder to the caller's worklist
                new_block = block.split(asmcfg.loc_db, self.split_offset)
                self.new_blocks.append(new_block)
                break
            assignments = []
    if loc_key is not None:
        ir_blocks_all.append(IRBlock(loc_key, assignments))

    new_ir_blocks_all = self.post_add_asmblock_to_ircfg(block, ircfg, ir_blocks_all)
    for irblock in new_ir_blocks_all:
        ircfg.add_irblock(irblock)
    return new_ir_blocks_all
def _make_x86_instruction(name, mode, args):
    """Build an instruction_x86 with default additional_info (prefix group 1 cleared).

    Shared helper extracted from the four factories below, which all repeated
    the same three-line additional_info boilerplate.
    """
    inst = instruction_x86(name, mode, args)
    inst.additional_info = additional_info()
    inst.additional_info.g1.value = 0
    return inst


def create_mov_instruction(mode, dst, src):
    """
    :param mode: 32 or 64, depends on architecture
    :param dst: destination Expr
    :param src: source Expr
    :return: created MOV instruction
    """
    return _make_x86_instruction("MOV", mode, [dst, src])


def create_cond_branch_instruction(mode, name, target):
    """
    :param mode: 32 or 64, depends on architecture
    :param name: mnemonic of the conditional branch (e.g. "JZ")
    :param target: Expr to jump to
    :return: created instruction
    """
    return _make_x86_instruction(name, mode, [target])


def create_cmp_j_instructions(mode, expr, val, target, kind):
    """
    :param mode: 32 or 64, depends on architecture
    :param expr: Expr compared against @val
    :param val: comparison value Expr
    :param target: Expr to jump to
    :param kind: jump mnemonic (e.g. "JZ")
    :return: list [CMP instruction, conditional jump instruction]
    """
    return [_make_x86_instruction("CMP", mode, [expr, val]),
            _make_x86_instruction(kind, mode, [target])]


def create_nop(mode):
    """
    :param mode: 32 or 64, depends on architecture
    :return: created NOP instruction (used to fill otherwise empty blocks)
    """
    return _make_x86_instruction("NOP", mode, [])
def remove_redundant_and_unpin_blocks(asmcfg, head, mode, unpin=True):
    """
    To unpin a block means to unset associated address. New one can be calculated then.
    Drops every block unreachable from @head; for reachable blocks it rewrites
    RIP-relative operands to stay correct after relocation and clears their
    pinned offsets so the assembler can place them anew.
    :return:
    """
    reachable_loc_keys = list(asmcfg.reachable_sons(head))
    blocks_to_be_removed = []
    rip = ExprId("RIP", 64)
    # placeholder location standing in for the block's future (unknown) address
    new_next_addr_card = ExprLoc(asmcfg.loc_db.get_or_create_name_location('_'), 64)
    for block in asmcfg.blocks:
        if block.loc_key not in reachable_loc_keys:
            blocks_to_be_removed.append(block)
        elif unpin:
            for instr in block.lines:
                for ind in range(len(instr.args)):
                    if rip in instr.args[ind]:
                        # RIP-relative operand: re-express it relative to the
                        # yet-unknown new end-of-instruction address so the
                        # displacement is fixed up at assembly time
                        next_addr = ExprInt(instr.offset + instr.l, 64)
                        fix_dict = {rip: rip + next_addr - new_next_addr_card}
                        instr.args[ind] = instr.args[ind].replace_expr(fix_dict)

        # blocks queued for removal also pass through here; harmless since
        # they are deleted below
        if not block.lines:
            block.lines = [create_nop(mode)]
        if unpin and asmcfg.loc_db.get_location_offset(block.loc_key):
            asmcfg.loc_db.unset_location_offset(block.loc_key)

    for block in blocks_to_be_removed:
        asmcfg.del_block(block)
def fix_multiple_next_constraints(asmcfg, mode):
    """
    When there are multiple blocks proceeding another block with no jump, add one.
    Each extra fall-through predecessor is rerouted through a fresh block holding
    a single JMP to the original destination.
    :return:
    """
    # Hoisted out of the node loop (fix): this dict does not depend on loc_key
    # and edges2constraint is not modified inside the loop, so recomputing it
    # for every node made the function accidentally O(V*E).
    next_edges = {edge: constraint for edge, constraint in asmcfg.edges2constraint.items() if
                  constraint == AsmConstraint.c_next}
    blocks_to_be_added = []
    for loc_key in asmcfg.nodes():
        pred_next = list(ploc_key for (ploc_key, dloc_key) in next_edges if dloc_key == loc_key)
        if len(pred_next) > 1:
            # keep the first fall-through, detour every other one via a JMP block
            for index in range(1, len(pred_next)):
                inst = create_jump_instruction(mode, ExprLoc(loc_key, mode))

                new_block_loc_key = asmcfg.loc_db.add_location()
                new_block = AsmBlock(new_block_loc_key)
                new_block.addline(inst)
                new_block.bto = {AsmConstraintTo(loc_key)}

                asmcfg.loc_key_to_block(pred_next[index]).bto = {AsmConstraintNext(new_block_loc_key)}
                blocks_to_be_added.append(new_block)
    # one while might be sufficient, depends on type of _nodes
    for block in blocks_to_be_added:
        asmcfg.add_block(block)
next_empty_address 333 | 334 | with open(out_file_name + ".bak", 'wb') as bak: 335 | with open(out_file_name, 'rb') as fl: 336 | bak.write(fl.read()) 337 | with open(out_file_name, 'wb') as fl: 338 | fl.write(bytes(exectbl)) 339 | return last_empty_address 340 | 341 | 342 | class MySymbolicExecutionEngine(SymbolicExecutionEngine): 343 | def __init__(self, pool_bin, jtc_var, *args, **kwargs): 344 | super(MySymbolicExecutionEngine, self).__init__(*args, **kwargs) 345 | self.pool_bin = pool_bin 346 | self.jtc_var = jtc_var 347 | 348 | def mem_read(self, expr_mem): 349 | """Memory read wrapper for symbolic execution 350 | @expr_mem: ExprMem""" 351 | if not expr_mem.ptr.is_int() or self.jtc_var == expr_mem: 352 | return super(MySymbolicExecutionEngine, self).mem_read(expr_mem) 353 | addr = expr_mem.ptr.arg.arg 354 | size = expr_mem.size // 8 355 | value = self.pool_bin.getbytes(addr, size) 356 | final = ExprInt(int(hexlify(value[::-1]), 16), expr_mem.size) 357 | return final 358 | 359 | 360 | class JTCVariableDependencyGraph(DependencyGraph): 361 | def __init__(self, loc_key, *args, **kwargs): 362 | super(JTCVariableDependencyGraph, self).__init__(*args, **kwargs) 363 | self.jtc_var = None 364 | self.done = False 365 | self.loc_key = loc_key 366 | 367 | def _track_exprs(self, state, assignblk, line_nb): 368 | """Track pending expression in an assignblock""" 369 | if self.done: 370 | return 371 | future_pending = {} 372 | node_resolved = set() 373 | for dst, src in assignblk.items(): 374 | # Only track pending 375 | if dst not in state.pending: 376 | continue 377 | # Track IRDst in implicit mode only 378 | if dst == self._ircfg.IRDst and not self._implicit: 379 | continue 380 | assert dst not in node_resolved 381 | node_resolved.add(dst) 382 | dependencies = self._follow_apply_cb(src) 383 | 384 | state.link_element(dst, line_nb) 385 | state.link_dependencies(dst, line_nb, 386 | dependencies, future_pending) 387 | 388 | # Update pending nodes 389 | if not self.jtc_var and 
class ExtendedAsmCFG(AsmCFG):
    """AsmCFG subclass that recovers obfuscated control flow during
    disassembly: rewrites CMOV/SBB/ADC-based conditionals into explicit
    branches and reconstructs jump tables into compare-and-jump chains."""

    def __init__(self, file_name, conn=None, cont=None, exectbl=None, *args, **kwargs):
        """
        :param file_name: path of the binary to analyze
        :param conn: optional rpyc connection; when set, the file is opened remotely
        :param cont: optional pre-parsed miasm Container (skips re-parsing)
        :param exectbl: optional pre-parsed PE image (skips re-parsing)
        """
        super(ExtendedAsmCFG, self).__init__(loc_db=LocationDB(), *args, **kwargs)
        self.file_name = file_name
        if not cont:
            if conn:
                stream = conn.builtins.open(file_name, 'rb')
            else:
                stream = open(file_name, 'rb')
            cont = Container.from_stream(stream)
        self.cont = cont
        # arch string ends with the bitness, e.g. "x86_64" -> 64
        self.mode = int(cont.arch[-2:])
        self.address_size = self.mode // 8
        self.pck = pck32
        self.upck = upck32
        self.machine = Machine(cont.arch)
        self.disassembler = self.machine.dis_engine
        if self.mode == 64:
            self.pck = pck64
            self.upck = upck64
        self._exectbl = exectbl
        if not exectbl:
            self._exectbl = pe_init.PE(cont.executable)
        self._dis_engine = None
        self.func_addr = None
        self.jmp_table_loc_keys = set()

    def _process_cmov(self, cur_bloc, last_instruction):
        """Rewrite a trailing CMOVcc into an explicit Jcc + MOV block pair."""
        assignment_block = AsmBlock(self.loc_db.add_location())
        cond_block = AsmBlock(self.loc_db.add_location())
        dst = last_instruction.args[0]
        src = last_instruction.args[1]
        assignment_block.lines.append(create_mov_instruction(self.mode, dst, src))
        branch_target = next(iter(cur_bloc.bto)).loc_key
        assignment_block.lines.append(create_jump_instruction(self.mode, ExprLoc(branch_target, self.mode)))
        # CMOVcc -> Jcc with the same condition suffix
        branch_name = "J" + last_instruction.name[len("CMOV"):]
        cur_bloc.lines.pop()
        if not cur_bloc.lines:
            cur_bloc.lines = [create_nop(self.mode)]
        cond_block.lines.append(create_cond_branch_instruction(self.mode, branch_name,
                                                               ExprLoc(assignment_block.loc_key, self.mode)))
        assignment_block.bto = {AsmConstraintTo(branch_target)}
        cond_block.bto = {AsmConstraintNext(branch_target), AsmConstraintTo(assignment_block.loc_key)}
        cur_bloc.bto = {AsmConstraintNext(cond_block.loc_key)}
        self.add_block(assignment_block)
        self.add_block(cond_block)

    def _process_sbb(self, cur_bloc, last_instruction):
        """Rewrite a trailing 'SBB reg, reg' (reg := CF ? -1 : 0) into explicit
        conditional branches assigning -1 or 0."""
        assignment_block = AsmBlock(self.loc_db.add_location())
        cond_block = AsmBlock(self.loc_db.add_location())
        reg = last_instruction.args[0]
        assignment_block.lines.append(create_mov_instruction(self.mode, reg, ExprInt(-1, reg.size)))
        branch_target = next(iter(cur_bloc.bto)).loc_key
        assignment_block.lines.append(create_jump_instruction(self.mode, ExprLoc(branch_target, self.mode)))
        branch_name = "JB"  # JC is not implemented in miasm, using alias
        cur_bloc.lines.pop()
        pre_branch_block = AsmBlock(self.loc_db.add_location())
        pre_branch_block.lines = [create_mov_instruction(self.mode, reg, ExprInt(0, reg.size))]
        cond_block.lines.append(create_cond_branch_instruction(self.mode, branch_name,
                                                               ExprLoc(assignment_block.loc_key, self.mode)))
        if not cur_bloc.lines:
            cur_bloc.lines = [create_nop(self.mode)]
        assignment_block.bto = {AsmConstraintTo(branch_target)}
        cur_bloc.bto = {AsmConstraintNext(cond_block.loc_key)}
        cond_block.bto = {AsmConstraintNext(pre_branch_block.loc_key), AsmConstraintTo(assignment_block.loc_key)}
        pre_branch_block.bto = {AsmConstraintNext(branch_target)}
        self.add_block(assignment_block)
        self.add_block(cond_block)
        self.add_block(pre_branch_block)

    def _process_adc(self, cur_bloc, last_instruction):
        """Rewrite a trailing ADC-based conditional into explicit MOV + Jcc."""
        assignment_block = AsmBlock(self.loc_db.add_location())
        reg = last_instruction.args[0]
        assignment_block.lines.append(create_mov_instruction(self.mode, reg, ExprInt(-1, reg.size)))
        branch_target = next(iter(cur_bloc.bto)).loc_key
        assignment_block.lines.append(create_jump_instruction(self.mode, ExprLoc(branch_target, self.mode)))
        branch_name = "JB"  # JC is not implemented in miasm, using alias
        cur_bloc.lines.pop()
        cur_bloc.lines.append(create_mov_instruction(self.mode, reg, ExprInt(0, reg.size)))
        cur_bloc.lines.append(create_cond_branch_instruction(self.mode, branch_name,
                                                             ExprLoc(assignment_block.loc_key, self.mode)))
        self.add_block(assignment_block)
        assignment_block.bto = {AsmConstraintTo(branch_target)}
        cur_bloc.bto.add(AsmConstraintTo(assignment_block.loc_key))

    @staticmethod
    def _eliminate_jtc_var_slice_cb(expr, sizes, target):
        """Expression-visitor callback collecting the slice sizes applied to
        the jump-table control variable *target* into *sizes* (used to bound
        the SAT search space)."""
        if expr.is_compose():
            if expr.args[0].is_slice() and expr.args[0].arg.is_id() and expr.args[0].arg == target:
                size = expr.args[0].size
                sizes.add(size)
            if expr.args[0].is_id() and expr.args[0] == target:
                size = expr.size
                sizes.add(size)
        elif expr.is_slice() and expr.arg.is_id() and expr.arg == target:
            size = expr.size
            sizes.add(size)

    def _process_jmp_table(self, cur_bloc, mn, attrib, loc_db, pool_bin, offsets_to_dis):
        """Recover an indirect 'JMP [mem]/JMP reg' jump table: enumerate all
        destinations with z3 and replace the indirect jump by a chain of
        compare-and-jump blocks, feeding the targets back to the disassembler
        via *offsets_to_dis*."""
        # TODO add support for jump tables with "AND cntrl_var, range" boundary check; such jmp tables were present
        # only in library functions in Stantinko samples
        # add current block to the asmcfg to make it accessible in the ircfg edges, add_block is called anyway right
        # after this callback, it will notice that the block has been already added
        self.add_block(cur_bloc)
        dst_address = loc_db.get_location_offset(cur_bloc.loc_key)

        logger.info("Possible jump table addr: 0x%x" % dst_address)

        ira = get_ira(mn, attrib)

        ir_arch = ira(loc_db)

        ircfg = ir_arch.new_ircfg_from_asmcfg(self)

        # the previous blocks should have exactly 1 predecessor dictating range
        predecessors = self.predecessors(cur_bloc.loc_key)
        if len(predecessors) != 1:
            logger.info("Expected exactly one predecessor")
            return
        predecessor = ircfg.blocks[predecessors.pop()]

        # one asm block may have been split into several IR blocks; walk the IR
        # successors until we reach the block whose IRDst is the memory read
        irdst_block = ircfg.blocks[cur_bloc.loc_key]
        if len(irdst_block.assignblks) != len(cur_bloc.lines):
            processed = set()
            todo = {irdst_block.loc_key}
            while not irdst_block.dst.is_mem():
                loc_key = todo.pop()
                if loc_key in processed:
                    continue
                processed.add(loc_key)
                irdst_block = ircfg.blocks[loc_key]
                todo.update(ircfg.successors(loc_key))

        # we shouldn't stumble upon crashing segm and call operators even thought implicit is required to process
        # initial IRDst(mentioned operators cause crashes of the engine behind implicit) since we operate only on
        # the 2 crucial basic blocks. The predecessor contains range of the jump table, we use it to determine
        # constructs of the jump table and track back base code segment address assignment to target the msvc
        # compiler and x64 architecture, other compilers use directly RIP related addressing to get the address.

        # get real predecessor
        asm_block = self.loc_key_to_block(predecessor.loc_key)
        if len(predecessor.assignblks) != len(asm_block.lines):
            processed = set()
            todo = {predecessor.loc_key}
            while cur_bloc.loc_key not in ircfg.successors(predecessor.loc_key):
                loc_key = todo.pop()
                if loc_key in processed:
                    continue
                processed.add(loc_key)
                predecessor = ircfg.blocks[loc_key]
                todo.update(ircfg.successors(loc_key))

        # get jump_table_control_variable from predecessor
        dg = DependencyGraph(ircfg, implicit=True, apply_simp=True, follow_mem=True, follow_call=False)
        jtcdg = JTCVariableDependencyGraph(predecessor.loc_key,
                                           ircfg, implicit=True, apply_simp=True, follow_mem=False,
                                           follow_call=False)

        dependency_result_iter = iter(jtcdg.get(irdst_block.loc_key, {ircfg.IRDst}, len(predecessor.assignblks),
                                                {predecessor.loc_key}))
        solution_predecessor = next(dependency_result_iter)
        # jump table control variable
        jtc_var = jtcdg.jtc_var
        if not jtc_var:
            logger.info("couldn't determine single jump table control variable")
            return
        # get symbolic execution engine to be used in both predecessor and jmp table block
        symb_exec_both = MySymbolicExecutionEngine(pool_bin, jtc_var, ir_arch)
        try:
            # symbolically evaluate lines influencing IRDst of the predecessor leading to jtc_var
            for line_nb in sorted({node.line_nb for node in solution_predecessor.relevant_nodes
                                   if node.loc_key == predecessor.loc_key}):
                assign_blk = predecessor.assignblks[line_nb]
                symb_exec_both.eval_updt_assignblk(assign_blk)
        except (KeyError, TypeError):
            logger.error(
                "Couldn't symbolically eval predecessor of 0x%x" % loc_db.get_location_offset(cur_bloc.loc_key))
            # stantinko contains illegal unreachable dereferences prior jmp tables, such as
            # xor eax, eax; movsx eax, byte ptr [eax]
            return
        # get symbolic execution engine supporting binary memory dereference
        # BUGFIX: jtc_var was missing from the argument list, shifting ir_arch
        # into the jtc_var slot and passing the symbols copy as the lifter
        symb_exec_minimal = MySymbolicExecutionEngine(pool_bin, jtc_var, ir_arch, symb_exec_both.symbols.copy())
        predecessor_irdst_equation = symb_exec_both.symbols[ircfg.IRDst]

        # get equation whose solutions solve the indirect jump
        irdst_block = ircfg.blocks[cur_bloc.loc_key]
        if len(irdst_block.assignblks) != len(cur_bloc.lines):
            processed = set()
            todo = {irdst_block.loc_key}
            while not irdst_block.dst.is_mem():
                symb_exec_both.eval_updt_irblock(irdst_block)
                loc_key = todo.pop()
                if loc_key in processed:
                    continue
                processed.add(loc_key)
                irdst_block = ircfg.blocks[loc_key]
                todo.update(ircfg.successors(loc_key))

        irdst_equation = symb_exec_both.eval_updt_irblock(irdst_block)
        sizes = set()
        # prevent mem processing via raw arrays by using var ID instead
        # we also want to set a maximum boundary so slices don't cause the sat solver generate a huge number of
        # results
        visitor = ExprVisitorCallbackTopToBottom(lambda x: self._eliminate_jtc_var_slice_cb(x, sizes, jtc_var))
        irdst_equation = visitor.visit(irdst_equation)
        predecessor_irdst_equation = visitor.visit(predecessor_irdst_equation)
        size_boundary = jtc_var.size
        sizes = sorted(filter(lambda x: x > 1, sizes))
        if sizes:
            size_boundary = sizes[0]
        jtc_var_id = ExprId("jtc_var", jtc_var.size)
        irdst_equation = irdst_equation.replace_expr({jtc_var: jtc_var_id})
        predecessor_irdst_equation = predecessor_irdst_equation.replace_expr({jtc_var: jtc_var_id})
        # track possible CS base address dependency, ignore control variable from predecessor
        eliminated_jtc_var_equation = irdst_equation.replace_expr({jtc_var_id: ExprInt(0, jtc_var_id.size)})
        evaluated_ejtc_var_equation = symb_exec_both.eval_expr(eliminated_jtc_var_equation)
        if not evaluated_ejtc_var_equation.is_int():
            # we need to determine code base
            dependencies = dg._follow_apply_cb(evaluated_ejtc_var_equation)
            expr_deps = {fexpr.element for fexpr in dependencies if fexpr.follow}
            dg_base = DependencyGraph(ircfg, implicit=False, apply_simp=True, follow_mem=True, follow_call=False)
            dependency_result_iter = iter(dg_base.get(cur_bloc.loc_key, expr_deps, len(cur_bloc.lines),
                                                      {self.heads()[0]}))
            solution = next(dependency_result_iter)
            code_base_dict = {expr: solution.emul(ir_arch)[expr] for expr in expr_deps}
            irdst_equation = irdst_equation.replace_expr(code_base_dict)
            predecessor_irdst_equation = predecessor_irdst_equation.replace_expr(code_base_dict)

        # we need backward slice of the jump table destination dependencies to retain the other independent
        # assignments during cmp chain assembling
        dependency_result = dg.get(cur_bloc.loc_key, {ircfg.IRDst}, len(cur_bloc.lines), {cur_bloc.loc_key})
        dependent_line_nbs = {}
        for solution in dependency_result:
            dependent_line_nbs.setdefault(solution.loc_key, set()).update(
                {dn.line_nb for dn in solution.relevant_nodes})
        cur_bloc_new_lines = []
        for loc_key, lines in dependent_line_nbs.items():
            for line_nb, assignblk in enumerate(ircfg.blocks[loc_key].assignblks):
                if line_nb not in lines:
                    # keep assignments independent of the jump-table slice
                    symb_exec_minimal.eval_assignblk(assignblk)
                    cur_bloc_new_lines.append(assignblk.instr)
        comparison_reg_id = None
        comparison_reg_value = None
        if jtc_var not in symb_exec_minimal.symbols.symbols_id:
            comparison_reg_id = jtc_var
            comparison_reg_value = jtc_var
        else:
            # find a register/memory holding a value derived from jtc_var that
            # becomes concrete once jtc_var is fixed
            for symbol, comparison_reg_value in symb_exec_minimal.symbols.symbols_id.items():
                if jtc_var in comparison_reg_value and (symbol.is_mem() or
                                                       (symbol.is_id() and symbol.name not in
                                                        ["RIP", "EIP", "zf", "nf", "pf", "of", "cf", "af", "df",
                                                         ircfg.IRDst.name])):
                    replaced_jtcv = comparison_reg_value.replace_expr({jtc_var: ExprInt(0, jtc_var.size)})
                    if isinstance(symb_exec_minimal.eval_expr(replaced_jtcv), ExprInt):
                        comparison_reg_id = symbol
                        break
        if not comparison_reg_id or not comparison_reg_value:
            logger.debug("Couldn't find any candidate for comparison register at 0x%x" %
                         loc_db.get_location_offset(cur_bloc.loc_key))
            return

        from miasm.ir.translators import Translator
        import z3
        translator = Translator.to_language("z3")
        solver = z3.Solver()

        logger.debug("predecessor_irdst_equation: %s" % str(predecessor_irdst_equation))
        logger.debug(("dst_address: 0x%x" % dst_address))
        logger.debug(("jump_table_control_variable: %s" % str(jtc_var)))
        solver.add(translator.from_expr(predecessor_irdst_equation) == dst_address)
        translated_jtc_var = translator.from_expr(jtc_var_id)
        solver.add(translated_jtc_var >= 0)
        solver.add(translated_jtc_var < 2 ** (size_boundary - 1) - 1)

        if solver.check() != z3.sat:
            logger.debug("Couldn't find at least one jump table control variable")
            return

        dbg_destinations = set()
        next_loc_key = new_block_loc_key = loc_db.add_location()

        logger.debug("comparison_reg_id: %s" % str(comparison_reg_id))
        dst_ranges = {}
        counter = 0
        # enumerate solutions of the control variable, mapping each concrete
        # destination to the interval of comparison-register values hitting it
        while counter < 500:
            val = solver.model()[translated_jtc_var].as_long()
            final_irdst_equation = irdst_equation.replace_expr({jtc_var_id: ExprInt(val, jtc_var_id.size)})
            final_dst = int(symb_exec_both.eval_expr(final_irdst_equation))
            cmp_reg_val = comparison_reg_value.replace_expr({jtc_var: ExprInt(val, jtc_var.size)})
            cmp_reg_val = int(symb_exec_minimal.eval_expr(cmp_reg_val))

            dst_ranges[final_dst] = dst_ranges.get(final_dst, interval()).union([(cmp_reg_val, cmp_reg_val)])
            dbg_destinations.add(final_dst)
            offsets_to_dis.add(final_dst)

            # exclude the found value and look for the next solution
            solver.add(translated_jtc_var != translator.from_expr(ExprInt(val, jtc_var_id.size)))
            if solver.check() != z3.sat:
                break
            counter += 1

        if counter == 500:
            raise RuntimeError("Interrupted; there might be a broken slice")

        # emit a chain of compare-and-jump blocks covering each destination's
        # value ranges
        for dst, interv in dst_ranges.items():
            cond_target_loc_key = loc_db.get_or_create_offset_location(dst)
            for lower, upper in interv:
                lower = ExprInt(lower, self.mode)
                upper = ExprInt(upper, self.mode)
                new_asm_block = AsmBlock(new_block_loc_key)
                new_block_loc_key = loc_db.add_location()
                if lower == upper:
                    new_asm_block.lines = create_cmp_j_instructions(self.mode, comparison_reg_id, lower,
                                                                    ExprLoc(cond_target_loc_key, self.mode), "JZ")
                    new_asm_block.add_cst(cond_target_loc_key, "c_to")
                    new_asm_block.add_cst(new_block_loc_key, "c_next")
                else:
                    upper_check_loc_key = loc_db.add_location()
                    # lower boundary check
                    new_asm_block.lines = create_cmp_j_instructions(self.mode, comparison_reg_id, lower,
                                                                    ExprLoc(new_block_loc_key, self.mode), "JB")
                    new_asm_block.add_cst(new_block_loc_key, "c_to")
                    new_asm_block.add_cst(upper_check_loc_key, "c_next")
                    # upper boundary check
                    upper_check_block = AsmBlock(upper_check_loc_key)
                    upper_check_block.lines = create_cmp_j_instructions(self.mode, comparison_reg_id, upper,
                                                                        ExprLoc(cond_target_loc_key, self.mode),
                                                                        "JBE")
                    upper_check_block.add_cst(cond_target_loc_key, "c_to")
                    upper_check_block.add_cst(new_block_loc_key, "c_next")
                    self.add_block(upper_check_block)
                self.add_block(new_asm_block)
        # trigger last jump unconditionally
        new_asm_block.bto = {AsmConstraintTo(cond_target_loc_key)}
        new_asm_block.lines = [create_jump_instruction(self.mode, ExprLoc(cond_target_loc_key, self.mode))]

        cur_bloc.lines = cur_bloc_new_lines
        cur_bloc.add_cst(next_loc_key, "c_next")
        if not cur_bloc.lines:
            cur_bloc.lines = [create_nop(self.mode)]
        self.jmp_table_loc_keys.add(cur_bloc.loc_key)
        logger.debug("destinations: %s" % pformat([hex(i or 0) for i in dbg_destinations]))
        logger.debug("blocks: %d" % counter)

    # noinspection PyUnusedLocal
    def _extended_discovery(self, dism_eng, cur_bloc, offsets_to_dis):
        """dis_block_callback: dispatch obfuscation-pattern handlers on the
        freshly disassembled block's last instruction."""
        mn = self.machine.mn
        attrib = self.mode
        pool_bin = dism_eng.bin_stream
        loc_db = dism_eng.loc_db
        if not cur_bloc.lines:
            return
        last_instruction = cur_bloc.lines[-1]
        if last_instruction.name.startswith("CMOV"):
            self._process_cmov(cur_bloc, last_instruction)
        elif last_instruction.name.startswith("SBB") and last_instruction.args[0] == last_instruction.args[1]:
            self._process_sbb(cur_bloc, last_instruction)
        elif last_instruction.name == 'JMP' and type(last_instruction.args[0]) in [ExprMem, ExprId]:
            self._process_jmp_table(cur_bloc, mn, attrib, loc_db, pool_bin, offsets_to_dis)
        elif last_instruction.name.startswith("INT"):
            offsets_to_dis = set()
        elif last_instruction.name == 'JMP' and last_instruction.args[0].is_loc(cur_bloc.loc_key) \
                and len(cur_bloc.lines) == 1:
            # prevent Miasm eb fe bug https://github.com/cea-sec/miasm/issues/1257
            cur_bloc.lines.insert(0, create_nop(self.mode))

    def disassemble(self, function_address, conn=None):
        """Disassemble the function at *function_address* into this CFG.

        :param conn: optional rpyc connection to IDA; when set, addresses past
                     IDA's known function chunks are excluded from disassembly
        """
        unreachable = []
        if conn:
            ea = conn.modules.idaapi.get_func(function_address)
            try:
                unreachable = [i.end_ea for i in ea.tails] + [ea.end_ea]
            except AttributeError:
                pass
        self.func_addr = function_address
        self.jmp_table_loc_keys = set()
        binary_stream = bin_stream_pe(self._exectbl)
        self._dis_engine = self.disassembler(binary_stream, loc_db=self.loc_db, dont_dis=unreachable)
        self._dis_engine.dis_block_callback = self._extended_discovery
        self._dis_engine.dis_multiblock(function_address, self)

    @property
    def exectbl(self):
        """Parsed PE image backing this CFG."""
        return self._exectbl
def compare_args(args, scn_args, conn):
    """
    Check that every expected call argument matches what IDA sees at the site.

    :param args: dict mapping argument index -> required concrete value
    :param scn_args: sequence of argument addresses (from idaapi.get_arg_addrs)
    :param conn: rpyc connection exposing IDA's idc module
    :return: True if every required argument matches one of the operand values
    """
    for ind, val in args.items():
        # accept a match on either operand of the argument-setting instruction
        possible_vals = (conn.modules.idc.get_operand_value(scn_args[ind], 0),
                         conn.modules.idc.get_operand_value(scn_args[ind], 1))
        if val not in possible_vals:
            return False
    return True


def patch_xref(instr_offset, patch_offset, mdis, mn, exectbl):
    """
    Re-target the call/jump at instr_offset to patch_offset, in place.

    The instruction is re-assembled with the new destination; only a same-size
    encoding is accepted so surrounding code is not shifted.

    :param instr_offset: virtual address of the instruction to patch
    :param patch_offset: new destination virtual address
    :param mdis: miasm disassembly engine
    :param mn: miasm mnemonic class used to re-assemble
    :param exectbl: parsed PE image receiving the patch
    """
    inst = mdis.dis_instr(instr_offset)
    # destination is encoded relative to the instruction address
    new_loc_key = mdis.loc_db.add_location(offset=patch_offset - instr_offset)
    inst.args[0] = ExprLoc(new_loc_key, 32)
    patch = [cand for cand in mn.asm(inst, mdis.loc_db) if len(cand) == inst.l]
    # explicit raise instead of assert: must not vanish under `python -O`
    if not patch:
        raise RuntimeError('Couldn\'t assemble instruction of the same size.')
    exectbl.img_rva[exectbl.virt2rva(instr_offset)] = patch[0]


def patch_xrefs(find_addr, patch_addr, args, ip='localhost', port=4455, conn=None):
    """
    Patches xrefs with certain arguments
    :param find_addr: address of function whose xrefs are to be replaced
    :param patch_addr: the new target of the xref call
    :param args: dictionary mapping number of argument to its required value
    :param ip: optional, IP of the computer running rpyc server in IDA
    :param port: optional, port of the computer running rpyc server in IDA
    :param conn: optional, already established connection to running rpyc server in IDA
    :return: None
    """
    close_conn = False
    if not conn:
        close_conn = True
        conn = rpyc.classic.connect(ip, port)

    file_name = conn.modules.idaapi.get_input_file_path()
    idautils = conn.root.getmodule("idautils")
    # open() is patched to IDA's remote open; close the remote handle once the
    # container has been parsed (it previously leaked)
    with mock.patch("builtins.open", conn.builtins.open):
        stream = open(file_name, 'rb')
        try:
            cont = Container.from_stream(stream)
        finally:
            stream.close()
    machine = Machine(cont.arch)
    mdis = machine.dis_engine(cont.bin_stream)
    exectbl = cont.executable

    for r in idautils.XrefsTo(find_addr):
        scn_args = conn.modules.idaapi.get_arg_addrs(r.frm)
        if scn_args is None and args:
            print("Couldn't find args of %x" % r.frm)
            continue
        if compare_args(args, scn_args, conn):
            patch_xref(r.frm, patch_addr, mdis, machine.mn, exectbl)

    with open(file_name, 'wb') as fl:
        fl.write(bytes(exectbl))

    if close_conn:
        conn.close()