├── .gitignore
├── LICENSE.txt
├── README.md
├── docs
├── bfs_highlight.png
├── compatibility.md
├── data.md
├── dragon1.png
├── emulator.md
├── getting_started.md
├── graph.png
├── index.md
├── reference.md
└── typecheck.png
├── examples
├── AsmLevelDataFlow.py
├── ContextRecovery.py
├── DumpFunctionAST.py
├── DumpHighPcode.py
├── EmulatorHooks.py
├── FixUnaffectedRegisters.py
├── LummaPatternBasedDeobfuscation.py
├── README.md
├── RecoverFunctionPointers.py
└── SwitchOverride.py
├── ghidralib.py
├── mkdocs.yml
├── pyproject.toml
└── tests
└── ghidralib_test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | site/
3 | shell.nix
4 | __pycache__
5 | ghidralib.egg-info
6 | build/
7 | dist/
8 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ghidralib
2 |
3 | 
4 |
5 | This library is an attempt to provide a Pythonic standard library for Ghidra.
6 |
7 | The main goal is to make writing quick&dirty scripts actually quick, and not that dirty.
8 |
9 | ## Installation
10 |
11 | **PyGhidra (Python 3, since Ghidra 11.3)**
12 |
13 | Activate Ghidra python virtual environment, then `pip install ghidralib`.
14 |
15 | **Jython (Python 2, all Ghidra versions)**
16 |
17 | Just copy the [ghidralib.py](https://github.com/msm-code/ghidralib/blob/master/ghidralib.py) file to your ghidra_scripts directory.
18 | Later just `from ghidralib import *`.
19 |
20 | ## Usage
21 |
22 | Check out the [documentation](https://msm-code.github.io/ghidralib/) or official [examples](./examples/).
23 | A short demonstration of a basic ghidralib usage first:
24 |
25 | 1. Get all function instructions (similarly for basic blocks, low and high pcode, calls and xrefs):
26 |
27 | ```python
28 | print(Function("main").instructions)
29 | ```
30 |
31 |
32 | For comparison, plain Ghidra equivalent:
33 |
34 | ```python
35 | function_manager = currentProgram.getFunctionManager()
36 | symbol_table = currentProgram.getSymbolTable()
37 | main = list(symbol_table.getSymbols('main'))[0].getAddress()
38 | function = function_manager.getFunctionAt(main)
39 | instructions = currentProgram.getListing().getInstructions(function.getBody(), True)
40 | print(list(instructions))
41 | ```
42 |
43 |
44 | 2. You have a structure `uint8_t *data; uint32_t len;` at 0x10000 and you want to read it:
45 |
46 | ```python
47 | pos, len_bytes = get_u32(0x10000), get_u32(0x10000 + 4)
48 | print(get_bytes(pos, len_bytes))
49 | ```
50 |
51 |
52 | For comparison, plain Ghidra equivalent:
53 |
54 | ```python
55 | start_address = toAddr(0x10000)
56 | pos = currentProgram.getMemory().getInt(start_address)
57 | len_bytes = currentProgram.getMemory().getInt(start_address.add(4))
58 | data = getBytes(toAddr(pos), len_bytes)
59 | print(" ".join(chr(byte % 256) for byte in data)) # signed bytes <3
60 | ```
61 |
62 |
63 | 3. Find all calls to a string deobfuscation function and get call parameters:
64 |
65 | ```python
66 | for call in Function("MyCustomCrypto").calls:
67 | ctx = call.infer_context()
68 | key, data = ctx["eax"], ctx["edx"]
69 | datalen = get_u32(data - 4)
70 | print(call.address, decode(get_bytes(data, datalen)))
71 | ```
72 |
73 |
74 | For comparison, plain Ghidra equivalent:
75 |
76 | Just joking! Too long to fit in this README.
77 |
78 |
79 | You can also emulate a function call and read the result:
80 |
81 | ```python
82 | ctx = Function("GetFuncNameByHash").emulate(0x698766968)
83 | print(ctx.read_cstring(ctx["eax"]))
84 | ```
85 |
86 | 4. Tons more QoL features:
87 |
88 | ```python
89 | DataType("_NT_TIB") # Get a datatype by name
90 | DataType.from_c("typedef void* HINTERNET;") # Quickly parse structs and typedefs
91 |
92 | function = Function("main") # Work at various abstract levels
93 | print(function.instructions) # Get instructions...
94 | print(function.basicblocks) # ..basic blocks...
95 | print(function.pcode) # ...low pcode...
96 | print(function.high_pcode) # ...high pcode...
97 | print(function.decompile()) # ...or decompile a whole function
98 |
99 | for xref in Symbol("PTR_GetProcAddress").xrefs_to:
100 | Instruction(xref.from_address).highlight() # highlight symbol xrefs
101 | ```
102 |
103 | 5. There are also some flashy (but not necessarily useful) features that might
104 | grab your attention.
105 |
106 | Get the control flow graph of the main function, and display it:
107 |
108 | ```python
109 | Function("main").control_flow.show()
110 | ```
111 |
112 | 
113 |
114 | Find the shortest path from source to target in the program control flow graph.
115 | If it exists, highlight all basic blocks along the way.
116 |
117 | ```python
118 | source, target = BasicBlock("entry"), BasicBlock(0x00405073)
119 | path = Program.control_flow().bfs(source)
120 | while path.get(target):
121 | target.highlight()
122 | target = path[target]
123 | ```
124 |
125 | 
126 |
127 | 6. Thanks to type hints, scripting gets *much* easier if your IDE supports that.
128 |
129 | Finally, ghidralib doesn't lock you in - you can always retreat to familiar Ghidra types -
130 | just get them from the `.raw` property. For example `instruction.raw`
131 | is a Ghidra Instruction object, similarly `function.raw` is a Ghidra Function.
132 | So you can do the routine stuff in ghidralib, and fall back to Java if something
133 | is not implemented.
134 |
135 | ## Learn more
136 |
137 | **Check out the [documentation](https://msm-code.github.io/ghidralib/)**, especially the
138 | [getting started](https://msm-code.github.io/ghidralib/getting_started/) page.
139 |
140 | A more detailed tutorial on specific features is in development. Completed chapters:
141 |
142 | * [Emulator](https://msm-code.github.io/ghidralib/emulator/)
143 |
144 | If you prefer to learn by example, you can also browse the [examples](./examples/) directory.
145 |
146 | A fair warning: ghidralib is still actively developed and the API may change
147 | slightly in the future. But this doesn't matter for your one-off scripts, does it?
148 |
149 | ## Contributing
150 |
151 | PRs are welcome. Feel free to open PRs to add things you need.
152 |
153 | You can also just report issues and send feature requests. I started this library to
154 | cover my own needs, but I'm interested in learning what other people use it for.
155 |
156 | *Dragon icon at the top created by cube29, flaticon*
157 |
--------------------------------------------------------------------------------
/docs/bfs_highlight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msm-code/ghidralib/ec738e49259b741d774a0f11b524d2a0e134541f/docs/bfs_highlight.png
--------------------------------------------------------------------------------
/docs/compatibility.md:
--------------------------------------------------------------------------------
1 | # Compatibility
2 |
3 | This library uses unstable Ghidra APIs, so it's expected to break from time to time
4 | (when Ghidra changes internal implementation details).
5 |
6 | In this document I'll keep track of the compatibility status of the library. I don't plan
7 | to backport fixes or do complex hacks to support more than one Ghidra version at once.
8 | Instead, for each Ghidra version I'll try to provide a working ghidralib version.
9 |
10 | Keep in mind, that this library is still in rapid development, and the API may and will
11 | change before we reach the first stable release (v1.0).
12 |
13 | ### Compatibility matrix
14 |
15 | Here is a compatibility matrix of tested Ghidra and ghidralib versions:
16 |
17 | ghidralib \ ghidra |11.2.1|11.3|11.3.2|
18 | --------------------|------|----|------|
19 | 0.1.0 |✅ | | |
20 | 0.2.0 |✅ | ✅ | ✅ |
21 |
22 | (Compatibility is checked by running a [testsuite](https://github.com/msm-code/ghidralib/blob/master/tests/ghidralib_test.py)
23 | on a test binary)
24 |
25 | ### Architectures
26 |
27 | I work almost exclusively on x86 and x86_64, so the library is tested
28 | on these architectures. There is nothing specific to x86 in the code,
29 | but I expect that some exotic architectures will not work correctly.
30 | Feel free to submit issues/PRs if you find something is broken.
31 |
32 | ### Python 3
33 |
34 | The script is Python 3 compatible. Ghidralib builds are tested using PyGhidra. Right now the support is unstable.
35 |
36 | Known problems:
37 |
38 | * Ghidralib will **always** work on the program that you had open when you imported
39 | Ghidralib. Because of this, using ghidralib on multiple programs at once may cause chaos.
40 | To switch Ghidralib to another program, just `del sys.modules["ghidralib"]` and import it again.
41 | This issue is because of the differences in how Jython and PyGhidra work, and I believe it's
42 | unsolvable currently (upstream discussion: https://github.com/NationalSecurityAgency/ghidra/issues/8011)
43 |
44 | I don't personally use Ghidrathon, so I didn't test Ghidrathon compatibility.
45 |
--------------------------------------------------------------------------------
/docs/data.md:
--------------------------------------------------------------------------------
1 | # Working with global data
2 |
3 | Globally defined data is the second most important thing a reverse-engineer can find in a binary (the first
4 | most important is of course the code itself). That's why Ghidralib includes many helpful utilities
5 | to work with it. Most important Ghidralib wrappers used to work with global data are:
6 |
7 | * [Data](reference.md#ghidralib.Data) - represents a fragment of binary that is used to store a piece of data. Wraps `ghidra.program.model.listing.Data`.
8 | * [DataType](reference.md#ghidralib.DataType) - all data objects have an assigned type, that determines many things, including the way it's displayed, decompiled and more. Wraps `ghidra.program.model.data.DataType`.
9 |
10 |
11 | ### Defining data
12 |
13 | When one runs auto-analysis, large chunks of the program are automatically analysed and marked as code
14 | or data. But sometimes, during analysis, we discover a new piece of data that was not previously
15 | defined. We may want to automate adding it. Ghidra's FlatProgramAPI is pretty good here - we
16 | have a lot of functions like `createByte`, `createChar`, `createDouble`, `createDWord`, etc.
17 |
18 | But one very annoying problem with them is that they raise an exception when a data is already defined there.
19 | For example, given:
20 |
21 | ```asm
22 | 00457994 34 32 dw 3234h
23 | ```
24 |
25 | When we attempt to:
26 |
27 | ```python
28 | createByte(toAddr(0x0457994)) # remember that you need toAddr here
29 | ```
30 |
31 | We'll get a long exception about conflicting data types. With Ghidralib we can do it
32 | a bit more safely by leveraging `Program.create_data`:
33 |
34 | ```python
35 | data = Program.create_data(0x0457994, "byte")
36 | ```
37 |
38 | Or alternatively, using a DataType object:
39 |
40 | ```python
41 | data = DataType("byte").create_at(0x0457994)
42 | ```
43 |
44 | As usual, we can also access the existing defined data:
45 |
46 | ```python
47 | data = Data(0x0457994) # Get by address
48 | data = Data("DAT_00457994") # Get by name, if exists
49 | ```
50 |
51 | With a `Data` instance we can easily access a lot of information, but most importantly we can:
52 |
53 | * Access its address, size, raw bytes, etc.
54 |
55 | ```python
56 | >>> Data(0x0400078).address
57 | 4194424L
58 | >>> Data(0x0400078).length
59 | 248
60 | ```
61 |
62 | As a fun exercise, like with everything that occupies bytes in the binary address space, we
63 | can also highlight it in the listing:
64 |
65 | ```python
66 | Data(0x0400078).highlight()
67 | ```
68 |
69 | * Get its type with `data_type` or `base_data_type`.
70 |
71 | ```python
72 | >>> Data(0x0457994).data_type
73 | word
74 | ```
75 |
76 | * Introspect it, for example `is_pointer`, `is_constant`, `is_writable`, `is_array`, `is_structure`, etc.
77 |
78 | ```python
79 | >>> Data(0x0400078).is_pointer
80 | False
81 | >>> Data(0x0400078).is_writable
82 | False
83 | ```
84 |
85 | * For primitive types, cast it to a Python type (when it makes sense):
86 |
87 | ```python
88 | >>> Data(0x0457994).value
89 | 0x3234
90 | ```
91 |
92 | * For structures, access the nested fields with no boilerplate:
93 |
94 | ```python
95 | >>> Data(0x0400000).e_magic
96 | char[2] "MZ"
97 | >>> Data(0x0400000).e_magic.value
98 | 'MZ'
99 | >>> Data(0x400078).OptionalHeader.DataDirectory[1].Size
100 | ddw 8Ch
101 | ```
102 |
103 | ### Data types
104 |
105 | Every `Data` object has a type assigned. Types are represented by a
106 | [DataType](reference.md#ghidralib.DataType) object. It can be used to query information about how
107 | that data behaves.
108 |
109 | It's possible to get the type by name, or to enumerate all data types:
110 |
111 | ```python
112 | >>> len(DataType.all())
113 | 110528
114 | >>> DataType("IMAGE_OPTIONAL_HEADER32")
115 | /PE/IMAGE_OPTIONAL_HEADER32
116 | pack(disabled)
117 | Structure IMAGE_OPTIONAL_HEADER32 {
118 | 0 word 2 Magic ""
119 | 2 byte 1 MajorLinkerVersion ""
120 | 3 byte 1 MinorLinkerVersion ""
121 | ...
122 | ```
123 |
124 | Currently Ghidralib has limited support for data type introspection - it's
125 | possible to get the type name, size in bytes, and not much more. For more advanced operations,
126 | it may be necessary to use the raw Java object directly. For example:
127 |
128 | ```python
129 | >>> DataType("IMAGE_OPTIONAL_HEADER32").raw.getPathName()
130 | u'/PE/IMAGE_OPTIONAL_HEADER32'
131 | >>> DataType("IMAGE_OPTIONAL_HEADER32").raw.getDescription()
132 | u''
133 | >>> DataType("IMAGE_OPTIONAL_HEADER32").raw.getAlignment()
134 | 1
135 | ```
136 |
137 | As usual, in the future missing wrappers may be added.
138 |
139 | One interesting feature is C code parsing, for example:
140 |
141 | ```python
142 | >>> DataType.from_c('typedef void* HINTERNET;')
143 | HINTERNET
144 | >>> DataType.from_c("struct test { short a; short b; short c;};")
145 | pack()
146 | Structure test {
147 | 0 short 2 a ""
148 | 2 short 2 b ""
149 | 4 short 2 c ""
150 | }
151 | Length: 6 Alignment: 2
152 | ```
153 |
154 | Adding a data type programmatically is sometimes much easier than doing it manually in the structure editor.
155 |
156 |
--------------------------------------------------------------------------------
/docs/dragon1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msm-code/ghidralib/ec738e49259b741d774a0f11b524d2a0e134541f/docs/dragon1.png
--------------------------------------------------------------------------------
/docs/emulator.md:
--------------------------------------------------------------------------------
1 | # Emulator
2 |
3 | Ghidra features a powerful PCode emulator, which can be used to emulate whole
4 | functions or pieces of code.
5 |
6 | Ghidralib wraps this emulator with a class called `Emulator`. The basic usage is
7 | as follows:
8 |
9 | ```python
10 | emu = Emulator()
11 | emu.emulate(0x400000, 0x400010)
12 | print(emu["eax"])
13 | ```
14 |
15 | ### Basics
16 |
17 | Looks easy enough, now let's try it in practice. Create and compile the following C
18 | program:
19 |
20 | ```c
21 | #include <stdio.h>
22 | #include <stdlib.h>
23 |
24 | int hash(int value) {
25 | return (value * 10) + (value ^ 7);
26 | }
27 |
28 | void check(int value) {
29 | if (hash(value) == 189) {
30 | printf("Success!");
31 | }
32 | }
33 |
34 | int main(int argc, char *argv[]) {
35 | if (argc != 2) { return 1; }
36 | check(atoi(argv[1]));
37 | return 0;
38 | }
39 | ```
40 |
41 | Compile it with `gcc -O0 test.c -o test`. Make sure to disable optimisation with
42 | `-O0`, so the functions are not optimized out. Now load this program into Ghidra,
43 | open the interactive console, and check that everything is in order:
44 |
45 | ```python
46 | >>> from ghidralib import *
47 | >>> print(hex(Function("main").address))
48 | 0x401190
49 | >>> print(hex(Function("check").address))
50 | 0x401159
51 | >>> print(hex(Function("hash").address))
52 | 0x401136
53 | ```
54 |
55 | Now the main point, let's try the Emulator. We may want to emulate the `hash` function
56 | to check how it behaves. For emulation we need a start and end address. You can copy
57 | it from the listing view, but for the demonstration I will print that with python too:
58 |
59 | ```python
60 | >>> hash_function = Function("hash")
61 | >>> for instr in hash_function.instructions:
62 | ... print("0x{:x} {}".format(instr.address, instr))
63 | ...
64 | 0x401136 PUSH RBP
65 | 0x401137 MOV RBP,RSP
66 | 0x40113a MOV dword ptr [RBP + -0x4],EDI
67 | 0x40113d MOV EDX,dword ptr [RBP + -0x4]
68 | 0x401140 MOV EAX,EDX
69 | 0x401142 SHL EAX,0x2
70 | 0x401145 ADD EAX,EDX
71 | 0x401147 ADD EAX,EAX
72 | 0x401149 MOV EDX,EAX
73 | 0x40114b MOV EAX,dword ptr [RBP + -0x4]
74 | 0x40114e XOR EAX,0x7
75 | 0x401151 ADD EAX,EDX
76 | 0x401153 POP RBP
77 | 0x401154 XOR EDX,EDX
78 | 0x401156 XOR EDI,EDI
79 | 0x401158 RET
80 | ```
81 |
82 | **Note**: we assume Linux x64 ABI everywhere. If you're on Windows or another
83 | architecture, you'll need to adjust the code (especially register names).
84 |
85 | So we want to emulate between 0x401136 and 0x401158, and the parameter is in EDI.
86 |
87 | ```python
88 | >>> emu = Emulator()
89 | >>> emu["RDI"] = 10 # We assume Linux x64 ABI
90 | >>> emu.emulate(0x401136, 0x401158)
91 | >>> print(emu["RAX"])
92 | 113
93 | ```
94 |
95 | Great, we successfully emulated our first function. Instead of using hardcoded
96 | addresses, it's usually easier to use object attributes. This is equivalent:
97 |
98 | ```python
99 | >>> emu = Emulator()
100 | >>> emu["RDI"] = 10
101 | >>> emu.emulate(hash_function.address, hash_function.exitpoints)
102 | >>> print(emu["RAX"])
103 | 113
104 | ```
105 |
106 | Function.exitpoints is a list that contains all function exit points - perfect
107 | for our use-case here.
108 |
109 | By the way, instead of indexing like `emu["EAX"]` you can use `emu.read_register`
110 | and `emu.write_register`. Consider using the more verbose format when writing
111 | reusable scripts, but `emu["EAX"]` is faster to type when working interactively.
112 |
113 | ### Hooks
114 |
115 | Often we are interested in the details of the execution, and we want to
116 | process every instruction in some way. We can easily do this using the `callback` parameter:
117 |
118 | ```python
119 | >>> emu = Emulator()
120 | >>> def print_callback(emu):
121 | ... instr = Instruction(emu.pc)
122 | ... print("executing 0x{:x} {}".format(emu.pc, instr))
123 |
124 | >>> emu.emulate(hash_function.address, hash_function.exitpoints, callback=print_callback)
125 | executing 0x401136 PUSH RBP
126 | executing 0x401137 MOV RBP,RSP
127 | executing 0x40113a MOV dword ptr [RBP + -0x4],EDI
128 | executing 0x40113d MOV EDX,dword ptr [RBP + -0x4]
129 | executing 0x401140 MOV EAX,EDX
130 | executing 0x401142 SHL EAX,0x2
131 | executing 0x401145 ADD EAX,EDX
132 | executing 0x401147 ADD EAX,EAX
133 | executing 0x401149 MOV EDX,EAX
134 | executing 0x40114b MOV EAX,dword ptr [RBP + -0x4]
135 | executing 0x40114e XOR EAX,0x7
136 | executing 0x401151 ADD EAX,EDX
137 | executing 0x401153 POP RBP
138 | executing 0x401154 XOR EDX,EDX
139 | executing 0x401156 XOR EDI,EDI
140 | ```
141 |
142 | You can change the emulator context in the hook, and you can control the execution
143 | using the callback return value. In particular, you can return:
144 |
145 | * `continue` to continue execution normally (this is the default)
146 | * `break` to stop execution immediately
147 | * `continue_then_break` to stop execution after executing the current instruction
148 | * `skip` to skip the current instruction and execute the one immediately after it
149 | * `retry` means that emulator should try to execute the same instruction again.
150 | This is only useful if you changed PC in the callback and want to reevaluate it.
151 |
152 | So for example, instead of providing the return address directly you can
153 | execute until the `ret` instruction:
154 |
155 | ```python
156 | >>> emu = Emulator()
157 | >>> def execute_until_ret(emu):
158 | ... instr = Instruction(emu.pc)
159 | ... if instr.mnemonic == "RET":
160 | ... return "break"
161 | ... return "continue"
162 |
163 | >>> emu["RDI"] = 10
164 | >>> emu.emulate(hash_function.address, callback=execute_until_ret)
165 | >>> print(emu["RAX"])
166 | 113
167 | ```
168 |
169 | By the way, as a reminder, you can use a symbol name almost everywhere instead of
170 | an address. For example, `hash_function.address` is equivalent to "hash" and you
171 | can as well do
172 |
173 | ```python
174 | >>> emu.emulate("hash", callback=execute_until_ret)
175 | ```
176 |
177 | This is probably not a good idea in serious scripts, but it's a nice trick for
178 | quick hacks.
179 |
180 | **Exercise**: Use your knowledge of the emulator to find a value that will make the
181 | `hash` function return 189. Hint: emulate `hash` in a for loop and check the return value.
182 |
183 | ### Hooks and external functions
184 |
185 | Another thing we can do with hooks is dealing with `calls`. For example, `check`
186 | function looks like this:
187 |
188 | ```python
189 | >>> for instr in Function("check").instructions:
190 | ... print("0x{:x} {}".format(instr.address, instr))
191 | ...
192 | 0x401159 PUSH RBP
193 | 0x40115a MOV RBP,RSP
194 | 0x40115d SUB RSP,0x10
195 | 0x401161 MOV dword ptr [RBP + -0x4],EDI
196 | 0x401164 MOV EAX,dword ptr [RBP + -0x4]
197 | 0x401167 MOV EDI,EAX
198 | 0x401169 CALL 0x00401136
199 | 0x40116e CMP EAX,0xbd
200 | 0x401173 JNZ 0x00401189
201 | 0x401175 LEA RAX,[0x402004]
202 | 0x40117c MOV RDI,RAX
203 | 0x40117f MOV EAX,0x0
204 | 0x401184 CALL 0x00401030
205 | 0x401189 NOP
206 | 0x40118a LEAVE
207 | 0x40118b XOR EAX,EAX
208 | 0x40118d XOR EDI,EDI
209 | 0x40118f RET
210 | ```
211 |
212 | The first call is to `hash` function, but the second one is to `printf`. We can't easily
213 | emulate this function, because it's outside of the current program. To avoid crashing
214 | the emulation, we can use the hook to skip the calls:
215 |
216 | ```python
217 | >>> check_function = Function("check")
218 | >>> def skip_calls(emu):
219 | ... instr = Instruction(emu.pc)
220 | ... if instr.mnemonic == "CALL":
221 | ... return "skip"
222 | ... return "continue"
223 | ```
224 |
225 | But let's do something else: let's emulate `check` function until the `CALL hash` instruction,
226 | but skip the call and just return `189` directly. Then emulate until the `CALL printf` instruction,
227 | and print the parameter passed to `printf`. The callback gets more complicated now:
228 |
229 | ```python
230 | >>> def emulate_check(emu):
231 | ... instr = Instruction(emu.pc)
232 | ... if instr.address == 0x401169:
233 | ... emu["RAX"] = 189
234 | ... return "skip"
235 | ... if instr.address == 0x401184:
236 | ... string_addr = emu["RDI"] # cstring parameter is in RDI (linux ABI)
237 | ... print(emu.read_cstring(string_addr))
238 | ... return "break"
239 |
240 | >>> check_function = Function("check")
241 | >>> emu = Emulator()
242 | >>> emu.emulate(check_function.address, callback=emulate_check)
243 | Success!
244 | ```
245 |
246 | We successfully "tricked" the check function into executing the "success"
247 | branch and trying to print the `Success!` string.
248 |
249 | But this code is not very nice. We can use emulator hooks to make it clearer.
250 | Hooks are pieces of code that can be automatically executed at certain points
251 | during emulation. You can register a hook for a specific address:
252 |
253 | ```python
254 | def printf_hook(emu):
255 | arg = emu.read_cstring(emu["RDI"])
256 | print("printf called with '{}'".format(arg))
257 | return "break"
258 |
259 | def hash_hook(emu):
260 | # Note that we are already in the called function, so we need to jump
261 | # to return address manually, and update the stack pointer appropriately.
262 | emu["RAX"] = 189
263 | emu.pc = emu.read_u64(emu.sp)
264 | emu.sp += 8
265 |
266 | emu = Emulator()
267 | emu.add_hook("printf", printf_hook)
268 | emu.add_hook("hash", hash_hook)
269 | emu.emulate("check")
270 | ```
271 |
272 | Note that we again (ab)use automatic symbol resolution here. The last three lines
273 | are equivalent to:
274 |
275 | ```python
276 | emu.add_hook(Symbol("printf").address, printf_hook)
277 | emu.add_hook(Symbol("hash").address, hash_hook)
278 | emu.emulate(Symbol("check").address)
279 | ```
280 |
281 | **Exercise**: Create a hook for `atoi` function that will simulate the libc function -
282 | it should parse the string from the parameter and return it in RAX. Test it by emulating
283 | the "call atoi" instruction with a string parameter.
284 |
285 | ### State inspection
286 |
287 | Of course, after emulation we are interested in the final state of the emulator.
288 | We already showcased reading and writing registers, and we used `read_cstring` function
289 | in the hook. There are also other useful functions:
290 |
291 | * `emu.read_register(reg)` and `emu.write_register(reg, val)` - read or write a register
292 | * `emu[reg]` and `emu[reg] = val` - read or write a register, short version
293 | * `emu.read_u64(addr)` and `emu.write_u64(addr, val)` - read or write a 64-bit value at a given address
294 | * `emu.read_u32(addr)` and `emu.write_u32(addr, val)` - read or write a 32-bit value at a given address
295 | * `emu.read_u16(addr)` and `emu.write_u16(addr, val)` - read or write a 16-bit value at a given address
296 | * `emu.read_u8(addr)` and `emu.write_u8(addr, val)` - read or write an 8-bit value at a given address
297 | * `emu.read_bytes(addr, size)` - read `size` bytes from `addr`
298 | * `emu.write_bytes(addr, bytes)` - write the given `bytes` to `addr`
299 | * `emu.read_cstring(addr)` - read bytes starting from `addr` until a null byte is found.
300 | * `emu.read_unicode(addr)` - read 16bit chars starting from `addr` until a null character is found.
301 | * `emu.read_varnode` and `emu.write_varnode` - read or write a varnode
302 |
303 | They should all be self-explanatory, except the last one. Varnodes are a Ghidra term for an
304 | almost arbitrary value. In particular, Function signature contains information about how variables and
305 | parameters map to varnodes:
306 |
307 | ```python
308 | >>> Function("hash").return_variable
309 | [int @EAX:4]
310 | >>> Function("hash").return_variable.varnode
311 | (register, 0x0, 4)
312 | >>> Function("hash").parameters
313 | [[uint param_1@EDI:4]]
314 | >>> Function("hash").parameters[0].varnode
315 | (register, 0x38, 4)
316 | ```
317 |
318 | You can use `read_varnode` and `write_varnode` to manipulate these values in a pretty generic way.
319 | For example, this is leveraged by `Function.emulate` to emulate functions in a very generic way:
320 |
321 | ```python
322 | >>> Function("hash").emulate_simple(10)
323 | 113
324 | ```
325 |
326 | It doesn't get any easier than that. The `simple` in the name refers to the return value -
327 | in many cases you will want to use `Function.emulate` to get the whole context of the
328 | emulator after execution.
329 |
330 | ```python
331 | >>> emu = Function("hash").emulate(10)
332 | >>> emu["RAX"]
333 | 113
334 | ```
335 |
336 | **Exercise**: Complete the `atoi` hook from the previous exercise first. Then create an emulator,
337 | add `printf` and `atoi` hooks, and execute a `main` function with the correct parameters.
338 | This will require you to pass correct `argc` and `argv` parameters.
339 |
340 | ### Misc features
341 |
342 | **maxsteps**
343 |
344 | When you emulate a function, you may want to limit the number of steps it can take:
345 |
346 | ```python
347 | >>> emu = Emulator()
348 | >>> def callback(emu):
349 | ... print("executing {:x}".format(emu.pc))
350 | >>> emu.trace(Function("main").entrypoint, callback=callback, maxsteps=3)
351 | SUB ESP,0x2d4
352 | PUSH EBX
353 | PUSH EBP
354 | ```
355 |
356 | Especially if you're emulating random pieces of code, setting maxsteps
357 | to something reasonable (like 2000 instructions) may save you from accidentally
358 | executing an infinite loop.
359 |
360 | **Breakpoints**
361 |
362 | You can set and remove breakpoints using `add_breakpoint` and `clear_breakpoint`
363 | methods.
364 |
365 | **emulate_fast**
366 |
367 | Ghidra emulator is not very fast, but ghidralib `emulate` is even slower - because
368 | we support callbacks, we need to go back and forth between Python and Java.
369 |
370 | To make things faster, you can use the `emulate_fast` function. It keeps the
371 | main loop of the emulation in Java, which may matter in some cases.
372 | The downside is that it doesn't support callbacks or instruction counting -
373 | you can only emulate until a specific address. As an upside, function hooks
374 | are supported.
375 |
376 | **Emulation shorthands**
377 |
378 | To save precious keystrokes you may combine creating an emulator, running it,
379 | and inspecting the result into one step with:
380 |
381 | ```python
382 | >>> Emulator.new("main", maxsteps=100)["EAX"]
383 | 128
384 | ```
385 |
386 | This convenience wrapper is equivalent to the following code:
387 |
388 | ```python
389 | >>> emu = Emulator()
390 | >>> emu.emulate("main", maxsteps=100)
391 | >>> emu["EAX"]
392 | 128
393 | ```
394 |
395 | Some other objects also provide helpers to do the obvious thing with emulator.
396 | For example, you can emulate a function call with:
397 |
398 | ```python
399 | >>> emu = Function("test").emulate(10)
400 | >>> emu["EAX"]
401 | 113
402 | >>> # Or an even shorter version
403 | >>> Function("test").emulate_simple(10)
404 | 113
405 | ```
406 |
407 | **Unicorn compatibility**
408 |
409 | There is a very, very thin compatibility layer with Unicorn. There are aliases
410 | provided for the following Unicorn methods: `reg_write`, `reg_read`, `mem_write`,
411 | `mem_read`, `mem_map`, `emu_start`. Why? The idea is that many people already
412 | know Unicorn. It may make it a tiny bit easier for them if they can use familiar
413 | method names instead of learning a completely new set.
414 |
415 | The goal is not to provide an actual compatibility layer - Unicorn is a very different
416 | library and `ghidralib` won't replace it. The only goal is really so Unicorn users
417 | can use familiar names if they forget ghidralib equivalents. If you are not
418 | a Unicorn user, don't use them.
419 |
420 | ### Learn more
421 |
422 | Check out relevant examples in the `examples` directory, especially:
423 |
424 | * [EmulatorHooks.py](https://github.com/msm-code/ghidralib/blob/master/examples/EmulatorHooks.py)
425 | * [ContextRecovery.py](https://github.com/msm-code/ghidralib/blob/master/examples/ContextRecovery.py)
426 | * [LummaPatternBasedDeobfuscation.py](https://github.com/msm-code/ghidralib/blob/master/examples/LummaPatternBasedDeobfuscation.py)
427 |
--------------------------------------------------------------------------------
/docs/getting_started.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 |
3 | This document contains an introduction to the most important objects
4 | wrapped by this library, and a few more motivational segments.
5 |
6 | If you prefer to learn by example, check out the [examples directory](https://github.com/msm-code/ghidralib/tree/master/examples),
7 | or the [tests](https://github.com/msm-code/ghidralib/blob/master/tests/ghidralib_test.py).
8 | For a more complete reference, see the [API documentation](reference.md).
9 |
10 | * [Main Actors](#main-actors) - description of the most important
11 | ghidralib objects
12 | * [Working at various abstraction levels](#working-at-various-abstraction-levels)
13 | description of the various abstraction levels wrapped (and made easy!)
14 | by ghidralib.
15 | * [Conventions](#conventions) this library follows some design rules.
16 | They are hopefully intuitive, but understanding them may make your
17 | first steps easier.
18 | * [IDE configuration](#ide-configuration) I strongly recommend using
19 | an IDE that supports type-checking.
20 |
21 | ## Installation
22 |
23 | **PyGhidra (Python 3, since Ghidra 11.3)**
24 |
25 | Activate Ghidra python virtual environment, then run `pip install ghidralib`.
26 |
27 | Activating the virtual environment is system specific. On my machine I have to run `source ~/.config/ghidra/ghidra_11.3.1_PUBLIC/venv/bin/activate`.
28 |
29 | **Jython (Python 2, all Ghidra versions)**
30 |
31 | Just **drop [this file](https://github.com/msm-code/ghidralib/blob/master/ghidralib.py) into your ghidra_scripts directory**.
32 | Click [here](https://raw.githubusercontent.com/msm-code/ghidralib/refs/heads/master/ghidralib.py)
33 | for a direct download link.
34 |
35 | ## Main actors
36 |
37 | A lot of objects are wrapped by this library. The most important at the beginning are:
38 |
39 | * [Function](#function) - a function recognised by Ghidra
40 | * [Instruction](#instruction) - assembly instruction
41 | * [DataType](#datatype) - a configured data type
42 | * [Symbol](#symbol) - a named address (also called a label)
43 |
44 | ### Function
45 |
46 | A function is a named block of code. Not all code in Ghidra belongs to a function,
47 | but being in a function makes it easier to reason about - for example,
48 | we can talk about parameters, variables, return values, etc.
49 |
50 | Check these usage examples:
51 |
52 | ```python
53 | from ghidralib import *
54 | # Get a function at address 0x8ca3f0
55 | Function(0x8ca3f0)
56 |
57 | # Get a function named "main"
58 | Function("main")
59 |
60 | # Print all assembly instructions in main function
61 | for instr in Function("main").instructions:
62 | print(instr)
63 |
64 | # Print all pcode instructions in main function
65 | for instr in Function("main").pcode:
66 | print(instr)
67 |
68 | # Print all high-level pcode instructions in main function
69 | # Or you can do it in 100 lines of Java:
70 | # https://github.com/evm-sec/high-pcode/blob/main/HighPCode.java
71 | for instr in Function("main").high_pcode:
72 | print(instr)
73 |
74 | # Print all basic blocks in main function
75 | for block in Function("main").basicblocks:
76 | print(block)
77 |
78 | # Print high variables in main function
79 | # These are the variables as seen by decompiler - the ones
80 | # that one thinks about when reversing
81 | print(Function("main").high_variables)
82 |
83 | # Get the control flow graph of the main function...
84 | # ...and show it! (you can also do something more useful with it)
85 | Function("main").control_flow.show()
86 |
87 | # Decompile the main function and print the C code.
88 | print(Function("main").decompile())
89 |
90 | # Define a function at address 0x400300
91 | Function.create(0x400300, "main")
92 |
93 | # Use symbolic execution to infer values of eax and edx at each call site
94 | for call in Function("MyCustomCrypto").calls:
95 | ctx = call.infer_context()
96 | key, data = ctx["eax"], ctx["edx"]
97 | print(key, data)
98 |
99 | # Infer parameters for each call to this function (using the decompiler)
100 | for call in Function("MyCustomCrypto").calls:
101 | key, data = call.get_args()
102 | print(key, data)
103 |
104 | # Rename functions calling this function
105 | for caller in Function("MyCustomCrypto").callers:
106 | # Use caller address as a suffix. It's often useful to combine this with
107 | # emulation, so you can put more context in the name.
108 | caller.rename("CallsCustomCrypto_{}".format(caller.address))
109 |
110 | # Emulate a function call and pass parameters (using the function signature)
111 | ctx = Function("GetFuncNameByHash").emulate(0x698766968)
112 | print(ctx.read_cstring(ctx["eax"]))
113 |
114 | # Use Ghidra's SymbolicPropagator to get known register values
115 | ctx = Function(0x401000).symbolic_context()
116 | print(ctx.register_at(0x401020, "eax"))
117 | ```
118 |
119 | Read more in the [`Function` object documentation](reference.md#ghidralib.Function).
120 |
121 | ### Instruction
122 |
123 | Instructions represent a single assembly operation.
124 | They have a mnemonic (e.g. `mov`), operands (e.g. `eax, 3`),
125 | and a pcode representation used for further analysis.
126 |
127 | Check these usage examples:
128 |
129 | ```python
130 | # Get an instruction at address 0x8ca3f0
131 | Instruction(0x8ca3f0)
132 |
133 | # Get the first instruction in main function
134 | Instruction("main")
135 |
136 | # Print the instruction mnemonic and operands
137 | instr = Instruction(0x8ca3f0)
138 | print(instr.mnemonic, instr.operands)
139 |
140 | # Print the instruction pcode:
141 | for op in instr.pcode:
142 | print(op)
143 |
144 | # Print the instruction high-level pcode:
145 | for op in instr.high_pcode:
146 | print(op)
147 | ```
148 |
149 | Read more in the [`Instruction` object documentation](reference.md#ghidralib.Instruction).
150 |
151 | ### DataType
152 |
153 | Data types are used to describe the structure of data in memory.
154 |
155 |
156 | Check these usage examples:
157 |
158 | ```python
159 | # Get a datatype called "int"
160 | DataType("int")
161 |
162 | # Parse a datatype from C string
163 | HINTERNET = DataType.from_c('typedef void* HINTERNET;')
164 |
165 | # Change a datatype at location
166 | create_data(0x1234, HINTERNET)
167 |
168 | # You can also create structures from C code strings:
169 | foo = DataType.from_c('struct foo { int a; int b; };')
170 | ```
171 |
172 | Read more in the [`DataType` object documentation](reference.md#ghidralib.DataType).
173 |
174 | ### Symbol
175 |
176 | Sometimes called a label. Check these usage examples:
177 |
178 | ```python
179 | # Get a symbol (label) at address 0x8ca3f0
180 | Symbol(0x8ca3f0)
181 |
182 | # Get a symbol (label) named "main"
183 | Symbol("main")
184 |
185 | # Create a label "foo" at address 0x1234
186 | Symbol.create(0x1234, "foo")
187 |
188 | # Change the symbol's data type
189 | Symbol("DAT_1234").set_type(HINTERNET)
190 |
191 | # Print all symbols in the program
192 | for symbol in Symbol.all():
193 | print(symbol)
194 |
195 | # Rename all unknown data to something funniner
196 | for symbol in Symbol.all():
197 | if symbol.name.startswith("DAT_"):
198 | symbol.rename("funniner_" + symbol.name)
199 | ```
200 |
201 | Read more in the [`Symbol` object documentation](reference.md#ghidralib.Symbol).
202 |
203 | ## Working at various abstraction levels
204 |
205 | In this section I'll briefly summarize ghidralib objects that you can use to
206 | work at various abstraction levels.
207 |
208 | * **Assembly instructions** - at the lowest level, there is assembler.
209 | You will use familiar [Instruction](reference.md#ghidralib.Instruction),
210 | [BasicBlock](reference.md#ghidralib.BasicBlock) and [Function](reference.md#ghidralib.Function).
211 | When analysing data, you will think in terms of [Register](reference.md#ghidralib.Register)s
212 | or [Variables](reference.md#ghidralib.Variable), and references are in
213 | terms of [Symbols](reference.md#ghidralib.Symbol).
214 |
215 | * **Pcode instructions** - here you think in terms of [PcodeOp](reference.md#ghidralib.PcodeOp)s,
216 | and [PcodeBlocks](reference.md#ghidralib.PcodeBlock). You still work with
217 | [Functions](reference.md#ghidralib.Function), but the data flows between
218 | architecture-independent [Varnodes](reference.md#ghidralib.Varnode) now instead.
219 |
220 | * **High Pcode instructions** - after the decompilation, many things change.
221 | You still work with [PcodeOps](reference.md#ghidralib.PcodeOp), but they are
222 | significantly transformed - referred to as "High Pcode" in this library.
223 | You now think in terms of [High Functions](reference.md#ghidralib.HighFunction),
224 | [High Variables](reference.md#ghidralib.HighVariable), and
225 | [High Symbols](reference.md#ghidralib.HighSymbol).
226 | Even [Varnodes](reference.md#ghidralib.Varnode) are now slightly more powerful
227 | (under the hood they are `VarnodeASTs` now).
228 |
229 | * **Pcode syntax tree** (`Function.pcode_tree`) -
230 | As far as I know, there was no easy way to work with it. I hope ghidralib makes
231 | this much easier. On
232 | this level, you still have high [PcodeOps](reference.md#ghidralib.PcodeOp), but
233 | syntactic elements like "dowhile" loops, "if" statements etc, are now recovered
234 | and you can traverse the syntax tree (while still dealing with
235 | [PcodeOps](reference.md#ghidralib.PcodeOp)).
236 |
237 | * C abstract syntax tree (AST) - built internally by the decompiler, but not exported
238 | by Ghidra. I hope to add support for it in ghidralib one way or another,
239 | but for now there is no way to access it.
240 |
241 | * **Clang tokens** (`Function.tokens`) - a stream of tokens that represent the C code.
242 | It is very detailed, to the level that it contains even whitespace.
243 | You can clean them up, but the data is still overprocessed a bit too much,
244 | and not useful (IMO) during analysis. Ghidra uses it for display.
245 |
246 | ## Showcase
247 |
248 | In this section I'll present a few examples of impressive-but-not-necessarily-useful
249 | things you can do with ghidralib.
250 |
251 | ### Emulation
252 |
253 | Emulate the program from 0x400300 to 0x400400. When finished, read the value of `eax`
254 | and the memory at 0x401000.
255 |
256 | ```python
257 | # Create a new emulator and execute code between 0x400300 and 0x400400
258 | emu = Emulator()
259 | emu.emulate(0x400300, 0x400400)
260 | print(emu["eax"])
261 | print(emu.read_bytes(0x401000, 16))
262 |
263 | # Emulate a function call and pass parameters (using the function signature)
264 | ctx = Function("GetFuncNameByHash").emulate(0x698766968)
265 | print(ctx.read_cstring(ctx["eax"]))
266 | ```
267 |
268 | In practice you can often use this for recovering obfuscated strings, or unpacking
269 | simple packers.
270 |
271 | ### Graphs
272 |
273 | ```python
274 | # Get the control flow graph of the main function (and display it)
275 | Function("main").control_flow.show()
276 | ```
277 |
278 | Graph visualisation is not the most useful feature of this library, but it
279 | looks cool:
280 |
281 | 
282 |
283 | And you can easily build the graph yourself:
284 |
285 | ```python
286 | g = Graph.create()
287 | foo = g.vertex("foo")
288 | bar = g.vertex("bar")
289 | g.edge(foo, bar)
290 | g.show()
291 | ```
292 |
293 | Or you can do some actually useful things with included graph algorithms
294 | (DFS, BFS, and topological sort) - like tracing paths between functions.
295 |
296 | ### Path finding
297 |
298 | Find the shortest path from source to target in the program control flow graph.
299 | If it exists, highlight all basic blocks along the way.
300 |
301 | ```python
302 | source, target = BasicBlock("entry"), BasicBlock(0x00405073)
303 | path = Program.control_flow().bfs(source)
304 | while path.get(target):
305 | target.highlight()
306 | target = path[target]
307 | ```
308 |
309 | 
310 |
311 | You can highlight anything that has an address (like a basic block, function,
312 | or a single instruction). Call .unhighlight() to clear the highlight.
313 |
314 | ## Conventions
315 |
316 | There are a few conventions that this library follows, and which may be useful
317 | when learning:
318 |
319 | * This library completely ignores the Ghidra "Address" abstraction. Plain integers
320 | are used everywhere instead. Address abstraction is very powerful, but not
321 | necessary for most use cases (at least my use cases).
322 |
323 | If this is a problem for you, please let me know - maybe there is a simple way
324 | to make ghidralib work for you.
325 |
326 | * Every object that wraps a Ghidra object has a `.raw` property that can be used
327 | to get the unwrapped object. So you can always "escape" ghidralib:
328 |
329 | ```python
330 | Function("main").raw.UNKNOWN_STACK_DEPTH_CHANGE
331 | 2147483647
332 | ```
333 |
334 | * Objects that have an address can be addressed in many different ways - by name,
335 | by address, or by Ghidra address object. All of these are equivalent:
336 |
337 | ```python
338 | Function("main")
339 | Function(0x669d1e)
340 | Function(toAddr(0x669d1e))
341 | ```
342 |
343 | * Additionally, wrappers are "tolerant" and try to drop unnecessary layers.
344 | All of these are resolved to the same object:
345 |
346 | ```python
347 | Instruction(getInstructionAt(toAddr(0x0669d2a))) # from raw object
348 | Instruction(0x669d2a) # from integer
349 | Instruction(Instruction(0x669d2a)) # wrapped two times
350 | ```
351 |
352 | * Same goes in the other direction btw - Java API will accept wrappers
353 |
354 | ```python
355 | getInstructionBefore(getInstructionAt(toAddr(0x0669d2a))) # pure java
356 | getInstructionBefore(Instruction(0x0669d2a)) # mixup library object
357 | ```
358 |
359 | * Many objects expose static constructor methods, where it makes sense.
360 | Possible methods are "get", "create", and "all". So for example
361 | instead of `getAllSymbols()` use `Symbol.all()`.
362 |
363 | * The difference between `Function.get(addr)` and `Function(addr)` is that
364 | `Function.get` returns `None` instead of raising an exception when
365 | the desired object was not found.
366 |
367 | ## IDE Configuration
368 |
369 | I strongly recommend using an IDE that supports type-checking. This is why:
370 |
371 | 
372 |
373 | I personally use VS Code with Python extensions. If you install
374 | VsCode/VsCodium, a Python extension, and just drop ghidralib.py
375 | in the ghidra_scripts directory, then everything should "just work".
376 |
377 | If ghidralib is not installed via pip, and your script lives in a
378 | different directory than ghidralib, override the PYTHONPATH so
379 | typechecker knows how to import it:
380 |
381 | ```json
382 | {
383 | "python.analysis.extraPaths": ["/home/you/Projects/ghidralib"],
384 | "terminal.integrated.env.windows": {
385 | "PYTHONPATH": "/home/you/Projects/ghidralib",
386 | }
387 | }
388 | ```
389 |
--------------------------------------------------------------------------------
/docs/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msm-code/ghidralib/ec738e49259b741d774a0f11b524d2a0e134541f/docs/graph.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Ghidralib
2 |
3 | ## Welcome to ghidralib documentation!
4 |
5 | 
6 |
7 | This library is an attempt to provide an unofficial **Pythonic standard library** for Ghidra.
8 |
9 | The Ghidra scripting API, while extremely powerful, is not well suited for writing
10 | quick one-off scripts when reverse-engineering something at 3AM. Scripts usually
11 | end up verbose, fragile (since there's no type-checking) and with camelCaseEverywhere.
12 |
13 | The goal of this library is to make Ghidra scripting faster, easier and... more fun.
14 |
15 | ```python
16 | from ghidralib import *
17 | for block in Function("main").basicblocks:
18 | for instr in block.instructions:
19 | for pcode in instr.pcode:
20 | args = ", ".join(map(str, pcode.inputs_simple))
21 | print("{:x} {} {}".format(pcode.address, pcode.mnemonic, args))
22 | ```
23 |
24 | ## Basic Usage
25 |
26 | To use ghidralib, **just drop [this file](https://github.com/msm-code/ghidralib/blob/master/ghidralib.py) into your ghidra_scripts directory**.
27 | Click [here](https://raw.githubusercontent.com/msm-code/ghidralib/refs/heads/master/ghidralib.py)
28 | for a direct download link.
29 |
30 | A tutorial describing specific features is in development. Finished chapters include:
31 |
32 | * [Getting Started](./getting_started.md) - a brief description of useful API functions (recommended).
33 | * [Emulator](./emulator.md) - detailed guide to using the emulator wrapper.
34 | * [Working with global data](./data.md) - basic information about working with global data.
35 |
36 | If you prefer to **learn by example**, you can browse the [examples](https://github.com/msm-code/ghidralib/tree/master/examples).
37 |
38 | You can also read the **autogenerated API documentation** [here](./reference.md).
39 |
40 | When in doubt, check out the source code at [GitHub](https://github.com/msm-code/ghidralib).
41 |
42 | A fair warning: ghidralib is still actively developed and the API may change
43 | slightly in the future. But this doesn't matter for your one-off scripts, does it?
44 | Current compatibility status is documented [here](./compatibility.md).
45 |
--------------------------------------------------------------------------------
/docs/reference.md:
--------------------------------------------------------------------------------
1 | # API reference
2 |
3 | ::: ghidralib
4 | handler: python
5 | options:
6 | show_if_no_docstring: true
7 | show_root_heading: true
8 | show_source: true
9 |
--------------------------------------------------------------------------------
/docs/typecheck.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msm-code/ghidralib/ec738e49259b741d774a0f11b524d2a0e134541f/docs/typecheck.png
--------------------------------------------------------------------------------
/examples/AsmLevelDataFlow.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # Perform a simple data flow analysis on x86 instructions. The assembly code looks like this:
4 | #
5 | # 004d47f0 8d 0d f2 c2 5f 00 LEA ECX,[gos_AdjustTokenPrivileges_5fc2f2] = "AdjustTokenPrivileges"
6 | # 004d47f6 89 08 MOV dword ptr [EAX],ECX=>gos_AdjustTokenPrivileges_5fc2f2 = "AdjustTokenPrivileges"
7 | # 004d47f8 8b 0d 70 a8 7d 00 MOV ECX,dword ptr [DAT_007da870]
8 | # 004d47fe 85 c9 TEST ECX,ECX
9 | # 004d4800 75 08 JNZ LAB_004d480a
10 | # 004d4802 89 05 30 32 7b 00 MOV dword ptr [DAT_5fc2f2],EAX
11 | #
12 | # And this snippet repeats dozens (or hundreds) of times in a single function.
13 | # This script looks for the "LEA ECX, [source]" instructions followed later
14 | # by "MOV [target], EAX" and renames the [target] to "ptr_{Symbol(source).name}".
15 | # So in this case, this will rename DAT_5fc2f2 to "ptr_gos_AdjustTokenPrivileges_5fc2f2".
16 | #
17 | # This is not the best kind of analysis (a proper symbolic execution would be better),
18 | # but it's good for quick scripts that only need to work once, because it's easy to
19 | # understand and write.
20 | #
21 | # Tested on 3505cd623ee88e3d396789bbe93ebce9834a72f73c9f335fb490924a71a3b21b
22 |
23 | from ghidralib import *
24 |
25 | f = Function(0x004d47f0)
26 |
27 | last_pointer = 0
28 | for op in f.instructions:
29 | if op.mnemonic == "LEA" and op.operand(0).value == "ECX":
30 | # Operands are ["ECX", 0xAAAAAAAA], where 0xAAAAAAAA is a pointer to string
31 | last_pointer = op.operand(1).scalar
32 | elif op.mnemonic == "MOV" and isinstance(op.operand(0), (int, long)):
33 | # Operands are [0xBBBBBBBB, "EAX"], where 0xBBBBBBBB is the target variable
34 | target = op.operand(0).scalar
35 | ptrname = Symbol(last_pointer).name
36 | print("detected move of {} to {:x}".format(ptrname, target))
37 | Symbol.create(target, "ptr_{}".format(ptrname))
38 |
--------------------------------------------------------------------------------
/examples/ContextRecovery.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # Recover parameters passed to a function at a call site.
4 | # This uses Ghidra emulator to emulate a current basic block up to the
5 | # call opcode, and gets the context at the call location. Since the malware
6 | # was written in Delphi, first three parameters are passed in registers which
7 | # makes our job easy.
8 |
9 | from ghidralib import *
10 |
11 |
12 | # Recovered by reverse-engineering
13 | KEY = unhex("21 aa eb d3 48 de a8 92 06 26 44 b1 e7 85 1a b4")
14 |
15 |
16 | def decode(dat):
17 | """String obfuscation used by the analysed malware"""
18 | l, r = dat[::2], dat[1::2]
19 | offset = 16 - len(dat) % 16
20 | return xor(KEY, r) + xor(KEY[::-1][offset:], l)[::-1]
21 |
22 |
23 | for call in Function("MyCustomCrypto").calls:
24 | ctx = call.infer_context()
25 | key, data = ctx["eax"], ctx["edx"]
26 | if key and data:
27 | datalen = read_u32(data - 4)
28 | print(call.address, decode(read_bytes(data, datalen)))
29 |
--------------------------------------------------------------------------------
/examples/DumpFunctionAST.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # Dump a function AST, by leveraging DecompilerInterface.structureGraph method. Inspired by DecompilerNestedLayout class.
4 | # As far as I know, there was no publicly available Ghidra code to recover Pcode AST before this, see issues:
5 | # https://github.com/NationalSecurityAgency/ghidra/discussions/4314
6 | # https://github.com/NationalSecurityAgency/ghidra/issues/2204
7 | # https://github.com/NationalSecurityAgency/ghidra/discussions/6771
8 |
9 | from ghidralib import *
10 |
11 |
def dump(graph, ind=""):
    """Recursively print the blocks of `graph`.

    Each block is printed on its own line prefixed with `ind`. A block that
    has children is dumped recursively with a longer prefix; any other block
    has its pcode ops printed (address in hex, then the op itself).
    """
    nested = ind + " "
    for block in graph.blocks:
        print("{} ({}): ".format(ind, block))
        if not block.has_children:
            for op in block.pcode:
                print("{} {:x} {}".format(nested, op.address, op))
            continue
        dump(block, nested)
20 |
21 |
22 | dump(Function("main").pcode_tree)
23 |
--------------------------------------------------------------------------------
/examples/DumpHighPcode.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # Decompile the function at the cursor and output the highlevel Pcode (PcodeAST)
4 | #
5 | # Basically this: https://github.com/evm-sec/high-pcode/blob/main/HighPCode.java
6 | # but in 4 lines of code instead of 103
7 |
8 | from ghidralib import *
9 |
# Resolve the function at the current program location and print each of its
# high-level pcode operations, one per line.
func = Function(Program.location())
for pcode_op in func.high_pcode:
    print(pcode_op)
13 |
--------------------------------------------------------------------------------
/examples/EmulatorHooks.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # Example of basic support for emulator hooks.
4 | #
5 | # A better support (with automated parameter and return handling) will come later.
6 |
7 | from ghidralib import *
8 |
9 |
def printf(emu):
    """Emulator hook: print the C string that rsi points to, then return.

    Hook functions receive the emulator as their only parameter and return a
    result - one of "continue", "break" or "skip". This hook returns nothing,
    so the default result ("continue") applies.
    """
    string_ptr = emu["rsi"]
    arg = emu.read_cstring(string_ptr)
    print("printf called with '{}'".format(arg))
    # Execute a RET operation manually: load the return address from the top
    # of the stack into pc, then drop it from the stack.
    return_address = emu.read_u64(emu.sp)
    emu.pc = return_address
    emu.sp += 8
18 |
19 |
emulator = Emulator()
# Note - add_hook takes an address. Here we take advantage of a ghidralib feature:
# (almost) everywhere you can pass an address, you can pass a symbol name instead
# and it will be automatically resolved to its address.
emulator.add_hook("__printf_chk", printf)

# Emulate "main" from its entry point until one of its exit points, then
# report the value left in rax.
main_func = Function("main")
emulator.emulate(main_func.entrypoint, main_func.exitpoints)
print("Main returned {}".format(emulator["rax"]))
29 |
--------------------------------------------------------------------------------
/examples/FixUnaffectedRegisters.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # A fix for https://github.com/NationalSecurityAgency/ghidra/discussions/5186
4 | #
5 | # Go over all defined functions, from bottom to the top, and recover missing arguments.
6 | # In particular, this will look for unaffected and input variables that are registers,
7 | # and add them as parameters to the function.
8 | #
9 | # Another solution would be to check if var.is_input and variable is not a
10 | # parameter, but the current solution was easier to implement.
11 |
12 | from ghidralib import *
13 |
# Walk the call graph reachable from "_start" in reverse topological order and
# turn every register-backed input/unaffected high variable into a parameter.
for func in reversed(Program.call_graph().toposort(Function("_start"))):
    for var in func.high_variables:
        if not (var.is_input or var.is_unaffected):
            continue
        if not var.varnode.is_register:
            continue
        regname = var.varnode.as_register
        print("adding {} to {}".format(regname, func.name))
        func.add_register_parameter("uint", regname, "arg_" + regname)
20 |
--------------------------------------------------------------------------------
/examples/LummaPatternBasedDeobfuscation.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # Lumma features a lot of obfuscation techniques. This script tackles the
4 | # following, which is the most problematic as it obfuscates the control flow:
5 | #
6 | # 8b 04 85 18 6b 44 00 MOV EAX, dword ptr [EAX*0x4 + DAT_00446b18]
7 | # b9 e4 b2 85 35 MOV ECX, 0x3585b2e4
8 | # 33 0d 20 6b 44 00 XOR ECX, dword ptr [DAT_00446b20]
9 | # 01 c1 ADD ECX, EAX
10 | # 41 INC ECX
11 | # 31 c0 XOR EAX, EAX
12 | # ff e1 JMP ECX
13 | #
14 | # At the start of this pattern, EAX may be either 0 or 1, so this works as an if-else
15 | # statement. But getting this right is hard for decompilers, including Ghidra, so
16 | # we need to give her a hand.
17 | #
18 | # This script looks for that byte pattern (with wildcard for constants and registers),
19 | # then emulates the statement for eax=0 and for eax=1, and finally replaces the
20 | # whole code block with a functionally equivalent short patch (TEST / JZ / JMP).
21 |
22 | from ghidralib import *
23 |
pattern = "8B 04 85"
for addr in findall_pattern(pattern):
    # There may be instructions before the JMP, so disassemble the next 10
    # instructions and pick the first JMP through a register.
    jmp = next(
        (
            ins
            for ins in disassemble_at(addr, 10)
            if ins.mnemonic == "JMP" and ins.operands[0].is_register
        ),
        None,
    )
    if jmp is None:
        # no JMP found
        continue
    jump_to = jmp.operands[0].register

    # Emulate from the pattern start up to the JMP once with EAX=0 and once
    # with EAX=1, recording the value of the jump register in each case.
    targets = {}
    for eax in (0, 1):
        emu = Emulator()
        emu["EAX"] = eax
        emu.emulate(addr, jmp.address)
        targets[eax] = emu[jump_to]

    # Write the patch (and pad the rest of the block with NOPs)
    assemble_at(addr, [
        "TEST EAX, EAX",
        "JZ 0x{:x}".format(targets[0]),
        "JMP 0x{:x}".format(targets[1]),
    ], pad_to=jmp.address - addr + 2)

    # Best effort: ignore any failure while fixing up the function body.
    # NOTE(review): the bare except is kept as in the original; narrowing it
    # may stop catching Java exceptions under Jython - confirm before changing.
    try:
        Function(addr).fixup_body()
    except:
        pass
58 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | Some examples of how to use the library in practice (a lot of this is code
4 | I wrote during my work as a reverse-engineer).
5 |
6 | Since this library is still in development, there's not much to show off yet.
7 | But I plan to share snippets of things I write during my daily work here.
8 |
9 | ## Basics
10 |
11 | Scripts to serve as examples, and maybe to ensure everything works smoothly in ghidralib.
12 | Scripts here are often rewritten Ghidra examples, or very small deobfuscation scripts.
13 |
14 | * [SwitchOverride](./SwitchOverride.py): Fixup a switch statement at the pointer location (in 40 lines of code, [original](https://github.com/NationalSecurityAgency/ghidra/blob/master/Ghidra/Features/Decompiler/ghidra_scripts/SwitchOverride.java) has 110).
15 | * [DumpHighPcode](./DumpHighPcode.py): Dump high-level Pcode of a function (in 4 lines of code, [original](https://github.com/evm-sec/high-pcode/blob/main/HighPCode.java) has 103).
16 | * [EmulatorHooks](./EmulatorHooks.py): An example of how to use the emulator to emulate a single function,
17 | including a simple printf hook.
18 |
19 | ## Everyday use
20 |
21 | * [Context Recovery](./ContextRecovery.py): Iterates over calls to the string
22 | deobfuscation function, recovers the call parameters, and decrypts
23 | the obfuscated strings.
24 | * [Fix "Unaffected" Registers](./FixUnaffectedRegisters.py): Fix broken "unaffected" registers
25 | in the whole program by traversing the call graph and editing function signatures.
26 | * [Recover Function Pointers](./RecoverFunctionPointers.py): Recovering function pointers.
27 | Iterate over MOVs in a function, and use the decompilation of the function referenced
28 | by the second MOV operand to automatically rename and retype the function pointer from the
29 | first parameter. And all of that in just ~20 lines of code!
30 | * [Lumma Pattern Based Deobfuscation](./LummaPatternBasedDeobfuscation.py): A deobfuscation
31 | script that uses a byte pattern finder and assembly patching engine to easily overcome
32 | one of Lumma stealer obfuscation techniques.
33 | * [Asm Level Data Flow](./AsmLevelDataFlow.py): Simple data flow analysis at the x86 assembly level.
34 | This script analyses a function, and looks for the "LEA ECX, [source]" instructions followed later
35 | by "MOV [target], EAX", and then creates a label at `[target]` called `ptr_[source_string_name]`.
36 |
37 | ## Fancy things
38 |
39 | * [DumpFunctionAST](./DumpFunctionAST.py): pretty print a function structure (AST)
40 | as recovered by the decompiler. This is novel: as far as I know there was no
41 | publicly available script that did this.
42 |
--------------------------------------------------------------------------------
/examples/RecoverFunctionPointers.py:
--------------------------------------------------------------------------------
1 | # Licensed under Apache 2.0
2 | #
3 | # At the beginning there was this:
4 | #
5 | # void InitDynamicFunctions(void) {
6 | # DAT_00400010 = FUN_040b886;
7 | # DAT_00400014 = FUN_040b91c;
8 | # DAT_00400018 = FUN_040ba20;
9 | # DAT_0040001C = FUN_040bc20;
10 | #
11 | # Each of the functions was later used to dynamically load and execute a function -
12 | # so they worked as lazy function pointers. This script automatically processes such
13 | # function, and renames all symbols (by decompiling FUN_040ba... and checking
14 | # the referenced string literals). The end result is this:
15 | #
16 | # void InitDynamicFunctions(void) {
17 | # var_accept = load_accept;
18 | # var_bind = load_bind;
19 | # var_closesocket = load_closesocket;
20 | # var_connect = load_connect;
21 | #
22 | # This script also retypes the involved variables by looking up the appropriate pointer type.
23 |
24 | from ghidralib import *
25 |
# For every MOV in InitDynamicFunctions: take the function referenced by the
# second operand, find the first string literal copied in its high pcode, and
# use that literal to rename the function and label/retype the first operand.
for instruction in Function("InitDynamicFunctions").instructions:
    if instruction.mnemonic != "MOV":
        continue

    target, source = instruction.operands
    # Create the function on the fly when none is defined at the source yet.
    loader = Function.get(source) or Function.create(source, "tmp_func")

    for pcode_op in loader.high_pcode:
        if pcode_op.opcode != pcode_op.COPY:
            continue

        literal = get_string(pcode_op.inputs[0].value)
        if not literal:
            continue

        print("renaming {:x} based on literal {}".format(loader.entrypoint, literal))
        loader.rename("load_{}".format(literal))
        label = Symbol.create(target, "var_{}".format(literal))
        # Retype only when a data type with the literal's name exists.
        functype = DataType.get(literal)
        if functype:
            label.set_type(functype)
        # Only the first usable literal is considered.
        break
50 |
--------------------------------------------------------------------------------
/examples/SwitchOverride.py:
--------------------------------------------------------------------------------
1 | # Original IP: GHIDRA (SwitchOverride.java)
2 | # rewritten to Python by msm
3 | # Licensed under Apache 2.0
4 | #
5 | # Override a jump opcode so it jumps to the computed jump.
6 | #
7 | # Usage:
8 | # 1. Add the COMPUTED_JUMP references to the branch instruction manually
9 | # 2. run the script when cursor is over the branch instruction
10 |
11 | from ghidralib import *
12 |
13 |
def is_computed_branch(inst):  # type: (Instruction) -> bool
    """Tell whether `inst` counts as a computed branch.

    True for a computed jump, or for a call with at least one call reference
    whose destination resolves to a function with a fixup set. False otherwise.
    """
    flow = inst.flow_type
    if flow.is_jump and flow.is_computed:
        return True

    if not flow.is_call:
        return False

    for xref in inst.xrefs_from:
        if not xref.is_call:
            continue
        callee = Function.get(xref.to_address)
        if callee and callee.fixup:
            return True

    return False
26 |
27 |
def switch_override(addr):  # type: (Addr) -> None
    """Write a jumptable for the computed branch at `addr`.

    The destinations are the jump references going out of the instruction.
    Prints a hint and returns without changes when the instruction is not a
    computed branch, has no jump references, or is not inside a function.
    """
    inst = Instruction(addr)
    if not is_computed_branch(inst):
        print("Please highlight or place the cursor on the instruction performing the computed jump.")  # fmt: skip
        return

    destlist = []
    for xref in inst.xrefs_from:
        if xref.is_jump:
            destlist.append(xref.to_address)
    if not destlist:
        print("Please highlight destination instructions too.")  # fmt: skip
        return

    if not Function.get(addr):
        print("Computed jump instruction must be in a Function body.")
        return

    # At some point, jumptables were integrated into ghidralib core - so this
    # code is now trivial. Internally this is implemented as a few lines
    # of code that create a JumpTable object and write it.
    inst.write_jumptable(destlist)
48 |
49 | switch_override(Program.location())
50 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Ghidralib
2 | theme: readthedocs
3 | plugins:
4 | - search
5 | - mkdocstrings:
6 | handlers:
7 | python:
8 | options:
9 | docstring_style: sphinx
10 | heading_level: 1
11 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "ghidralib"
3 | version = "0.2.0"
4 | authors = [
5 | { name="msm", email="msm@tailcall.net" },
6 | ]
7 | description = "A Pythonic Ghidra standard library"
8 | readme = "README.md"
9 | requires-python = ">=3.8"
10 | classifiers = [
11 | "Programming Language :: Python :: 3",
12 | "Operating System :: OS Independent",
13 | ]
14 | license = "Apache-2.0"
15 | license-files = ["LICENSE.txt"]
16 |
17 | [project.urls]
18 | Homepage = "https://github.com/msm-code/ghidralib"
19 | Issues = "https://github.com/msm-code/ghidralib/issues"
--------------------------------------------------------------------------------
/tests/ghidralib_test.py:
--------------------------------------------------------------------------------
1 | # @runtime: PyGhidra
2 |
3 | import sys
4 | from ghidralib import *
5 |
6 | # Run tests on 44573a7526d5053a28d4e3e70c6ad8adf8eec148d8fe81302140b6bb3df179c0
7 |
8 |
9 | # TODO: Symbol.remove
10 |
11 |
def b(value):  # (str) -> bytes
    """Python2/3 helper: latin1-encode `value` on Python 3, return it unchanged otherwise."""
    running_py3 = sys.version_info[0] == 3
    return value.encode("latin1") if running_py3 else value
17 |
18 |
19 | ###############################################################
20 | # Test Graph
21 | ###############################################################
22 |
23 |
def test_graph():
    """Exercise Graph on a hand-built two-vertex graph and on a function's control flow graph."""
    graph = Graph.create("name", "description")
    graph.vertex(1)
    graph.vertex(2)
    graph.edge(1, 2)
    assert 1 in graph
    assert graph.has_vertex(1)
    assert 1 in graph.vertices
    assert len(graph) == 2
    assert len(graph.vertices) == 2
    assert graph.vertex_count == 2
    assert len(graph.edges) == 1
    assert graph.edge_count == 1

    assert graph.name == "name"
    assert graph.description == "description"

    # Traversals return a {vertex: parent} mapping; the start vertex maps to None.
    assert graph.dfs(1) == {1: None, 2: 1}
    assert graph.bfs(1) == {1: None, 2: 1}
    assert graph.toposort(1) == [2, 1]

    func = Function(0x00403d02)
    graph = func.control_flow
    origin = BasicBlock(func.address)
    assert len(graph.vertices) == graph.vertex_count == len(graph) == 6
    assert len(graph.edges) == graph.edge_count == 7
    assert graph.has_vertex(origin) and origin in graph

    block_addresses = [0x403d02, 0x403d11, 0x0403d14, 0x403d22, 0x403d3d, 0x403d40]

    def validator(block):
        # Passed to bfs/dfs below; checks each block it receives.
        assert block.address in block_addresses

    # Identity comparison with None: "==" would go through the stored
    # object's own __eq__, which is not what is being checked here.
    parents = graph.bfs(origin, validator)
    assert parents[origin] is None
    assert len(parents) == 6

    parents = graph.dfs(origin, validator)
    assert parents[origin] is None
    assert len(parents) == 6

    assert len(graph.toposort(origin)) == 6

    dot = graph.to_dot()
    assert "digraph" in dot

    assert graph.description is not None
    assert graph.name is not None
71 |
72 |
73 | ###############################################################
74 | # Test HighVariable
75 | ###############################################################
76 |
77 |
def test_high_variable():
    """Check the basic attributes of the first high variable of "entry"."""
    high_func = Function("entry").high_function
    assert len(high_func.variables) > 0

    var = high_func.variables[0]
    assert var.size > 0
    assert var.name is not None
    assert var.data_type is not None
    assert var.symbol is not None
    assert len(var.varnodes) > 0
    assert var.varnode is not None

    # Just read the flags, to make sure the accessors don't raise.
    for flag in ("is_addr_tied", "is_free", "is_input", "is_persistent", "is_unaffected"):
        getattr(var, flag)
95 |
96 |
97 | ###############################################################
98 | # Test HighSymbol
99 | ###############################################################
100 |
101 |
def test_high_symbol():
    """Check the basic attributes of the first high symbol of "entry"."""
    high_func = Function("entry").high_function
    assert len(high_func.symbols) > 0

    first = high_func.symbols[0]
    assert first.size > 0
    assert first.data_type is not None
    first.variable  # this may be none
    assert first.name is not None
    first.symbol  # may be none
    assert not first.is_this_pointer
113 |
114 |
115 | ###############################################################
116 | # Test register
117 | ###############################################################
118 |
119 |
def test_register():
    """Look up the "eax" register and check its name, size and varnode."""
    assert Register.get("eax") is not None
    expected = (("name", "EAX"), ("size", 4))
    for attr, value in expected:
        assert getattr(Register("eax"), attr) == value
    assert Register("eax").varnode is not None
125 |
126 |
127 | ###############################################################
128 | # Test Varnode
129 | ###############################################################
130 |
131 |
def test_varnode():
    """Smoke-test the accessors of the first varnode of "entry"."""
    entry = Function("entry")
    assert len(entry.varnodes) > 0
    vn = entry.varnodes[0]

    if vn.has_value:
        assert vn.value is not None

    # The remaining accessors are only read, to make sure they don't raise.
    for attr in ("offset", "size"):
        getattr(vn, attr)
    # vn.high and vn.symbol are not exercised here (disabled in this test).

    for attr in ("is_constant", "is_register"):
        getattr(vn, attr)
    if vn.is_register:
        assert vn.as_register is not None

    for attr in (
        "is_address",
        "is_unique",
        "is_hash",
        "is_unaffected",
        "is_persistent",
        "is_addr_tied",
        "is_input",
        "is_free",
        # "defining_pcodeop" is not exercised here (disabled in this test).
        "descendants",
    ):
        getattr(vn, attr)

    assert isinstance(vn.simple, (str, unicode, int))
    assert vn.free.is_free
161 |
162 |
163 | # TODO PcodeBlock
164 | # TODO BlockGraph
165 |
166 | ###############################################################
167 | # Test HighFunction
168 | ###############################################################
169 |
170 |
def test_high_function():
    """Check HighFunction lookups and the basic attributes of "entry"."""
    assert HighFunction.get("sado") is None

    # ghidralib sanity: every supported key resolves to the same object
    func = HighFunction("FUN_00406831")
    for key in ("FUN_00406831", 0x00406831, toAddr(0x00406831), func.raw):
        assert func == HighFunction.get(key)

    assert len(Function.all()) > 10

    func = HighFunction("entry")
    func.get_pcode_at(0x00406831)  # not throws
    for attr in ("pcode", "basicblocks"):
        assert len(getattr(func, attr)) > 0
    assert func.pcode_tree is not None
    for attr in ("symbols", "variables"):
        assert len(getattr(func, attr)) > 0
191 |
192 |
193 | # TODO Reference
194 | # TODO RefType
195 |
196 |
197 | ###############################################################
198 | # Test Instruction
199 | ###############################################################
200 |
201 |
def test_instruction():
    """Exercise Instruction lookups, operands, fallthrough overrides, xrefs and flows."""
    # No instruction is expected at this address, so get() returns None.
    ins = Instruction.get(0x1234)
    assert ins is None

    assert Instruction("entry") == Function("entry").instructions[0]

    ins = Instruction.get(0x406837)
    assert ins is not None

    # ghidralib sanity
    ins = Instruction(0x406837)
    assert ins == Instruction.get(0x406837)
    assert ins == Instruction.get(toAddr(0x406837))
    assert ins == Instruction.get(ins.raw)

    assert ins.mnemonic == "SUB"
    assert ins.address == 0x406837
    assert ins.next == Instruction(0x40683A)
    assert ins.prev == Instruction(0x406834)

    assert len(ins.pcode) > 0
    assert ins.high_pcode is not None

    assert ins.bytes == b("\x83\xec\x18")
    assert ins.length == 3
    assert len(ins) == 3

    # The same operands are checked through operand(i), operands[i] and operand_values.
    assert ins.operand(0).is_register
    assert ins.operand(0).register == "ESP"
    assert ins.operands[0].is_register
    assert ins.operands[0].register == "ESP"
    assert ins.operand(1).is_scalar
    assert ins.operand(1).scalar == 0x18
    assert ins.operands[1].is_scalar
    assert ins.operands[1].scalar == 0x18
    assert ins.operand_values == ["ESP", 0x18]

    # Setting a fallthrough override gives the JMP a fallthrough;
    # clearing it removes the fallthrough again.
    ins = assemble("JMP 0")[0]
    assert not ins.has_fallthrough

    ins.set_fallthrough_override(0x1000)
    assert ins.has_fallthrough

    ins.clear_fallthrough_override()
    assert not ins.has_fallthrough

    mov = Instruction(0x40683E)
    assert len(mov.xrefs_from) > 0
    assert len(mov.xrefs_to) == 0
    assert mov.has_fallthrough

    ins = Instruction(0x403993)
    assert ins.mnemonic == "MOVZX"
    assert ins.operand(1).is_list
    assert ins.operand(1).list == ["ESI", 2]
    assert len(ins.output_varnodes) == 3
    assert len(ins.input_varnodes) == 5

    Instruction.create(0x403993)  # no-op in this case

    ins = Instruction(0x40399F)
    assert ins.flows == [0x4039A6]
    assert ins.all_flows == [0x4039A6, 0x4039A1]
265 |
266 | # TODO fallthrough_override and jumptable
267 |
268 |
269 | # TODO AddressRange
270 | # TODO AddressSet
271 |
272 | ###############################################################
273 | # Test BasicBlock
274 | ###############################################################
275 |
276 |
def test_basic_block():
    """Check BasicBlock lookups and the attributes of the block at 0x004043D9."""
    assert BasicBlock.get(0x1234) is None

    block = BasicBlock(0x004043D9)
    # Every supported key resolves to the same block.
    for key in (0x004043D9, "FUN_004043d9", toAddr(0x004043D9), block.raw):
        assert block == BasicBlock.get(key)

    assert block in Function(0x004043D9).basicblocks

    assert block.address == 0x004043D9
    assert block.start_address == 0x004043D9
    assert block.end_address == 0x004043F8
    assert block.length == 32
    expected_bytes = unhex(
        "558bec81ec840000005356578bf86a145bc745fcdcffffff81ff00001000730a"
    )
    assert block.bytes == expected_bytes

    assert len(block.instructions) == 12
    for attr in ("pcode", "destinations", "sources"):
        assert len(getattr(block, attr)) > 0
    assert block.address in block.body

    assert len(BasicBlock.all()) > 10
    assert len(Program.control_flow()) > 10

    assert block.flow_type is not None
307 |
308 |
309 | ###############################################################
310 | # Test Variable
311 | ###############################################################
312 |
313 |
def test_variable():
    """Exercise renaming, retyping, commenting and the flag accessors of a function variable."""
    func = Function(0x004043D9)
    vars = func.variables

    assert len(vars) > 0
    var = vars[0]

    assert var.function == func

    # Rename, verify, then restore the original name.
    assert var.name is not None
    org_name = var.name
    var.rename(org_name + "fun")
    assert var.name == org_name + "fun"
    var.rename(org_name)

    # Retype, verify, then restore the original type.
    assert var.data_type is not None
    org_type = var.data_type
    var.data_type = DataType("int")
    assert var.data_type.name == "int"
    var.data_type = org_type

    # Set a comment, verify, then clear it again.
    assert var.is_valid
    assert var.comment is None
    var.comment = "x"
    assert var.comment == "x"
    var.comment = None

    # Just call the methods, to make sure they don't raise exceptions
    _ = var.is_stack
    _ = var.is_memory
    _ = var.is_unique
    _ = var.is_compound
    _ = var.is_forced_indirect
    _ = var.has_bad_storage
    _ = var.is_unassigned_storage
    _ = var.is_void
    # NOTE(review): "stack_offfset" has three f's - confirm it matches the
    # attribute name defined by the library.
    _ = var.stack_offfset
    _ = var.is_constant
    _ = var.is_hash
    # NOTE(review): the next four reads repeat checks already made above.
    _ = var.is_stack
    _ = var.is_memory
    _ = var.is_unique
    _ = var.is_compound

    assert var.symbol is not None
    assert len(var.varnodes) > 0
    assert var.varnodes[0].raw

    if var.is_register:
        assert var.register is not None
364 |
365 |
366 | ###############################################################
367 | # Test Parameter
368 | ###############################################################
369 |
370 |
def test_parameter():
    """Check the ordinal and formal type of the first parameter of the function at 0x004043D9."""
    parameters = Function(0x004043D9).parameters

    assert len(parameters) > 0
    first = parameters[0]
    assert first.ordinal == 0
    assert first.formal_data_type.name == "int"
379 |
380 |
381 | ###############################################################
382 | # Test FunctionCall
383 | ###############################################################
384 |
385 |
def test_function_call():
    """Check the call sites of the function at 0x004043D9 and the attributes of the first one."""
    func = Function.get(0x004043D9)
    assert func is not None

    calls = func.calls
    assert len(calls) > 0
    # At least one call site must sit inside FUN_004044d1; call sites without
    # a calling function are ignored.
    caller_names = [c.calling_function.name for c in calls if c.calling_function]
    assert "FUN_004044d1" in caller_names
    call = calls[0]

    assert call.called_function.name == func.name
    assert call.address is not None

    # callee/caller give the same objects as called_function/calling_function.
    assert call.callee == call.called_function
    assert call.caller == call.calling_function

    high_op = call.high_pcodeop
    assert high_op is not None
    assert call.high_pcodeop.raw
    assert len(call.high_pcodeop.inputs) > 1
    assert len(call.high_varnodes) > 0
    assert call.high_varnodes[0].raw
    assert len(call.infer_args()) > 0
    assert call.infer_context() is not None

    assert call.instruction.mnemonic == "CALL"
414 |
415 |
416 | # TODO ClangTokenGroup
417 |
418 |
419 | ###############################################################
420 | # Test Function
421 | ###############################################################
422 |
423 |
def test_function():
    """Exercise Function lookups and the attributes, mutators and derived views of "entry"."""
    # Looking up a name that does not exist returns None.
    foo = Function.get("sado")
    assert foo is None

    # ghidralib sanity
    func = Function("FUN_00406831")
    assert func == Function.get("FUN_00406831")
    assert func == Function.get(0x00406831)
    assert func == Function.get(toAddr(0x00406831))
    assert func == Function.get(func.raw)

    assert len(Function.all()) > 10

    func = Function("entry")
    assert func.name == "entry"
    assert func.address == 0x04038AF
    assert func.entrypoint == 0x04038AF
    assert func.return_type.name == "undefined"
    assert not func.is_thunk
    assert not func.is_external

    # Set a comment, verify, then clear it with None.
    func.set_comment("x")
    assert func.comment == "x"
    func.set_comment(None)
    assert func.comment is None

    # The repeatable comment is cleared with an empty string and reads back as "".
    func.set_repeatable_comment("x")
    assert func.repeatable_comment == "x"
    func.set_repeatable_comment("")
    assert func.repeatable_comment == ""

    assert len(func.parameters) == 0
    assert len(func.local_variables) >= 3
    assert func.local_variables[0].raw
    assert len(func.variables) >= 3
    assert func.variables[0].raw
    assert len(func.varnodes) >= 3
    assert func.varnodes[0].raw
    assert len(func.high_variables) >= 3
    assert func.high_variables[0].raw
    assert len(func.stack) > 1
    assert func.stack[0].raw

    # Rename, verify, then restore the original name.
    func.rename("test")
    assert func.name == "test"
    func.rename("entry")

    assert len(func.xrefs) > 0
    assert func.xrefs[0].raw
    assert len(func.xref_addrs) > 0
    assert len(func.callers) == 0
    assert len(func.called) > 3
    assert func.called[0].raw
    assert len(func.calls) == 0

    # Set a fixup, verify, then clear it with None.
    func.fixup = "x"
    assert func.fixup == "x"
    func.fixup = None
    assert func.fixup is None

    assert len(func.basicblocks) > 3
    assert func.basicblocks[0].raw
    assert len(func.decompile()) > 100

    assert func.high_function is not None
    assert func.high_function.raw
    assert len(func.get_high_pcode()) > 10
    assert len(func.high_pcode) > 10
    assert func.high_pcode[0].raw

    assert func.pcode_tree is not None

    assert len(func.pcode) > 10
    assert func.pcode[0].raw
    assert len(func.high_basicblocks) > 10
    assert func.high_basicblocks[0].raw

    # Only checks that the call does not raise.
    func.get_high_pcode_at(func.entrypoint)

    assert len(func.high_symbols) > 0
    assert func.high_symbols[0].raw
    assert len(func.primary_symbols) > 0
    assert func.primary_symbols[0].raw
    assert len(func.symbols) > 0
    assert func.symbols[0].raw
    assert not func.body.is_empty
    assert func.body.raw

    assert func.control_flow is not None
    assert func.control_flow.raw
514 |
515 |
516 | ###############################################################
517 | # Test Symbol
518 | ###############################################################
519 |
520 |
def test_symbol():
    """Exercise Symbol lookup, creation, renaming and deletion."""
    # Looking up a name that does not exist returns None.
    sym = Symbol.get("sado")
    assert sym is None

    assert len(Symbol.all()) > 10

    sym = Symbol("FUN_00403caf")
    assert sym.address == 0x00403CAF
    assert sym.name == "FUN_00403caf"
    assert sym.name_with_namespace == "FUN_00403caf"
    assert len(sym.xrefs) > 0
    assert len(sym.xref_addrs) > 0

    # Create a symbol, then rename it; it must be reachable by name and by address.
    sym = Symbol.create(0x00403CDE, "foo")
    assert Symbol.get("foo") is not None
    assert Symbol.get(0x00403CDE) is not None
    sym.rename("bar")
    assert Symbol.get("bar") is not None
    assert sym.name == "bar"

    # After deletion neither name nor the address resolves any more.
    sym.delete()
    assert Symbol.get("foo") is None
    assert Symbol.get("bar") is None
    assert Symbol.get(0x00403CDE) is None

    assert Symbol("wsprintfA").address == 0xB8AA  # Resolve external address
547 |
548 |
549 | ###############################################################
550 | # Test DataType
551 | ###############################################################
552 |
553 |
def test_datatype():
    """Check DataType lookup, listing, and parsing a typedef from C source."""
    assert DataType.get("sado") is None

    everything = DataType.all()
    assert len(everything) > 10
    # all(True) must give fewer entries than all() without arguments.
    assert len(DataType.all(True)) < len(DataType.all())

    int_type = DataType.get("int")
    assert int_type is not None
    assert int_type.name == "int"

    parsed = DataType.from_c("typedef void* HINTERNET;", insert=False)
    assert parsed.name == "HINTERNET"
    assert parsed.length == 4
568 |
569 | # TODO: create_at
570 |
571 |
572 | ###############################################################
573 | # Test Emulator
574 | ###############################################################
575 |
576 |
def test_emulator():
    """Exercise Emulator: stop conditions, callbacks, state accessors and hooks.

    All addresses are hard-coded for the program this suite is run against.
    """
    early = 0x403EC1
    start = 0x403ECB
    stop = 0x403ED0
    stepped = 0x405F96  # pc seen after one single_step() from `stop`
    scratch = 0x400000  # address used for the raw memory accessor checks

    # emulate(start, end) runs up to the end address.
    vm = Emulator()
    assert vm["esi"] == 0
    vm.emulate(start, stop)
    assert vm["esi"] == 0xFFFF
    assert vm.pc == stop

    # A breakpoint terminates a run that was given no end address.
    vm = Emulator()
    vm.add_breakpoint(stop)
    vm.emulate(early)
    assert vm["esi"] == 0xFFFF
    assert vm.pc == stop

    # emulate_fast accepts the same start/end pair.
    vm = Emulator()
    assert vm["esi"] == 0
    vm.emulate_fast(start, stop)
    assert vm["esi"] == 0xFFFF
    assert vm.pc == stop

    # Emulator.new(start, end) hands back an emulator that has already run.
    vm = Emulator.new(start, stop)
    assert vm["esi"] == 0xFFFF
    assert vm.pc == stop

    vm.single_step()
    assert vm.pc == stepped

    # maxsteps=2 from `start` lands on the same address as the step above.
    vm = Emulator()
    vm.emulate(start, maxsteps=2)
    assert vm.pc == stepped

    # ends= takes several candidate end addresses.
    vm = Emulator()
    vm.emulate(start, ends=[0x403ED0, 0x1234])
    assert vm.pc == stop

    # stop_when= takes a predicate over the emulator.
    vm = Emulator()
    vm.emulate(start, stop_when=lambda emu: emu.pc == 0x403ED0)
    assert vm.pc == stop

    # Mutable cell so the nested function can record the call without nonlocal.
    callback_seen = [False]

    def mark_called(emu):
        callback_seen[0] = True

    vm = Emulator()
    vm.emulate(start, stop, callback=mark_called)
    assert callback_seen[0]

    def always(result):
        """Build a callback that ignores the emulator and returns `result`."""
        return lambda emu: result

    # The callback's return value steers the run; the same emulator is reused.
    for verdict, expected_pc in (
        ("continue", stop),
        ("break", early),
        ("continue_then_break", 0x403EC2),
    ):
        vm.emulate(early, stop, callback=always(verdict))
        assert vm.pc == expected_pc

    # Registers are writable through item assignment.
    vm["esi"] = 0
    assert vm["esi"] == 0

    # Overwrite the emulated code with NOPs and read it back.
    nops = b("\x90\x90\x90\x90\x90")
    vm = Emulator()
    assert vm.read_bytes(start, 5) != nops
    vm.write_bytes(start, nops)
    assert vm.read_bytes(start, 5) == nops
    vm.emulate(start, stop)
    # Not asserted: emu["esi"] == 0.
    # The Ghidra emulator does not appear to support self-modifying code yet.
    # AdaptedEmulator might work during the current transitional period, but
    # it is scheduled to be deleted.

    vm.write_register("esi", 1)
    assert vm.read_register("esi") == 1

    # Fixed-width reads over a known byte pattern.
    vm.write_bytes(scratch, b("\x01\x02\x03\x04\x05\x06\x07\x08"))
    for read, expected in (
        (vm.read_u8, 0x01),
        (vm.read_u16, 0x0201),
        (vm.read_u32, 0x04030201),
        (vm.read_u64, 0x0807060504030201),
    ):
        assert read(scratch) == expected

    # Fixed-width writes followed by a read of the same width.
    for write, read, value in (
        (vm.write_u8, vm.read_u8, 0x01),
        (vm.write_u16, vm.read_u16, 0x0201),
        (vm.write_u32, vm.read_u32, 0x04030201),
        (vm.write_u64, vm.read_u64, 0x0807060504030201),
    ):
        write(scratch, value)
        assert read(scratch) == value

    assert vm.read_bytes(scratch, 8) == b("\x01\x02\x03\x04\x05\x06\x07\x08")

    # High-level function emulation API
    func = Function(0x004061EC)
    vm = func.emulate(-0x80000000)
    assert vm.read_unicode(vm["eax"]) == "HKEY_CLASSES_ROOT"

    assert func.emulate_simple(-0x80000000) == vm["eax"]

    # Low-level function emulation API
    func = Function(0x004061EC)
    vm = Emulator()
    vm.write_varnode(func.parameters[0].varnode, -0x80000000)
    vm.emulate(func.entrypoint, stop_when=lambda emu: emu.pc not in func.body)
    assert vm.read_unicode(vm["eax"]) == "HKEY_CLASSES_ROOT"

    # Hooks: replace lstrcpynW with a stub that records the call and returns.
    hook_ran = [False]

    def fake_return(emu):
        hook_ran[0] = True
        # NOTE(review): pops a 64-bit return address although the registers
        # used in this test (esi/eax) are 32-bit; confirm this is intended.
        emu.pc = emu.read_u64(emu.sp)
        emu.sp += 8

    target = Function(0x406035)
    vm = Emulator()
    vm.add_hook("lstrcpynW", fake_return)
    vm.emulate(target.entrypoint, target.exitpoints)
    assert hook_ran[0]
698 |
699 |
700 | ###############################################################
701 | # Test Memory Block
702 | ###############################################################
703 |
704 |
def test_memory_block():
    """Check MemoryBlock enumeration and the properties of the .text block."""
    blocks = MemoryBlock.all()
    assert len(blocks) == 8

    # Program.memory_blocks() reports the same number of blocks.
    assert len(Program.memory_blocks()) == 8

    text_blocks = [blk for blk in blocks if blk.name == ".text"]
    text = text_blocks[0]
    assert text.length == 29696
    assert text.size == 29696
    assert text.start == 0x401000
    assert text.end == 0x4083FF
    assert text.bytes is not None
717 |
718 |
719 | ###############################################################
720 | # Test Program
721 | ###############################################################
722 |
723 |
def test_program():
    """Check Program.location() and that call graph counters match its contents."""
    assert Program.location() != 0

    graph = Program.call_graph()
    # Each collection must agree with its reported count and be non-empty.
    for items, count in (
        (graph.vertices, graph.vertex_count),
        (graph.edges, graph.edge_count),
    ):
        assert len(items) == count
        assert count > 0

    # TODO: create_data
734 |
735 |
736 | ###############################################################
737 | # Test Utilities
738 | ###############################################################
739 |
740 |
def test_util():
    """Check the module-level helpers: (dis)assembly, byte packing and strings."""
    call_addr = 0x0403ED0

    # Disassemble raw bytes read from the program.
    raw = read_bytes(call_addr, 10)
    assert len(disassemble_bytes(raw)) > 0
    assert disassemble_bytes(raw)[0].mnemonic == "CALL"

    # Disassemble in place: one instruction unless max_instr is given.
    assert disassemble_at(call_addr)[0].mnemonic == "CALL"
    assert len(disassemble_at(call_addr)) == 1
    assert len(disassemble_at(call_addr, max_instr=2)) == 2

    # assemble_to_bytes takes a list of instructions or a single one.
    assert assemble_to_bytes(["ADD EAX, EAX", "ADD EAX, EAX"]) == b("\x01\xc0\x01\xc0")
    assert assemble_to_bytes("ADD EAX, EAX") == b("\x01\xc0")
    # TODO: assemble_at

    # Integer/bytes/hex conversions.
    pair = b("\x01\x02")
    assert from_bytes(b("ab")) == 25185
    assert to_bytes(0x0201, 2) == pair
    assert to_bytes(0x0201, 4) == b("\x01\x02\x00\x00")
    assert unhex("0102") == pair
    assert enhex(pair) == "0102"
    assert xor(pair, b("\x03\x04")) == b("\x02\x06")

    # Both string readers return the same text for this address.
    for reader in (get_string, read_cstring):
        assert reader(0x40B968) == "ShellExecuteW"

    assert assemble("JMP 0")[0].mnemonic == "JMP"
765 |
766 |
767 | ###############################################################
768 | # Test Data
769 | ###############################################################
770 |
771 |
def test_data():
    """Check the Data wrapper on a string, a pointer, a scalar, an array and a struct."""
    # Listing: 004092f4 73 00 65 unicode u"settings logging to %d"
    text_addr = 0x4092f4
    text = Data(text_addr)
    assert text.value == "settings logging to %d"
    assert text.data_type.name == "unicode"
    assert text.address == text_addr
    assert text.length == 46
    assert text.bytes is not None
    # Flags expected to be set for a defined string ...
    assert text.is_string
    assert text.is_defined
    # ... and flags expected to be clear.
    assert not text.is_constant
    assert not text.is_writable
    assert not text.is_volatile
    assert not text.is_pointer
    assert not text.is_union
    assert not text.is_structure
    assert not text.is_array
    assert not text.is_dynamic

    # Listing: 004092c8 d0 ba 00 00 addr OLE32.DLL::CoCreateInstance
    pointer = Data(0x04092c8)
    assert pointer.is_pointer
    assert pointer.value == 47824
    assert pointer.data_type.name == "pointer"

    # Listing: 0040a404 62 00 00 00 undefined4 00000062h
    scalar = Data(0x040a404)
    assert scalar.value == 98
    assert scalar.data_type.name == "undefined4"

    # Listing (User32Reserved):
    #   ffdff044 00 00 00 ddw[26]
    #   ffdff044 [0] 0h, 0h, 0h, 0h,
    #   ffdff054 [4] 0h, 0h, 0h, 0h,
    #   ...
    array = Data(0xffdff044)
    assert array.is_array
    assert array.value == [0] * 26
    assert array[1].value == 0
    assert array.data_type.name == "dword[26]"

    # IMAGE_DOS_HEADER: fields are reachable as attributes and via get_field.
    header = Data(0x400000)
    assert header.is_structure
    assert header.value == header
    assert header.e_lfanew.value == 0xD0
    assert header.get_field("e_lfanew").value == 0xD0
    assert header.get_field("e_res2[10]")[0].value == 0x0
    assert header.e_maxalloc.bytes == b("\xFF\xFF")
    assert header.data_type.name == "IMAGE_DOS_HEADER"
822 |
823 |
def run():
    """Run every module-level ``test_*`` callable once and print progress.

    Prints the interpreter version, then for each global whose name starts
    with ``test_`` and whose value is callable: announces it, calls it with no
    arguments and prints an OK marker. An exception raised by a test
    propagates and aborts the run.
    """
    print("Running with {}".format(sys.version))

    # Iterate over a snapshot: walking the live globals() dict raises
    # RuntimeError on Python 3 if a test adds or removes a module-level name.
    for name, candidate in list(globals().items()):
        # Skip non-callables so a constant named test_* cannot crash the runner.
        if not name.startswith("test_") or not callable(candidate):
            continue
        print("Running {}...".format(name))
        candidate()
        print(" OK")
    print("Done!")


run()
837 |
--------------------------------------------------------------------------------