├── .coveragerc ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .pylintrc ├── LICENSE ├── Makefile ├── README.md ├── analyze.py ├── data ├── apiscout_win7_prof-n_sp1.json └── apiscout_winxp_prof_sp3.json ├── export.py ├── ida_analyze.py ├── requirements.txt ├── setup.py ├── smda ├── Disassembler.py ├── DisassemblyResult.py ├── DisassemblyStatistics.py ├── SmdaConfig.py ├── __init__.py ├── cil │ ├── CilDisassembler.py │ ├── CilInstructionEscaper.py │ ├── FunctionAnalysisState.py │ └── __init__.py ├── common │ ├── BasicBlock.py │ ├── BinaryInfo.py │ ├── BlockLocator.py │ ├── CodeXref.py │ ├── DominatorTree.py │ ├── SmdaBasicBlock.py │ ├── SmdaFunction.py │ ├── SmdaInstruction.py │ ├── SmdaReport.py │ ├── TailcallAnalyzer.py │ ├── Tarjan.py │ ├── __init__.py │ └── labelprovider │ │ ├── AbstractLabelProvider.py │ │ ├── CilSymbolProvider.py │ │ ├── DelphiKbSymbolProvider.py │ │ ├── ElfApiResolver.py │ │ ├── ElfSymbolProvider.py │ │ ├── GoLabelProvider.py │ │ ├── OrdinalHelper.py │ │ ├── PdbSymbolProvider.py │ │ ├── PeSymbolProvider.py │ │ ├── WinApiResolver.py │ │ └── __init__.py ├── ida │ ├── BackendInterface.py │ ├── IdaExporter.py │ ├── IdaInterface.py │ └── __init__.py ├── intel │ ├── BitnessAnalyzer.py │ ├── FunctionAnalysisState.py │ ├── FunctionCandidate.py │ ├── FunctionCandidateManager.py │ ├── IndirectCallAnalyzer.py │ ├── IntelDisassembler.py │ ├── IntelInstructionEscaper.py │ ├── JumpTableAnalyzer.py │ ├── LanguageAnalyzer.py │ ├── MnemonicTfIdf.py │ ├── __init__.py │ └── definitions.py └── utility │ ├── BracketQueue.py │ ├── DelphiKbFileLoader.py │ ├── ElfFileLoader.py │ ├── FileLoader.py │ ├── MachoFileLoader.py │ ├── MemoryFileLoader.py │ ├── PeFileLoader.py │ ├── PriorityQueue.py │ ├── StringExtractor.py │ └── __init__.py ├── tests ├── __init__.py ├── asprox_0x008D0000_xored ├── bashlite_xored ├── context.py ├── cutwail_xored ├── komplex_xored ├── njrat_xored ├── testBracketQueue.py ├── testEscaper.py ├── testFileFormatParsers.py ├── 
testIntegration.py └── testTarjan.py └── version_history.md /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = 5 | *smda* 6 | include = 7 | *smda* 8 | *tests* 9 | omit = 10 | *lib* 11 | capstone/* 12 | *distutils/* 13 | ctypes/* 14 | 15 | [report] 16 | # Regexes for lines to exclude from consideration 17 | exclude_lines = 18 | # Have to re-enable the standard pragma 19 | pragma: no cover 20 | 21 | # Don't complain about missing debug-only code: 22 | def __repr__ 23 | if self\.debug 24 | 25 | # Don't complain if tests don't hit defensive assertion code: 26 | raise AssertionError 27 | raise NotImplementedError 28 | 29 | # Don't complain if non-runnable code isn't run: 30 | if 0: 31 | if __name__ == "__main__": 32 | def main(argv): 33 | 34 | ignore_errors = True 35 | 36 | [html] 37 | directory = coverage_html_report 38 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Python Tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -r requirements.txt 28 | pip install -e . 
29 | 30 | - name: Run tests 31 | run: make test -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .coverage.* 34 | .cache 35 | .noseids 36 | nosetests.xml 37 | coverage.xml 38 | *,cover 39 | .hypothesis/ 40 | 41 | 42 | # pyenv 43 | .python-version 44 | 45 | # dotenv 46 | .env 47 | 48 | # virtualenv 49 | .venv/ 50 | venv/ 51 | ENV/ 52 | 53 | # Spyder project settings 54 | .spyderproject 55 | 56 | # Rope project settings 57 | .ropeproject 58 | 59 | # more IDE settings 60 | .idea 61 | .vscode 62 | 63 | 64 | # project files 65 | config.ini 66 | figures 67 | reports 68 | analyze_*.py 69 | coverage-html -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018-2020, Daniel Plohmann and Steffen Enders 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
# Project task runner for smda.
init:
	pip install -r requirements.txt
package:
	rm -rf dist/*
	python setup.py sdist
publish:
	python -m twine upload dist/* -u __token__
pylint:
	python -m pylint --rcfile=.pylintrc smda
test:
	pytest tests/test*
test-coverage:
	python -m nose --with-coverage --cover-erase --cover-html-dir=./coverage-html --cover-html --cover-package=smda
clean:
	# NOTE: "$$" is required so make passes a literal "$" (regex end-anchor) to grep;
	# the previous "\.pyo$\)" left the group unterminated and broke the pattern.
	find . | grep -E "(__pycache__|\.pyc|\.pyo$$)" | xargs rm -rf
	rm -rf .coverage
	rm -rf coverage-html
	rm -rf dist/*
6 | As input, arbitrary memory dumps (ideally with known base address) can be processed. 7 | The output is a collection of functions, basic blocks, and instructions with their respective edges between blocks and functions (in/out). 8 | Optionally, references to the Windows API can be inferred by using the ApiScout method. 9 | 10 | ## Installation 11 | 12 | With version 1.2.0, we have finally simplified things by moving to [PyPI](https://pypi.org/project/smda/)! 13 | So installation now is as easy as: 14 | 15 | ``` 16 | $ pip install smda 17 | ``` 18 | 19 | ## Usage 20 | 21 | A typical workflow using SMDA could like this: 22 | 23 | ``` 24 | >>> from smda.Disassembler import Disassembler 25 | >>> disassembler = Disassembler() 26 | >>> report = disassembler.disassembleFile("/bin/cat") 27 | >>> print(report) 28 | 0.777s -> (architecture: intel.64bit, base_addr: 0x00000000): 143 functions 29 | >>> for fn in report.getFunctions(): 30 | ... print(fn) 31 | ... for ins in fn.getInstructions(): 32 | ... print(ins) 33 | ... 34 | 0x00001720: (-> 1, 1->) 3 blocks, 7 instructions. 35 | 0x00001720: ( 4883ec08) - sub rsp, 8 36 | 0x00001724: (488b05bd682000) - mov rax, qword ptr [rip + 0x2068bd] 37 | 0x0000172b: ( 4885c0) - test rax, rax 38 | 0x0000172e: ( 7402) - je 0x1732 39 | 0x00001730: ( ffd0) - call rax 40 | 0x00001732: ( 4883c408) - add rsp, 8 41 | 0x00001736: ( c3) - ret 42 | 0x00001ad0: (-> 1, 4->) 1 blocks, 12 instructions. 43 | [...] 44 | >>> json_report = report.toDict() 45 | ``` 46 | 47 | There is also a demo script: 48 | 49 | * analyze.py -- example usage: perform disassembly on a file or memory dump and optionally store results in JSON to a given output path. 50 | 51 | The code should be fully compatible with Python 3.8+. 52 | Further explanation on the innerworkings follow in separate publications but will be referenced here. 
53 | 54 | To take full advantage of SMDA's capabilities, make sure to (optionally) install: 55 | * lief 56 | * pdbparse (currently as fork from https://github.com/VPaulV/pdbparse to support Python3) 57 | 58 | ## Version History 59 | * 2025-02-26: v2.0.2 - Adjusting relative import, adding init file. 60 | * 2025-02-25: v2.0.0 - Initial experimental support for CIL (.NET) disassembly. 61 | * 2025-02-24: v1.14.3 - PicHashing can now be disabled via SmdaConfig to save some processing time. (THX to @Nalexander-hanel!) 62 | * 2025-02-24: v1.14.2 - We are Python 3.8+ compatible again (changed UTC usage) and (DWARF) PE symbols for PE files should be extracted again (THX to @N0fix for the update!) 63 | * 2025-02-21: v1.14.1 - Fixed changed field names in LIEF usage that broke ELF parsing, added tests for ELF+macOS parsing (THX to @N0fix for the update!) 64 | * 2025-01-29: v1.14.0 - Bump to LIEF 0.16.0+ (THX to @huettenhain for the ping!). Migrated tests to `pytest`, UTC datetime handling fixes. 65 | * 2025-01-26: v1.13.24 - Added functionality to import and export SMDA reports as JSON. Fixed byte patterns matching special regex chars (THX to @alexander-hanel!). 66 | * 2024-07-26: v1.13.23 - Now using OEP as symbol function candidate when available (THX to @alexander-hanel for reporting!). 67 | * 2024-05-10: v1.13.22 - Handled odd case where disassembly with capstone and IDA would return different results (THX to @r0ny123 for reporting!). 68 | * 2024-04-17: v1.13.21 - Fixed handling of Go binaries for version 1.20+ (THX to @Manny684!). 69 | * 2024-04-08: v1.13.20 - Fixed handling of bnd prefix in CFG instructions to help with parsing PLT (THX to @Manny684!). 70 | * 2024-04-02: v1.13.19 - Fixed bug in string parsing, added tests, strings now no longer are hex-encoded as they are always printable anyway. 71 | * 2024-03-12: v1.13.18 - Added functionality to extract and store all referenced strings along SmdaFunctions (has to be enabled via SmdaConfig). 
72 | * 2024-03-12: v1.13.17 - Extended disassembleBuffer() to now take additional arguments `code_areas` and `oep`. 73 | * 2024-02-21: v1.13.16 - BREAKING IntelInstructionEscaper.escapeMnemonic: Escaper now handles another 200 instruction names found in other capstone source files (THX for reporting @malwarefrank!). 74 | * 2024-02-15: v1.13.15 - Fixed issues with version recognition in SmdaFunction which cause issues in MCRIT (THX to @ 75 | malwarefrank!) 76 | * 2024-02-02: v1.13.12 - Versions might be non-numerical, addressed that in SmdaFunction. 77 | * 2024-01-23: v1.13.11 - Introduced indicator in SmdaConfig for compatibility of instruction escaping. 78 | * 2024-01-23: v1.13.10 - Parsing of PE files should work again with lief >=0.14.0. 79 | * 2024-01-23: v1.13.9 - Improved parsing robustness for section/segment tables in ELF files, also now padding with zeroes when finding less content than expected physical size in a segment (THX for reporting @schrodyn!). 80 | * 2024-01-23: v1.13.8 - BREAKING adjustments to IntelInstructionEscaper.escapeMnemonic: Escaper now is capable of handling all known x86/x64 instructions in capstone (THX for reporting @schrodyn!). 81 | * 2023-12-01: v1.13.7 - Skip processing of Delphi structs for large files, workaround until this is properly reimplemented. 82 | * 2023-11-29: v1.13.6 - Made OpcodeHash an attribute with on-demand calculation to save processing time. 83 | * 2023-11-29: v1.13.3 - Implemented an alternative queue working with reference count based brackets in pursuit of accelerated processing. 84 | * 2023-11-28: v1.13.2 - IndirectCallAnalyzer will now analyze at most a configurable amount of calls per basic block, default 50. 85 | * 2023-11-21: v1.13.1 - SmdaBasicBlock now has `getPredecessors()` and `getSuccessors()`. 86 | * 2023-11-21: v1.13.0 - BREAKING adjustments to PicHashing (now wildcarding intraprocedural jumps in functions, additionally more immediates if within address space). 
Introduction of OpcodeHash (OpcHash), which wildcards all but prefixes and opcode bytes. 87 | * 2023-10-12: v1.12.7 - Bugfix for parsing Delphi structs. 88 | * 2023-09-15: v1.12.6 - Bugfix in BlockLocator (THX to @cccs-ay!). 89 | * 2023-08-28: v1.12.5 - Bugfix for address dereferencing where buffer sizes were not properly checked (THX to @yankovs!). 90 | * 2023-08-08: v1.12.4 - SmdaBasicBlock can now do getPicBlockHash(). 91 | * 2023-05-23: v1.12.3 - Fixed bugs in PE parser and Go parser. 92 | * 2023-05-08: v1.12.1 - Get rid of deprecation warning in IDA 8.0+. 93 | * 2023-03-24: v1.12.0 - SMDA now parses PE export directories for symbols, as well as MinGW DWARF information if available. 94 | * 2023-03-14: v1.11.2 - SMDA report now also contains SHA1 and MD5. 95 | * 2023-03-14: v1.11.1 - rendering dotGraph can now include API references instead of plain calls. 96 | * 2023-02-06: v1.11.0 - SmdaReport now has functionality to find a function/block by a given offset contained within in (THX to @cccs-ay!). 97 | * 2023-02-06: v1.10.0 - Adjusted to LIEF 0.12.3 API for binary parsing (THX to @lainswork!). 98 | * 2022-08-12: v1.9.1 - Added support for parsing intel MachO files, including Go parsing. 99 | * 2022-08-01: v1.8.0 - Added support for parsing Go function information (THX to @danielenders1!). 100 | * 2022-01-27: v1.7.0 - SmdaReports now contains a field `oep`; SmdaFunctions now indicate `is_exported` and can provide CodeXrefs via `getCodeInrefs()` and `getCodeOutrefs()`. (THX for the ideas: @mr-tz) 101 | * 2021-08-20: v1.6.0 - Bugfix for alignment calculation of binary mappings. (THX: @williballenthin) 102 | * 2021-08-19: v1.6.0 - Bugfix for truncation during ELF segment/section loading. API usage in ELF files is now resolved as well! (THX: @williballenthin) 103 | * 2020-10-30: v1.5.0 - PE section table now contained in SmdaReport and added `SmdaReport.getSection(offset)`. 104 | * 2020-10-26: v1.4.0 - Adding SmdaBasicBlock. 
Some convenience code to ease integration with capa.
"""Demo script: use SMDA to disassemble a file (loaded memory view), optionally
map it first and/or write the resulting report to a JSON file."""
import argparse
import json
import logging
import os
import re
import sys

# Pattern for a base address embedded in a file name, e.g. "dump_0x00400000".
# Fixed: the group definition was missing its name ("(?P[...]" is invalid and
# raises re.error); .group("base_addr") below requires the named group.
BASE_ADDR_PATTERN = re.compile("_0x(?P<base_addr>[0-9a-fA-F]{8,16})")


def parseBaseAddrFromArgs(args):
    """Return the base address to use for analysis.

    Precedence: explicit --base_addr (decimal or 0x-hex), then a
    "_0x<hexaddr>" token in the input file name, then 0.
    """
    if args.base_addr:
        parsed_base_addr = int(args.base_addr, 16) if args.base_addr.startswith("0x") else int(args.base_addr)
        logging.info("using provided base address: 0x%08x", parsed_base_addr)
        return parsed_base_addr
    # try to infer base addr from filename:
    baddr_match = BASE_ADDR_PATTERN.search(args.input_path)
    if baddr_match:
        parsed_base_addr = int(baddr_match.group("base_addr"), 16)
        logging.info("Parsed base address from file name: 0x%08x %d", parsed_base_addr, parsed_base_addr)
        return parsed_base_addr
    logging.warning("No base address recognized, using 0.")
    return 0


def parseOepFromArgs(args):
    """Return the OEP (given as RVA, decimal or 0x-hex) or None if not provided."""
    if args.oep:
        parsed_oep = int(args.oep, 16) if args.oep.startswith("0x") else int(args.oep)
        logging.info("using provided OEP(RVA): 0x%08x", parsed_oep)
        return parsed_oep
    logging.warning("No OEP recognized, skipping.")
    return None


def readFileContent(file_path):
    """Read and return the raw bytes of the file at file_path."""
    with open(file_path, "rb") as fin:
        return fin.read()


if __name__ == "__main__":
    # smda imports are deferred into the script body so the helper functions
    # above can be imported (and unit-tested) without smda being installed.
    from smda.SmdaConfig import SmdaConfig
    from smda.Disassembler import Disassembler

    PARSER = argparse.ArgumentParser(description='Demo: Use SMDA to disassemble a given file (loaded memory view), optionally map it first and/or write the output to a file.')
    PARSER.add_argument('-p', '--parse_header', action='store_true', default=False, help='Parse header/symbols and perform mapping of the file as normalization.')
    PARSER.add_argument('-d', '--pdb_path', type=str, default='', help='If available, use a PDB file to enhance disassembly (function offsets and names).')
    PARSER.add_argument('-r', '--architecture', type=str, default='', help='Use the disassembler for the following architecture if available (default:auto, options: [intel, cil]).')
    PARSER.add_argument('-a', '--base_addr', type=str, default='', help='When analyzing a buffer, set base address to given value (int or 0x-hex format).')
    PARSER.add_argument('-b', '--bitness', type=int, default=0, help='Optionally force bitness to [32, 64] when processing dumps.')
    PARSER.add_argument('-i', '--oep', type=str, default='', help='Force OEP for buffers, defined as RVA.')
    PARSER.add_argument('-o', '--output_path', type=str, default='', help='Optionally write the output to a file (JSON format).')
    PARSER.add_argument('-s', '--strings', action='store_true', default=False, help='Enable string extraction.')
    PARSER.add_argument('-v', '--verbose', action='store_true', default=False, help='Enable debug logging.')
    PARSER.add_argument('input_path', type=str, default='', help='Path to file to analyze.')

    ARGS = PARSER.parse_args()

    if not ARGS.input_path:
        PARSER.print_help()
        sys.exit(1)

    # optionally create and set up a config, e.g. when using ApiScout profiles for WinAPI import usage discovery
    config = SmdaConfig()
    if ARGS.verbose:
        config.LOG_LEVEL = logging.DEBUG
    if ARGS.strings:
        config.WITH_STRINGS = True
    logging.basicConfig(level=config.LOG_LEVEL, format=config.LOG_FORMAT)

    SMDA_REPORT = None
    INPUT_FILENAME = ""
    BITNESS = ARGS.bitness if ARGS.bitness in [32, 64] else None
    if os.path.isfile(ARGS.input_path):
        print("now analyzing {}".format(ARGS.input_path))
        INPUT_FILENAME = os.path.basename(ARGS.input_path)
        if ARGS.parse_header:
            DISASSEMBLER = Disassembler(config, backend=ARGS.architecture)
            SMDA_REPORT = DISASSEMBLER.disassembleFile(ARGS.input_path, pdb_path=ARGS.pdb_path)
        else:
            BUFFER = readFileContent(ARGS.input_path)
            BASE_ADDR = parseBaseAddrFromArgs(ARGS)
            OEP = parseOepFromArgs(ARGS)
            config.API_COLLECTION_FILES = {"win_7": os.sep.join([config.PROJECT_ROOT, "data", "apiscout_win7_prof-n_sp1.json"])}
            DISASSEMBLER = Disassembler(config, backend=ARGS.architecture)
            SMDA_REPORT = DISASSEMBLER.disassembleBuffer(BUFFER, BASE_ADDR, BITNESS, oep=OEP)
        SMDA_REPORT.filename = os.path.basename(ARGS.input_path)
        print(SMDA_REPORT)
    if SMDA_REPORT and os.path.isdir(ARGS.output_path):
        with open(ARGS.output_path + os.sep + INPUT_FILENAME + ".smda", "w") as fout:
            json.dump(SMDA_REPORT.toDict(), fout, indent=1, sort_keys=True)
def detectBackend():
    """Detect whether this script is running inside a supported disassembler.

    Returns:
        tuple: (backend, version) -- ("IDA", <IDA_SDK_VERSION>) when the IDA
        Python modules are importable, otherwise ("", "").
    """
    backend = ""
    version = ""
    try:
        import idaapi
        import idautils  # imported only to verify the IDA environment is complete
        backend = "IDA"
        version = idaapi.IDA_SDK_VERSION
    except ImportError:
        # narrowed from a bare "except:", which also swallowed
        # SystemExit/KeyboardInterrupt; absence of the modules simply
        # means we are not running inside IDA.
        pass
    return (backend, version)
"""IDA script: disassemble the loaded binary with SMDA and re-apply the
recovered functions and names to the open IDB."""
from smda.SmdaConfig import SmdaConfig
from smda.Disassembler import Disassembler

from export import detectBackend


if __name__ == "__main__":
    BACKEND, VERSION = detectBackend()
    if BACKEND != "IDA":
        raise Exception("Run this script from within IDA.")
    # only import the IDA bridge once we know we are inside IDA
    from smda.ida.IdaInterface import IdaInterface
    ida_interface = IdaInterface()
    binary = ida_interface.getBinary()
    base_addr = ida_interface.getBaseAddr()
    DISASSEMBLER = Disassembler(SmdaConfig())
    REPORT = DISASSEMBLER.disassembleBuffer(binary, base_addr)
    smda_function_count = 0
    smda_name_count = 0
    for smda_function in REPORT.getFunctions():
        # makeFunction()/makeNameEx() report success, which we tally up
        smda_function_count += ida_interface.makeFunction(smda_function.offset)
        if smda_function.function_name != "":
            smda_name_count += ida_interface.makeNameEx(smda_function.offset, smda_function.function_name)
    print(f"Defined {smda_function_count} functions and assigned {smda_name_count} function names.")
# -*- coding: utf-8 -*-
"""Package setup for smda."""
import sys
from setuptools import setup, find_packages

with open("README.md", "r") as fh:
    long_description = fh.read()


requirements = ["capstone", "dncil", "dnfile"]

if sys.version_info >= (3, 0):
    # py3
    requirements.append("lief>=0.16.0")
else:
    # py2 - newer LIEF is Python3 only (legacy branch; the project targets 3.8+)
    requirements.append("lief==0.9.0")


setup(
    name='smda',
    # note to self: always change this in config as well.
    version='2.0.2',
    # fixed typo: "disassmbler" -> "disassembler" in the PyPI description
    description='A recursive disassembler optimized for CFG recovery from memory dumps. Based on capstone.',
    long_description_content_type="text/markdown",
    long_description=long_description,
    author='Daniel Plohmann',
    author_email='daniel.plohmann@mailbox.org',
    url='https://github.com/danielplohmann/smda',
    license="BSD 2-Clause",
    packages=find_packages(exclude=('tests', 'docs')),
    install_requires=requirements,
    data_files=[
        ('', ['LICENSE']),
    ],
    classifiers=[
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Topic :: Security",
        "Topic :: Software Development :: Disassemblers",
    ],
)
LOGGER = logging.getLogger(__name__)


class Disassembler(object):
    """Entry point for SMDA analysis.

    Dispatches files/buffers to an architecture-specific backend (intel, cil,
    or the IDA exporter) and wraps the results in a SmdaReport.
    """

    def __init__(self, config=None, backend=None):
        """
        Args:
            config: optional SmdaConfig; a default config is created if omitted.
            backend: optional backend selector ("intel", "cil", "IDA"); when
                None, the backend is chosen lazily per input architecture.
        """
        if config is None:
            config = SmdaConfig()
        self.config = config
        self.disassembler = None
        if backend == "intel":
            self.disassembler = IntelDisassembler(self.config)
        elif backend == "cil":
            self.disassembler = CilDisassembler(self.config)
        elif backend == "IDA":
            self.disassembler = IdaExporter(self.config)
        self._start_time = None
        self._timeout = 0
        # cache the last DisassemblyResult
        self.disassembly = None

    def initDisassembler(self, architecture="intel"):
        """ Initialize disassembler backend to given architecture, if not initialized yet, default: intel """
        if self.disassembler is None:
            if architecture == "intel":
                self.disassembler = IntelDisassembler(self.config)
            elif architecture == "cil":
                self.disassembler = CilDisassembler(self.config)

    def _getDurationInSeconds(self, start_ts, end_ts):
        """Return (end_ts - start_ts) as a float of seconds.

        Uses total_seconds() so that durations of a day or more are not
        truncated (timedelta.seconds alone silently drops the day component).
        """
        return (end_ts - start_ts).total_seconds()

    def _callbackAnalysisTimeout(self):
        """Return True when the configured analysis timeout has been exceeded."""
        if not self._timeout:
            return False
        time_diff = datetime.datetime.now(datetime.timezone.utc) - self._start_time
        LOGGER.debug("Current analysis callback time %s", time_diff)
        # total_seconds() instead of .seconds: correct even for >1 day runtimes
        return time_diff.total_seconds() >= self._timeout

    def _addStringsToReport(self, smda_report, buffer):
        """Attach the buffer and per-function string references to the report."""
        smda_report.buffer = buffer
        for smda_function in smda_report.getFunctions():
            smda_function.stringrefs = {addr: string for string, addr in extract_strings(smda_function)}

    def _initBinaryInfoFromLoader(self, loader, file_path):
        """Build a BinaryInfo from a loader (shared by file/unmapped-buffer paths).

        Hashes are computed over the raw (unmapped) data, not over how the
        loader mapped it to memory.
        """
        binary_info = BinaryInfo(loader.getData())
        binary_info.raw_data = loader.getRawData()
        binary_info.sha256 = hashlib.sha256(binary_info.raw_data).hexdigest()
        binary_info.sha1 = hashlib.sha1(binary_info.raw_data).hexdigest()
        binary_info.md5 = hashlib.md5(binary_info.raw_data).hexdigest()
        binary_info.file_path = file_path
        binary_info.base_addr = loader.getBaseAddress()
        binary_info.bitness = loader.getBitness()
        binary_info.architecture = loader.getArchitecture()
        binary_info.code_areas = loader.getCodeAreas()
        return binary_info

    def disassembleFile(self, file_path, pdb_path=""):
        """Disassemble the file at file_path; optionally use a PDB for symbols.

        Returns a SmdaReport; on failure, a report with status "error".
        """
        loader = FileLoader(file_path, map_file=True)
        file_content = loader.getData()
        binary_info = self._initBinaryInfoFromLoader(loader, file_path)
        self.initDisassembler(binary_info.architecture)
        start = datetime.datetime.now(datetime.timezone.utc)
        try:
            self.disassembler.addPdbFile(binary_info, pdb_path)
            smda_report = self._disassemble(binary_info, timeout=self.config.TIMEOUT)
            if self.config.WITH_STRINGS:
                self._addStringsToReport(smda_report, file_content)
            if self.config.STORE_BUFFER:
                smda_report.buffer = file_content
        except Exception as exc:
            LOGGER.error("An error occurred while disassembling file.")
            smda_report = self._createErrorReport(start, exc)
        return smda_report

    def disassembleUnmappedBuffer(self, file_content):
        """Map a raw file buffer and disassemble it.

        Returns a SmdaReport; on failure, a report with status "error".
        """
        loader = MemoryFileLoader(file_content, map_file=True)
        mapped_content = loader.getData()
        binary_info = self._initBinaryInfoFromLoader(loader, "")
        self.initDisassembler(binary_info.architecture)
        start = datetime.datetime.now(datetime.timezone.utc)
        try:
            smda_report = self._disassemble(binary_info, timeout=self.config.TIMEOUT)
            if self.config.WITH_STRINGS:
                self._addStringsToReport(smda_report, mapped_content)
            if self.config.STORE_BUFFER:
                smda_report.buffer = mapped_content
        except Exception as exc:
            LOGGER.error("An error occurred while disassembling unmapped buffer.")
            smda_report = self._createErrorReport(start, exc)
        return smda_report

    def disassembleBuffer(self, file_content, base_addr, bitness=None, code_areas=None, oep=None, architecture="intel"):
        """
        Disassemble a given buffer (file_content), with given base_addr.
        Optionally specify bitness, the areas to which disassembly should be limited to (code_areas) and an entry point (oep)
        """
        binary_info = BinaryInfo(file_content)
        binary_info.base_addr = base_addr
        binary_info.bitness = bitness
        binary_info.is_buffer = True
        binary_info.code_areas = code_areas
        binary_info.architecture = architecture
        binary_info.oep = oep
        self.initDisassembler(binary_info.architecture)
        start = datetime.datetime.now(datetime.timezone.utc)
        try:
            smda_report = self._disassemble(binary_info, timeout=self.config.TIMEOUT)
            if self.config.WITH_STRINGS:
                self._addStringsToReport(smda_report, file_content)
            if self.config.STORE_BUFFER:
                smda_report.buffer = file_content
        except Exception as exc:
            LOGGER.error("An error occurred while disassembling buffer.")
            smda_report = self._createErrorReport(start, exc)
        return smda_report

    def _disassemble(self, binary_info, timeout=0):
        """Run the backend on binary_info, caching the DisassemblyResult."""
        self._start_time = datetime.datetime.now(datetime.timezone.utc)
        self._timeout = timeout
        self.disassembly = self.disassembler.analyzeBuffer(binary_info, self._callbackAnalysisTimeout)
        return SmdaReport(self.disassembly, config=self.config)

    def _createErrorReport(self, start, exception):
        """Produce a SmdaReport describing a failed analysis (status "error")."""
        report = SmdaReport(config=self.config)
        report.smda_version = self.config.VERSION
        report.status = "error"
        report.execution_time = self._getDurationInSeconds(start, datetime.datetime.now(datetime.timezone.utc))
        report.message = traceback.format_exc()
        return report
num_recursive_functions = None 7 | num_leaf_functions = None 8 | num_basic_blocks = None 9 | num_instructions = None 10 | num_api_calls = None 11 | num_function_calls = None 12 | num_failed_functions = None 13 | num_failed_instructions = None 14 | 15 | def __init__(self, disassembly_result=None): 16 | if disassembly_result is not None: 17 | self.num_functions = len(disassembly_result.functions) 18 | self.num_recursive_functions = len(disassembly_result.recursive_functions) 19 | self.num_leaf_functions = len(disassembly_result.leaf_functions) 20 | self.num_basic_blocks = self._countBlocks(disassembly_result) 21 | self.num_instructions = self._countInstructions(disassembly_result) 22 | self.num_api_calls = self._countApiCalls(disassembly_result) 23 | self.num_function_calls = self._countFunctionCalls(disassembly_result) 24 | self.num_failed_functions = len(disassembly_result.failed_analysis_addr) 25 | self.num_failed_instructions = len(disassembly_result.errors) 26 | 27 | def _countBlocks(self, disassembly_result): 28 | num_blocks = 0 29 | for _, blocks in disassembly_result.functions.items(): 30 | num_blocks += len(blocks) 31 | return num_blocks 32 | 33 | def _countApiCalls(self, disassembly_result): 34 | return len(disassembly_result.getAllApiRefs()) 35 | 36 | def _countInstructions(self, disassembly_result): 37 | num_ins = 0 38 | for function_offset in sorted(disassembly_result.functions): 39 | for block in disassembly_result.functions[function_offset]: 40 | num_ins += len(block) 41 | return num_ins 42 | 43 | def _countFunctionCalls(self, disassembly_result): 44 | num_calls = 0 45 | for function_start in disassembly_result.functions: 46 | if function_start in disassembly_result.code_refs_to: 47 | num_calls += len(disassembly_result.code_refs_to[function_start]) 48 | return num_calls 49 | 50 | @classmethod 51 | def fromDict(cls, statistics_dict): 52 | statistics = cls(None) 53 | statistics.num_functions = statistics_dict["num_functions"] 54 | 
statistics.num_recursive_functions = statistics_dict["num_recursive_functions"] 55 | statistics.num_leaf_functions = statistics_dict["num_leaf_functions"] 56 | statistics.num_basic_blocks = statistics_dict["num_basic_blocks"] 57 | statistics.num_instructions = statistics_dict["num_instructions"] 58 | statistics.num_api_calls = statistics_dict["num_api_calls"] 59 | statistics.num_function_calls = statistics_dict["num_function_calls"] 60 | statistics.num_failed_functions = statistics_dict["num_failed_functions"] 61 | statistics.num_failed_instructions = statistics_dict["num_failed_instructions"] 62 | return statistics 63 | 64 | def toDict(self): 65 | return { 66 | "num_functions": self.num_functions, 67 | "num_recursive_functions": self.num_recursive_functions, 68 | "num_leaf_functions": self.num_leaf_functions, 69 | "num_basic_blocks": self.num_basic_blocks, 70 | "num_instructions": self.num_instructions, 71 | "num_api_calls": self.num_api_calls, 72 | "num_function_calls": self.num_function_calls, 73 | "num_failed_functions": self.num_failed_functions, 74 | "num_failed_instructions": self.num_failed_instructions 75 | } 76 | 77 | def __add__(self, other): 78 | if not isinstance(other, DisassemblyStatistics): 79 | raise ValueError("Needs another DisassemblyStatistics to perform addition of values") 80 | self.num_functions += other.num_functions 81 | self.num_recursive_functions += other.num_recursive_functions 82 | self.num_leaf_functions += other.num_leaf_functions 83 | self.num_basic_blocks += other.num_basic_blocks 84 | self.num_instructions += other.num_instructions 85 | self.num_api_calls += other.num_api_calls 86 | self.num_function_calls += other.num_function_calls 87 | self.num_failed_functions += other.num_failed_functions 88 | self.num_failed_instructions += other.num_failed_instructions 89 | return self 90 | -------------------------------------------------------------------------------- /smda/SmdaConfig.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | 5 | class SmdaConfig(object): 6 | 7 | # note to self: always change this in setup.py as well! 8 | VERSION = "2.0.2" 9 | ESCAPER_DOWNWARD_COMPATIBILITY = "1.13.16" 10 | CONFIG_FILE_PATH = str(os.path.abspath(__file__)) 11 | PROJECT_ROOT = str(os.path.abspath(os.sep.join([CONFIG_FILE_PATH, "..", ".."]))) 12 | 13 | ### An (optional) WinAPI database as generated by ApiScout (https://github.com/danielplohmann/apiscout) 14 | API_COLLECTION_FILES = {} 15 | ### global logging-config setup 16 | # Only do basicConfig if no handlers have been configured 17 | LOG_PATH = "./" 18 | LOG_LEVEL = logging.INFO 19 | LOG_FORMAT = "%(asctime)-15s: %(name)-32s - %(message)s" 20 | 21 | ### SMDA disassembler config 22 | # maximum time in seconds for disassembly to complete 23 | TIMEOUT = 300 24 | # maximum number of bytes to allocate while loading 25 | MAX_IMAGE_SIZE = 100 * 1024 * 1024 26 | # store raw binary buffer in SmdaReport to enable carving data from refs 27 | STORE_BUFFER = False 28 | # extract strings during disassembly 29 | WITH_STRINGS = False 30 | # the queue to use for candidate management 31 | CANDIDATE_QUEUE = "PriorityQueue" # choose from: ["BracketQueue", "PriorityQueue"] 32 | # improve disassembly by resolving references through data flows 33 | USE_ALIGNMENT = True 34 | USE_SYMBOLS_AS_CANDIDATES = True 35 | RESOLVE_REGISTER_CALLS = True 36 | # limit this to avoid blowing up analysis time for weird samples 37 | MAX_INDIRECT_CALLS_PER_BASIC_BLOCK = 50 38 | HIGH_ACCURACY = True 39 | RESOLVE_TAILCALLS = False 40 | # optional metadata generation options 41 | CALCULATE_SCC = True 42 | CALCULATE_NESTING = True 43 | CALCULATE_HASHING = True 44 | # confidence score to use for filtering functions before including them in the output 45 | CONFIDENCE_THRESHOLD = 0.0 46 | -------------------------------------------------------------------------------- 
import logging

LOGGER = logging.getLogger(__name__)

# CIL mnemonics that transfer control to another function (do not end a block's fallthrough)
CALL_INS = ["call", "calli", "callvirt"]
# CIL mnemonics that terminate a function/basic block
END_INS = ["ret"]

class FunctionAnalysisState(object):
    """Mutable working state while disassembling one CIL function.

    Collects instructions, intra-function code/data references and flags, and
    on finalizeAnalysis() commits them into the shared disassembly result.
    """

    def __init__(self, start_addr, code_start_addr, disassembly):
        self.start_addr = start_addr
        self.code_start_addr = code_start_addr
        self.disassembly = disassembly
        self.block_queue = [start_addr]
        self.current_block = []
        self.blocks = []
        self.num_blocks_analyzed = 0
        # instructions as tuples: (address, size, mnemonic, op_str, bytes)
        self.instructions = []
        self.instruction_start_bytes = set([])
        self.processed_blocks = set([])
        self.processed_bytes = set([])
        # addresses that are targets of CFG redirections (used as block starts)
        self.jump_targets = set([])
        self.call_register_ins = []
        self.block_start = 0xFFFFFFFF
        self.data_bytes = set([])
        self.data_refs = set([])
        self.code_refs = set([])
        self.code_refs_from = {}
        self.code_refs_to = {}
        self.prev_opcode = ""
        self.suspicious_ins_count = 0
        self.is_jmp = False
        self.is_next_instruction_reachable = True
        self.is_block_ending_instruction = False
        self.is_sanely_ending = False
        self.has_collision = False
        self.colliding_addresses = set()
        # set a flag that this tailcall has already been resolved so it does not have to be reanalyzed several times
        self.is_tailcall_function = False
        self.is_leaf_function = True
        self.is_recursive = False
        self.is_thunk_call = False
        self.label = ""

    def addInstruction(self, i_address, i_size, i_mnemonic, i_op_str, i_bytes):
        """Record one instruction and, if reachable, a fallthrough code ref to its successor."""
        ins = (i_address, i_size, i_mnemonic, i_op_str, i_bytes)
        self.instructions.append(ins)
        self.instruction_start_bytes.add(ins[0])
        # mark every byte the instruction occupies as processed
        for byte in range(i_size):
            self.processed_bytes.add(i_address + byte)
        if self.is_next_instruction_reachable:
            self.addCodeRef(i_address, i_address + i_size, self.is_jmp)
        self.is_jmp = False

    def addCodeRef(self, addr_from, addr_to, by_jump=False):
        """Record a directed code reference; jump targets additionally become block starts."""
        self.code_refs.update([(addr_from, addr_to)])
        refs_from = self.code_refs_from.get(addr_from, set([]))
        refs_from.update([addr_to])
        self.code_refs_from[addr_from] = refs_from
        refs_to = self.code_refs_to.get(addr_to, set([]))
        refs_to.update([addr_from])
        self.code_refs_to[addr_to] = refs_to
        if by_jump:
            self.is_jmp = True
            self.jump_targets.update([addr_to])

    def removeCodeRef(self, addr_from, addr_to):
        """Undo addCodeRef(): drop the edge from all indexes and the jump-target set."""
        if (addr_from, addr_to) in self.code_refs:
            self.code_refs.remove((addr_from, addr_to))
        if addr_from in self.code_refs_from and addr_to in self.code_refs_from[addr_from]:
            self.code_refs_from[addr_from].remove(addr_to)
        if addr_to in self.code_refs_to and addr_from in self.code_refs_to[addr_to]:
            self.code_refs_to[addr_to].remove(addr_from)
        if addr_to in self.jump_targets:
            self.jump_targets.remove(addr_to)

    def addDataRef(self, addr_from, addr_to, size=1):
        """Record a data reference and mark the referenced byte range as data."""
        self.data_refs.update([(addr_from, addr_to)])
        for i in range(size):
            self.data_bytes.update([addr_to + i])

    def finalizeAnalysis(self, as_gap=False):
        """Commit the collected per-function state into the shared disassembly result."""
        fn_min = min([ins[0] for ins in self.instructions])
        fn_max = max([ins[0] + ins[1] for ins in self.instructions])

        self.disassembly.function_symbols[self.start_addr] = self.label
        self.disassembly.function_borders[self.start_addr] = (fn_min, fn_max)
        for ins in self.instructions:
            self.disassembly.instructions[ins[0]] = (ins[2], ins[1])
            # map every covered byte back to its instruction and owning function
            for offset in range(ins[1]):
                self.disassembly.code_map[ins[0] + offset] = ins[0]
                self.disassembly.ins2fn[ins[0] + offset] = self.start_addr
        self.disassembly.data_map.update(self.data_bytes)
        self.disassembly.functions[self.start_addr] = self.getBlocks()
        for cref in self.code_refs:
            self.disassembly.addCodeRefs(cref[0], cref[1])
        for dref in self.data_refs:
            self.disassembly.addDataRefs(dref[0], dref[1])
        if self.is_recursive:
            self.disassembly.recursive_functions.add(self.start_addr)
        if self.is_leaf_function:
            self.disassembly.leaf_functions.add(self.start_addr)
        if self.is_thunk_call:
            self.disassembly.thunk_functions.add(self.start_addr)
        return True

    def getBlocks(self):
        """
        block derivation strategy:
        walk over all potential block starts, which are the start_addr + all "jump" targets (i.e. CFG redirection targets)
        then, for consecutive instructions, break if
        * they have more than 1 outgoing edge
        * the following instruction has more than 1 incoming edge
        """
        if self.blocks:
            # cached from a previous derivation
            return self.blocks
        self.instructions.sort()
        # map instruction address -> index into the sorted instruction list
        ins = {i[0]:ind for ind, i in enumerate(self.instructions)}
        potential_starts = set([self.code_start_addr])
        potential_starts.update(list(self.jump_targets))
        blocks = []
        for start in sorted(potential_starts):
            if not start in ins:
                continue
            block = []
            for i in range(ins[start], len(self.instructions)):
                current = self.instructions[i]
                block.append(current)
                # if one code reference is to another address than the next
                if current[0] in self.code_refs_from:
                    if not current[2] in CALL_INS and not i == len(self.instructions) - 1:
                        if any([r != self.instructions[i+1][0] for r in self.code_refs_from[current[0]]]):
                            break
                    # if we can reach a colliding address from here, the block is broken and should end.
                    reachable_collisions = self.code_refs_from[current[0]].intersection(self.colliding_addresses)
                    next_addr = current[0] + current[1]
                    is_next_addr = next_addr in reachable_collisions
                    if reachable_collisions and is_next_addr:
                        # we should remove the from/to code references for this collision as there should be no non CFG instruction references between instructions of different functions
                        self.removeCodeRef(current[0], next_addr)
                        break
                # break before an instruction with >1 inbound edge or that is itself a block start
                if not i == len(self.instructions) - 1 and self.instructions[i+1][0] in self.code_refs_to:
                    if len(self.code_refs_to[self.instructions[i+1][0]]) > 1 or self.instructions[i+1][0] in potential_starts:
                        break
                if current[2] in END_INS:
                    break
            if block:
                blocks.append(block)
        self.blocks = blocks
        return self.blocks

    def isProcessed(self, addr):
        """True if addr lies within a byte already consumed by some instruction."""
        return addr in self.processed_bytes

    def isProcessedFunction(self):
        """True if this function's entry point was already disassembled globally."""
        return self.start_addr in self.disassembly.code_map

    def isNextInstructionReachable(self):
        return self.is_next_instruction_reachable

    def setNextInstructionReachable(self, is_reachable):
        self.is_next_instruction_reachable = is_reachable

    def __str__(self):
        # compact one-line debug summary of the analysis state
        result = "0x{:x} | current: 0x{:x} | blocks: {} | queue: {} | processed: {} | crefs: {} | drefs: {} | suspicious: {} | ending: {}".format(
            self.start_addr,
            self.block_start,
            len(self.getBlocks()),
            ",".join(["0x%x" % b for b in sorted(self.block_queue)]),
            ",".join(["0x%x" % b for b in sorted(list(self.processed_blocks))]),
            len(self.code_refs),
            len(self.data_refs),
            self.suspicious_ins_count,
            self.is_sanely_ending
        )
        return result
https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/smda/cil/__init__.py -------------------------------------------------------------------------------- /smda/common/BasicBlock.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class BasicBlock(object): 4 | 5 | def __init__(self): 6 | self.start_addr = 0 7 | self.end_addr = 0 8 | self.instructions = [] 9 | self.successors = [] 10 | 11 | def __str__(self): 12 | return "0x%x - 0x%x (%d) -> [%s]" % (self.start_addr, self.end_addr, len(self.instructions), ", ".join(["0x%x" % ref for ref in self.successors])) 13 | -------------------------------------------------------------------------------- /smda/common/BinaryInfo.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import lief 4 | lief.logging.disable() 5 | 6 | 7 | class BinaryInfo(object): 8 | """ simple DTO to contain most information related to the binary/buffer to be analyzed """ 9 | 10 | architecture = "" 11 | base_addr = 0 12 | binary = b"" 13 | raw_data = b"" 14 | binary_size = 0 15 | bitness = None 16 | code_areas = [] 17 | component = "" 18 | family = "" 19 | file_path = "" 20 | is_library = False 21 | is_buffer = False 22 | sha256 = "" 23 | sha1 = "" 24 | md5 = "" 25 | version = "" 26 | exported_functions = None 27 | oep = None 28 | 29 | def __init__(self, binary): 30 | self.binary = binary 31 | self.raw_data = binary 32 | self.binary_size = len(binary) 33 | self.sha256 = hashlib.sha256(binary).hexdigest() 34 | self.sha1 = hashlib.sha1(binary).hexdigest() 35 | self.md5 = hashlib.md5(binary).hexdigest() 36 | 37 | def getOep(self): 38 | if self.oep is None: 39 | lief_result = lief.parse(self.raw_data) 40 | if isinstance(lief_result, lief.PE.Binary): 41 | self.oep = lief_result.optional_header.addressof_entrypoint 42 | elif isinstance(lief_result, lief.ELF.Binary): 43 | self.oep = lief_result.header.entrypoint 44 
| return self.oep 45 | 46 | def getExportedFunctions(self): 47 | if self.exported_functions is None: 48 | lief_result = lief.parse(self.raw_data) 49 | if isinstance(lief_result, lief.PE.Binary) or isinstance(lief_result, lief.ELF.Binary): 50 | self.exported_functions = {} 51 | for function in lief_result.exported_functions: 52 | self.exported_functions[function.address] = function.name 53 | return self.exported_functions 54 | 55 | def getSections(self): 56 | pefile = lief.parse(self.raw_data) 57 | # TODO 20201030 might want to add ELF sections as well 58 | if not isinstance(pefile, lief.PE.Binary): 59 | return 60 | if pefile and pefile.sections: 61 | for section in pefile.sections: 62 | section_start = self.base_addr + section.virtual_address 63 | section_size = section.virtual_size 64 | if section_size % 0x1000 != 0: 65 | section_size += 0x1000 - (section_size % 0x1000) 66 | section_end = section_start + section_size 67 | yield section.name, section_start, section_end 68 | 69 | def isInCodeAreas(self, address): 70 | is_inside = False 71 | # if no code areas found, assume the whole image is code and calculate according to base address and size 72 | if len(self.code_areas) == 0: 73 | if self.base_addr <= address <= self.base_addr + self.binary_size: 74 | is_inside = True 75 | else: 76 | is_inside = any([a[0] <= address < a[1] for a in self.code_areas]) 77 | return is_inside 78 | 79 | def getHeaderBytes(self): 80 | if self.raw_data: 81 | lief_result = lief.parse(self.raw_data) 82 | if isinstance(lief_result, lief.PE.Binary): 83 | return self.raw_data[:0x400] 84 | elif isinstance(lief_result, lief.ELF.Binary): 85 | return self.raw_data[:0x40] 86 | return None 87 | -------------------------------------------------------------------------------- /smda/common/BlockLocator.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import bisect 3 | 4 | 5 | class BlockLocator(): 6 | """ Class that finds a block by any 
address within. 7 | When instantiated, creates the required data structures. 8 | """ 9 | 10 | sorted_blocks_addresses = None 11 | blocks_dict = None 12 | 13 | def __init__(self, functions): 14 | # Instantiate the datastructures required : 15 | # 1. get a flat list of all the blocks in all the functions 16 | blocks = list(itertools.chain(*[f.getBlocks() for f in functions])) 17 | self.sorted_blocks_addresses = sorted(b.offset for b in blocks) 18 | 19 | # 2 a dict of blocks by addresses 20 | self.blocks_dict = {b.offset:b for b in blocks} 21 | 22 | def _get_block_end(self, block): 23 | last_ins = block.instructions[-1] 24 | return last_ins.offset + len(last_ins.bytes) // 2 # bytes is actuall a hex string 25 | 26 | def findBlockByContainedAddress(self, inner_address): 27 | # do a binary search to find the closest address to the left of inner_address 28 | block_num = bisect.bisect(self.sorted_blocks_addresses, inner_address) - 1 29 | 30 | if block_num == -1: 31 | # target address is smaller than first block. 
return none 32 | return None 33 | 34 | block_start = self.sorted_blocks_addresses[block_num] 35 | block = self.blocks_dict[block_start] 36 | block_end = self._get_block_end(block) 37 | 38 | # make sure inner_address falls within the selected block 39 | if block.offset <= inner_address < block_end: 40 | return block 41 | 42 | return None 43 | -------------------------------------------------------------------------------- /smda/common/CodeXref.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class CodeXref(object): 4 | 5 | def __init__(self, smda_ins_from, smda_ins_to): 6 | self.smda_ins_from = smda_ins_from 7 | self.smda_ins_to = smda_ins_to 8 | 9 | @property 10 | def from_function(self): 11 | return self.smda_ins_from.smda_function 12 | 13 | @property 14 | def to_function(self): 15 | return self.smda_ins_to.smda_function 16 | 17 | @property 18 | def from_instruction(self): 19 | return self.smda_ins_from 20 | 21 | @property 22 | def to_instruction(self): 23 | return self.smda_ins_to 24 | 25 | def __str__(self): 26 | return "0x%x (0x%x) -> 0x%x (0x%x)" % (self.smda_ins_from.offset, self.smda_ins_from.smda_function.offset, self.smda_ins_to.offset, self.smda_ins_to.smda_function.offset) 27 | 28 | def __repr__(self): 29 | return "".format(self.smda_ins_from.offset, self.smda_ins_to.offset) 30 | -------------------------------------------------------------------------------- /smda/common/DominatorTree.py: -------------------------------------------------------------------------------- 1 | # Implementation by Armin Rigo 2 | # source: https://bitbucket.org/arigo/arigo/src/default/hack/pypy-hack/heapstats/dominator.py 3 | 4 | # Implementation following: 5 | # 6 | # Lengauer, Thomas; and Tarjan, Robert Endre (July 1979). 7 | # "A fast algorithm for finding dominators in a flowgraph". 8 | # ACM Transactions on Programming Languages and Systems (TOPLAS) 1 (1): 9 | # 121-141. 
# Dominator computation following Lengauer & Tarjan (TOPLAS 1979),
# implementation originally by Armin Rigo.
# http://portal.acm.org/ft_gateway.cfm?id=357071

import logging

LOGGER = logging.getLogger(__name__)

class DominatorTree(object):
    """Lengauer-Tarjan dominator computation over a successor-map graph G rooted at r.

    G maps node -> list of successor nodes; after compute(), self.dom maps each
    reachable node (except the root) to its immediate dominator.
    """

    def __init__(self, G, r):
        assert r in G
        self.succ = G
        self.r = r

    def init_variables(self):
        """Reset all working structures used by the algorithm."""
        self.parent = {}
        self.pred = {}
        self.semi = {}
        self.vertex = []
        self.bucket = {}
        self.dom = {}
        self.ancestor = {}
        self.label = {}
        for v in self.succ:
            self.pred[v] = set()
            self.bucket[v] = set()

    def depth_first_search(self, v):
        """Iterative DFS: numbers vertices (semi), records DFS parents and predecessors."""
        stack = [v]
        while stack:
            v = stack.pop()
            n = len(self.vertex)
            self.semi[v] = n
            self.vertex.append(v)
            for w in self.succ[v]:
                self.pred[w].add(v)
                if w not in self.semi:
                    self.parent[w] = v
                    self.semi[w] = None  # temporarily
                    stack.append(w)

    def LINK(self, v, w):
        """Add edge (v, w) to the virtual forest used by EVAL."""
        self.ancestor[w] = v

    def EVAL(self, v):
        """Return the vertex with minimal semidominator on the forest path to v."""
        if v not in self.ancestor:
            return v
        else:
            self.COMPRESS(v)
            return self.label.get(v, v)

    def COMPRESS(self, v):
        """Path compression on the forest, propagating minimal-semi labels."""
        if self.ancestor[v] in self.ancestor:
            self.COMPRESS(self.ancestor[v])
            w = self.ancestor[v]
            if self.semi[self.label.get(w, w)] < self.semi[self.label.get(v, v)]:
                self.label[v] = self.label.get(w, w)
            self.ancestor[v] = self.ancestor[w]

    def steps_2_3(self):
        """Compute semidominators (step 2) and implicit immediate dominators (step 3)."""
        for w in self.vertex[:0:-1]:
            # step 2
            for v in self.pred[w]:
                u = self.EVAL(v)
                if self.semi[u] < self.semi[w]:
                    self.semi[w] = self.semi[u]
            self.bucket[self.vertex[self.semi[w]]].add(w)
            self.LINK(self.parent[w], w)
            # step 3
            for v in list(self.bucket[self.parent[w]]):
                self.bucket[self.parent[w]].remove(v)
                u = self.EVAL(v)
                if self.semi[u] < self.semi[v]:
                    self.dom[v] = u
                else:
                    self.dom[v] = self.parent[w]

    def step_4(self):
        """Finalize immediate dominators in DFS order."""
        for w in self.vertex[1:]:
            if self.dom[w] != self.vertex[self.semi[w]]:
                self.dom[w] = self.dom[self.dom[w]]

    def compute(self):
        """Run the full four-step Lengauer-Tarjan algorithm."""
        self.init_variables()
        self.depth_first_search(self.r)
        self.steps_2_3()
        self.step_4()


def fix_graph(graph):
    """Return a copy of graph where every referenced node also has a (possibly empty) successor list."""
    expanded_graph = {}
    for key, values in graph.items():
        expanded_graph[key] = values
        for value in values:
            if value not in expanded_graph:
                expanded_graph[value] = []
    return expanded_graph


# Calculation of Nesting Depth by walking down dominators and summarizing weights
# Implementation by Steffen Enders and Daniel Plohmann

def build_dominator_tree(G, r):
    """Build the dominator tree of G rooted at r as {dominator: [dominated, ...]}; None if r is unknown."""
    expanded_graph = fix_graph(G)
    if not r in expanded_graph:
        LOGGER.debug("r not in G: %s %s", r, G)
        return None
    domtree = DominatorTree(expanded_graph, r)
    domtree.compute()
    # invert idom mapping into child lists
    inverted = {}
    for key, value in domtree.dom.items():
        if value not in inverted:
            inverted[value] = []
        inverted[value].append(key)
    return inverted

def get_nesting_depth(graph, domtree, root):
    """Maximum count of branching ("significant") nodes along any root-to-leaf path of the dominator tree."""
    expanded_graph = fix_graph(graph)
    # nodes reached by a branching predecessor (>1 successor)
    significant_nodes = set.union(*([set(v) for v in expanded_graph.values() if len(v) > 1] + [set()]))
    def maximum_costs(cn):
        if cn not in domtree or not domtree[cn]:
            return (1 if cn in significant_nodes else 0)
        val = max(maximum_costs(n) for n in domtree[cn]) + (1 if cn in significant_nodes else 0)
        return val
    try:
        return maximum_costs(root)
    # FIX: was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt.
    # `Exception` still covers the intended failure modes (e.g. RecursionError on
    # very deep trees, TypeError when domtree is None).
    except Exception:
        return 0
10240: [10244, 10246], 10244: [10246], 10246: [10240, 10253], 10253: [10229, 10261]}, 150 | "smda_function": 10208, 151 | "fixed": {10208: [10229], 10229: [10240, 10253], 10240: [10244, 10246], 10253: [10229, 10261], 10244: [10246], 10246: [10240, 10253], 10261: []}, 152 | "dt": {10240: [10244, 10246], 10229: [10240, 10253], 10253: [10261], 10208: [10229]}, 153 | "nd": 3 154 | }, { 155 | "smda": {1: [2], 2: [3, 4, 6], 3: [5], 4: [5], 5: [2]}, 156 | "smda_function": 1, 157 | "fixed": {1: [2], 2: [3, 4, 6], 3: [5], 4: [5], 6: [], 5: [2]}, 158 | "dt": {2: [3, 4, 5, 6], 1: [2]}, 159 | "nd": 1 160 | }, { 161 | "smda": {1: [2], 2: [3, 6], 3: [41, 42], 41: [5], 42: [5], 5: [2]}, 162 | "smda_function": 1, 163 | "fixed": {1: [2], 2: [3, 6], 3: [41, 42], 6: [], 41: [5], 42: [5], 5: [2]}, 164 | "dt": {3: [41, 42, 5], 2: [3, 6], 1: [2]}, 165 | "nd": 2 166 | }, 167 | 168 | ] 169 | for data in test_data: 170 | print("*" * 80) 171 | print("Running Test Case: ", data["smda_function"]) 172 | print("*" * 80) 173 | print("smda", data["smda"]) 174 | fixed_smda = {} 175 | for key, values in data["smda"].items(): 176 | fixed_smda[key] = values 177 | for value in values: 178 | if value not in fixed_smda: 179 | fixed_smda[value] = [] 180 | print("fixed_smda", fixed_smda) 181 | assert fixed_smda == data["fixed"] 182 | dt = build_dominator_tree(data["smda"], data["smda_function"]) 183 | print("dominator tree", dt) 184 | assert dt == data["dt"] 185 | nd = get_nesting_depth(fixed_smda, dt, data["smda_function"]) 186 | print("nd", nd) 187 | assert nd == data["nd"] 188 | -------------------------------------------------------------------------------- /smda/common/SmdaBasicBlock.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import hashlib 3 | from typing import Iterator 4 | 5 | from smda.common.SmdaInstruction import SmdaInstruction 6 | 7 | 8 | class SmdaBasicBlock: 9 | 10 | smda_function = None 11 | instructions = None 12 | 
picblockhash = None 13 | opcblockhash = None 14 | offset = None 15 | length = None 16 | 17 | def __init__(self, instructions, smda_function=None): 18 | assert isinstance(instructions, list) 19 | self.smda_function = smda_function 20 | if instructions: 21 | self.instructions = instructions 22 | self.offset = instructions[0].offset 23 | self.length = len(instructions) 24 | self.picblockhash = self.getPicBlockHash() 25 | self.opcblockhash = self.getOpcBlockHash() 26 | 27 | def getInstructions(self) -> Iterator["SmdaInstruction"]: 28 | for instruction in self.instructions: 29 | yield instruction 30 | 31 | def getPicBlockHash(self): 32 | if self.picblockhash is not None: 33 | return self.picblockhash 34 | picblockhash_sequence = self.getPicBlockHashSequence() 35 | if picblockhash_sequence is not None: 36 | self.picblockhash = struct.unpack("Q", hashlib.sha256(picblockhash_sequence).digest()[:8])[0] 37 | return self.picblockhash 38 | 39 | def getPicBlockHashSequence(self): 40 | """ if we have a SmdaFunction as parent, we can try to generate the PicBlockHash ad-hoc """ 41 | # check all the prerequisites 42 | if self.smda_function and self.smda_function.smda_report and self.smda_function._escaper and self.smda_function.smda_report.base_addr is not None and self.smda_function.smda_report.binary_size: 43 | escaped_binary_seqs = [] 44 | for instruction in self.getInstructions(): 45 | escaped_binary_seqs.append(instruction.getEscapedBinary(self.smda_function._escaper, escape_intraprocedural_jumps=True, lower_addr=self.smda_function.smda_report.base_addr, upper_addr=self.smda_function.smda_report.base_addr + self.smda_function.smda_report.binary_size)) 46 | return bytes([ord(c) for c in "".join(escaped_binary_seqs)]) 47 | 48 | def getOpcBlockHash(self): 49 | if self.opcblockhash is not None: 50 | return self.opcblockhash 51 | opcblockhash_sequence = self.getOpcBlockHashSequence() 52 | if opcblockhash_sequence is not None: 53 | self.opcblockhash = struct.unpack("Q", 
hashlib.sha256(opcblockhash_sequence).digest()[:8])[0] 54 | return self.opcblockhash 55 | 56 | def getOpcBlockHashSequence(self): 57 | """ if we have a SmdaFunction as parent, we can try to generate the OpcBlockHash ad-hoc """ 58 | # check all the prerequisites 59 | if self.smda_function and self.smda_function.smda_report and self.smda_function._escaper: 60 | escaped_binary_seqs = [] 61 | for instruction in self.getInstructions(): 62 | escaped_binary_seqs.append(instruction.getEscapedToOpcodeOnly(self.smda_function._escaper)) 63 | return bytes([ord(c) for c in "".join(escaped_binary_seqs)]) 64 | 65 | def getPredecessors(self): 66 | predecessors = [] 67 | if self.smda_function is not None: 68 | for frm, to in self.smda_function.blockrefs.items(): 69 | if self.offset in to: 70 | predecessors.append(frm) 71 | return predecessors 72 | 73 | def getSuccessors(self): 74 | successors = [] 75 | if self.smda_function is not None: 76 | if self.offset in self.smda_function.blockrefs: 77 | successors.extend(self.smda_function.blockrefs[self.offset]) 78 | return successors 79 | 80 | @classmethod 81 | def fromDict(cls, block_dict, smda_function=None) -> "SmdaBasicBlock": 82 | smda_block = cls(None) 83 | smda_block.smda_function = smda_function 84 | smda_block.instructions = [SmdaInstruction.fromDict(d, smda_function=smda_function) for d in block_dict] 85 | return smda_block 86 | 87 | def toDict(self) -> dict: 88 | return [smda_ins.toDict() for smda_ins in self.instructions] 89 | 90 | def __int__(self): 91 | return self.offset 92 | 93 | def __str__(self): 94 | return "0x{:08x}: ({:>4})".format(self.offset, self.length) 95 | -------------------------------------------------------------------------------- /smda/common/SmdaInstruction.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import logging 3 | 4 | from capstone.x86 import X86_OP_IMM, X86_OP_MEM 5 | 6 | from smda.intel.IntelInstructionEscaper import 
class SmdaInstruction:
    """A single disassembled instruction as stored in SMDA reports:
    (offset, hex byte string, mnemonic, operand string)."""

    smda_function = None
    offset = None
    bytes = None        # hex string of the instruction bytes (not Python bytes)
    mnemonic = None
    operands = None
    detailed = None     # lazily populated capstone instruction, see getDetailed()

    def __init__(self, ins_list=None, smda_function=None):
        self.smda_function = smda_function
        if ins_list is not None:
            self.offset = ins_list[0]
            self.bytes = ins_list[1]
            self.mnemonic = ins_list[2]
            self.operands = ins_list[3]

    def getDataRefs(self):
        """Yield addresses referenced as data (imm/mem operands that fall
        within the mapped memory image); control-flow instructions are skipped."""
        if self.getMnemonicGroup(IntelInstructionEscaper) != "C":
            detailed = self.getDetailed()
            if len(detailed.operands) > 0:
                for i in detailed.operands:
                    value = None
                    if i.type == X86_OP_IMM:
                        value = i.imm
                    if i.type == X86_OP_MEM:
                        value = i.mem.disp
                        if detailed.reg_name(i.mem.base) == "rip":
                            # RIP-relative: displacement is based on the address
                            # of the following instruction
                            value += detailed.address + detailed.size
                    if value is not None and self.smda_function.smda_report.isAddrWithinMemoryImage(value):
                        yield value

    def getDetailed(self):
        """Return the capstone instruction with details, disassembling lazily
        on first access."""
        if self.detailed is None:
            capstone = self.smda_function.smda_report.getCapstone()
            with_details = [i for i in capstone.disasm(bytes.fromhex(self.bytes), self.offset)]
            # TODO this may diverge on instructions like 9bd93c24:
            #   1 wait / 3 fnstcw word ptr [esp]
            # which is split by capstone but treated as one / prefix by IDA
            # https://fragglet.github.io/dos-help-files/alang.hlp/FLDCW.html
            # FSTCW has wait and no-wait versions; the assembler emits WAIT
            # before the wait version and NOP before the no-wait version.
            if len(with_details) > 1:
                # BUGFIX: Logger.warn() is a deprecated alias (removed in
                # Python 3.13) - use Logger.warning()
                LOGGER.warning(f"Sequence {self.bytes} disassembles to {len(with_details)} instructions but expected one - taking the last instruction only!")
                self.detailed = with_details[-1]
            else:
                assert len(with_details) == 1
                self.detailed = with_details[0]
        return self.detailed

    def getMnemonicGroup(self, escaper):
        """Return the escaped mnemonic group, or the raw hex bytes without an escaper."""
        if escaper:
            return escaper.escapeMnemonic(self.mnemonic)
        return self.bytes

    def getEscapedOperands(self, escaper):
        if escaper:
            return escaper.escapeOperands(self)
        return self.bytes

    def getMaskedOperands(self, escaper):
        if escaper:
            return escaper.escapeOperands(self, offsets_only=True)
        return self.bytes

    def getEscapedToOpcodeOnly(self, escaper):
        if escaper:
            return escaper.escapeToOpcodeOnly(self)
        return self.bytes

    def getEscapedBinary(self, escaper, escape_intraprocedural_jumps=False, lower_addr=None, upper_addr=None):
        if escaper:
            return escaper.escapeBinary(self, escape_intraprocedural_jumps=escape_intraprocedural_jumps, lower_addr=lower_addr, upper_addr=upper_addr)
        return self.bytes

    @classmethod
    def fromDict(cls, instruction_dict, smda_function=None) -> Optional["SmdaInstruction"]:
        """Deserialize from [offset, hex_bytes, mnemonic, operands]."""
        smda_instruction = cls(None)
        smda_instruction.smda_function = smda_function
        smda_instruction.offset = instruction_dict[0]
        smda_instruction.bytes = instruction_dict[1]
        smda_instruction.mnemonic = instruction_dict[2]
        smda_instruction.operands = instruction_dict[3]
        return smda_instruction

    def toDict(self) -> list:
        """Serialize to [offset, hex_bytes, mnemonic, operands].

        BUGFIX (annotation): this returns a list, not a dict as previously annotated.
        """
        return [self.offset, self.bytes, self.mnemonic, self.operands]

    def __int__(self):
        return self.offset

    def __str__(self):
        return "0x{:08x}: ({:>14s}) - {} {}".format(self.offset, self.bytes, self.mnemonic, self.operands)
class TailcallAnalyzer(object):
    """Collects jumps observed during disassembly and identifies jumps that
    originate outside a function but land inside it (tailcall candidates)."""

    def __init__(self):
        # committed jumps: source address -> set of destination addresses
        self.__jumps = defaultdict(set)
        # jumps collected for the function currently being analyzed
        self.__tmp_jumps = defaultdict(list)
        # all finalized function analysis states
        self.__functions = list()

    def initFunction(self):
        """Reset the per-function jump collection."""
        self.__tmp_jumps = defaultdict(list)

    def addJump(self, source, destination):
        """Record a jump at <source> targeting <destination> for the current function."""
        self.__tmp_jumps[source].append(destination)

    def finalizeFunction(self, function_state):
        """Commit the temporary jumps and remember the function state."""
        for source, destinations in self.__tmp_jumps.items():
            self.__jumps[source].update(destinations)
        self.__tmp_jumps.clear()
        self.__functions.append(function_state)

    def getTailcalls(self):
        """Return dicts describing jumps from outside a function into its body
        (excluding jumps to its entry point)."""
        tailcalls = list()
        # all (source, destination) edges, ordered by (destination, source)
        edge_list = sorted(((src, dst) for src in self.__jumps for dst in self.__jumps[src]), key=itemgetter(1, 0))
        edge_destinations = [dst for _, dst in edge_list]
        for function in self.__functions:
            intervals = self.__getFunctionIntervals(function)
            if not intervals:
                # function without instructions - nothing to check
                continue
            lowest = min(start for start, _ in intervals)
            highest = max(end for _, end in intervals)
            # binary search: only edges targeting [lowest, highest] are relevant
            window = edge_list[bisect.bisect_left(edge_destinations, lowest):bisect.bisect_right(edge_destinations, highest)]
            for src, dst in window:
                lands_inside = any(start <= dst <= end for start, end in intervals)
                comes_from_outside = all(src < start or src > end for start, end in intervals)
                if dst != function.start_addr and lands_inside and comes_from_outside:
                    tailcalls.append({
                        "source_addr": src,
                        "destination_addr": dst,
                        "destination_function": function.start_addr
                    })
        return tailcalls

    def __getFunctionIntervals(self, function_state):
        """Derive (first_addr, last_addr) intervals of gap-free instruction runs."""
        intervals = list()
        instructions = sorted(function_state.instructions, key=itemgetter(0))
        first_instruction = instructions[0] if instructions else None
        last_instruction = first_instruction
        for instruction in instructions:
            # a gap after the previous instruction closes the current interval
            if instruction[0] > last_instruction[0] + last_instruction[1]:
                intervals.append((first_instruction[0], last_instruction[0]))
                first_instruction = instruction
            last_instruction = instruction
        if last_instruction:
            intervals.append((first_instruction[0], last_instruction[0]))
        return intervals

    def __getFunctionByStartAddr(self, start_addr):
        """Return the finalized function state starting at <start_addr>, or None."""
        return next((function for function in self.__functions if function.start_addr == start_addr), None)

    def __printIntervals(self, intervals):
        """Debug helper: dump the given intervals to stdout."""
        if len(intervals) < 25:
            for one, two in intervals:
                print(" 0x{:x} -> 0x{:x}".format(one, two))
        else:
            print("Function has too many intervals to display")
def resolveTailcalls(self, disassembler, verbose=False):
    """Re-run function analysis for all detected tailcall destinations.

    For every tailcall, the (broken) destination function is reverted, the
    tailcall target is analyzed as a function of its own, and the former
    parent is re-analyzed if the target did not absorb it.
    Returns the sorted list of newly created function start addresses.
    """
    newly_created_functions = set([])
    for tailcall in self.getTailcalls():
        if verbose:
            print("Processing tailcall:\n{}".format(json.dumps(tailcall, indent=2, sort_keys=True)))
        # remove the information from the function-analysis state of the disassembly
        function = self.__getFunctionByStartAddr(tailcall["destination_function"])
        if not function or function.is_tailcall_function:
            disassembler.analyzeFunction(tailcall["destination_function"])
            continue
        self.__functions.remove(function)
        if function:
            if verbose:
                print("Old function:")
                self.__printIntervals(self.__getFunctionIntervals(function))
            function.revertAnalysis()
        # analyze the tailcall destination as function
        disassembler.analyzeFunction(tailcall["destination_addr"])
        newly_created_functions.add(tailcall["destination_addr"])
        function = self.__getFunctionByStartAddr(tailcall["destination_addr"])
        if function and not tailcall["destination_function"] in function.instruction_start_bytes:
            # analyze the (previously) broken function a second time
            try:
                disassembler.analyzeFunction(tailcall["destination_function"])
                function = self.__getFunctionByStartAddr(tailcall["destination_function"])
                function.is_tailcall_function = True
            except Exception:
                # BUGFIX: narrowed former bare except; second-pass analysis
                # remains best-effort
                pass
        elif verbose:
            print("**** 0x{:x} IS NOW PART OF 0x{:x}".format(tailcall["destination_function"], tailcall["destination_addr"]))
        if verbose:
            function = self.__getFunctionByStartAddr(tailcall["destination_function"])
            new_function = self.__getFunctionByStartAddr(tailcall["destination_addr"])
            print("New function:")
            if new_function:
                self.__printIntervals(self.__getFunctionIntervals(new_function))
            print("Re-disassembled old function:")
            if function:
                self.__printIntervals(self.__getFunctionIntervals(function))
    return sorted(list(newly_created_functions))
class Tarjan(object):
    """Iterative (non-recursive) Tarjan SCC computation.
    The graph is a dictionary mapping each node to its successor collection."""

    def __init__(self, graph):
        self._graph = graph
        self._stack = []                 # current DFS stack of nodes
        self._stack_set = set([])        # membership mirror of _stack
        self._index = {}                 # node -> discovery index
        self._lowlink = {}               # node -> lowest reachable index
        self._nonrecursive_stack = []    # explicit work stack replacing recursion
        self._result = []                # accumulated list of SCCs

    def _tarjan_head(self, v):
        """Start visiting node v: assign index/lowlink and schedule its successors."""
        self._index[v] = len(self._index)
        self._lowlink[v] = self._index[v]
        self._stack.append(v)
        self._stack_set.add(v)
        successor_iter = iter(self._graph.get(v, ()))
        self._nonrecursive_stack.append((successor_iter, False, v, None))

    def _tarjan_body(self, successor_iter, v):
        """Continue processing v's successors; emit an SCC when v is a root."""
        for w in successor_iter:
            if w not in self._index:
                # simulate recursion into w; resume v afterwards
                self._nonrecursive_stack.append((successor_iter, True, v, w))
                self._tarjan_head(w)
                return
            if w in self._stack_set:
                self._lowlink[v] = min(self._lowlink[v], self._index[w])
        if self._lowlink[v] == self._index[v]:
            # v is an SCC root: pop the component off the stack
            scc = []
            w = None
            while v != w:
                w = self._stack.pop()
                scc.append(w)
                self._stack_set.remove(w)
            self._result.append(scc)

    def calculateScc(self):
        """Compute and return the list of strongly connected components."""
        for node in self._graph:
            if node not in self._index:
                self._tarjan_head(node)
            while self._nonrecursive_stack:
                successor_iter, resuming, v, w = self._nonrecursive_stack.pop()
                if resuming:
                    # returning from simulated recursion into w
                    self._lowlink[v] = min(self._lowlink[w], self._lowlink[v])
                self._tarjan_body(successor_iter, v)
        return self._result

    def closure(self):
        """ Given a graph @g, returns the transitive closure of @g """
        ret = {}
        for scc in self.calculateScc():
            direct_successors = set()
            reachable = set()
            for v in scc:
                direct_successors.update(self._graph[v])
            for w in direct_successors:
                # SCCs are emitted in reverse topological order, so every
                # successor is either already resolved or inside this SCC
                assert w in ret or w in scc
                reachable.add(w)
                reachable.update(ret.get(w, ()))
            if len(scc) > 1:
                reachable.update(scc)
            frozen = tuple(reachable)
            for v in scc:
                ret[v] = frozen
        return ret

    def getResult(self):
        """Return the SCCs computed so far."""
        return self._result
class CilSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for CIL/DOTNET symbols """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._addr_to_func_symbols = {}
        # reverse map: func_name -> addr
        self._func_symbol_to_addr = {}

    def isSymbolProvider(self):
        return True

    def decodeSymbolName(self, value):
        """ ensure a proper utf-8 escaped string """
        return value.encode("utf-8").decode("utf-8")

    def update(self, binary_info):
        """Parse all MethodDef rows of the .NET metadata into symbol maps."""
        pe = dnfile.dnPE(data=binary_info.raw_data)
        for row in pe.net.mdtables.MethodDef:
            # translate the method's RVA into a file offset
            addr = pe.get_offset_from_rva(row.Rva)
            func_name = self.decodeSymbolName(row.Name.value)
            self._addr_to_func_symbols[addr] = func_name
            self._func_symbol_to_addr[func_name] = addr

    def getSymbol(self, address):
        """Return the function name at <address>, or "" if unknown."""
        return self._addr_to_func_symbols.get(address, "")

    def getAddress(self, func_symbol):
        """Return the address of <func_symbol>, or None if unknown."""
        return self._func_symbol_to_addr.get(func_symbol, None)

    def getFunctionSymbols(self):
        """Return the full addr -> name mapping."""
        return self._addr_to_func_symbols
class DelphiKbSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for Delphi knowledge base files """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}
        # file offset -> 0 for every relocation marker found in function dumps
        self._relocations = {}

    def update(self, binary_info):
        binary = binary_info.binary
        if DelphiKbFileLoader.isCompatible(binary):
            self._func_symbols = self.parseKbBuffer(binary, binary_info.base_addr)

    def isSymbolProvider(self):
        return True

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols

    def getRelocations(self):
        return self._relocations

    def parseKbBuffer(self, binary, base_addr):
        """Parse a Delphi KB buffer, returning {addr: function_name}.

        Layout (as implemented here): the last 4 bytes point to a table area
        containing a module table followed by several sub-tables and a
        function table; each function entry carries name, type info, a code
        dump and a relocation mask of equal size.
        """
        result = {}
        fh = BytesIO(binary)
        fh.seek(-4, os.SEEK_END)
        fh.seek(int.from_bytes(fh.read(4), byteorder="little"))
        # process modules
        len_mod_data_table = int.from_bytes(fh.read(4), byteorder="little")
        fh.read(4)
        modules = {}
        for i in range(len_mod_data_table):
            offset = int.from_bytes(fh.read(4), byteorder="little")
            size = int.from_bytes(fh.read(4), byteorder="little")
            modId = int.from_bytes(fh.read(4), byteorder="little")
            namID = int.from_bytes(fh.read(4), byteorder="little")
            modules[modId] = {}
            modules[modId]['offset'] = offset
            modules[modId]['size'] = size
            modules[modId]['namID'] = namID
        temp_off = fh.tell()
        for modID in modules:
            fh.seek(modules[modID]['offset'])
            if modID != int.from_bytes(fh.read(2), byteorder="little"):
                # IMPROVEMENT: report through the module logger instead of print()
                LOGGER.warning("ModID doesnt match %s", str(modules[modID]['offset']))
            len_name = int.from_bytes(fh.read(2), byteorder="little")
            modules[modID]['name'] = fh.read(len_name).decode()
            modules[modID]['functions'] = []
        fh.seek(temp_off)
        # process functions and their code; skip 4 leading sub-tables of
        # 16-byte records each
        for i in range(4):
            fh.seek(int.from_bytes(fh.read(4), byteorder="little") * 16 + fh.tell() + 4)
        len_fun_data_table = int.from_bytes(fh.read(4), byteorder="little")
        fh.read(4)
        for i in range(len_fun_data_table):
            offset = int.from_bytes(fh.read(4), byteorder="little")
            temp_off = fh.tell()
            fh.seek(offset)
            function_info = {}
            function_info['modId'] = int.from_bytes(fh.read(2), byteorder="little")
            len_name = int.from_bytes(fh.read(2), byteorder="little")
            function_info['name'] = fh.read(len_name).decode()
            fh.read(9)
            len_type = int.from_bytes(fh.read(2), byteorder="little")
            # FIX: renamed former local `type` which shadowed the builtin;
            # the field is read only to advance the stream position
            type_info = fh.read(len_type).decode()
            fh.read(5)
            function_info['dump_size'] = int.from_bytes(fh.read(4), byteorder="little")
            fh.read(4)
            function_code_start_offset = fh.tell()
            result[base_addr + function_code_start_offset] = function_info['name']
            function_info['dump'] = list(fh.read(function_info['dump_size']))
            # relocations mark both call but also data ref offsets
            function_info['reloc'] = fh.read(function_info['dump_size'])
            for match in re.finditer(b"\xFF\xFF\xFF\xFF", function_info['reloc']):
                self._relocations[function_code_start_offset + match.start()] = 0
            modules[function_info['modId']]['functions'].append(function_info)
            fh.seek(temp_off + 12)
        return result
class ElfApiResolver(AbstractLabelProvider):
    """ Minimal ELF API reference resolver, extracting APIs from ELF imports """

    def __init__(self, config):
        # backend -> {address: (lib_or_version, api_name)}
        self._api_map = {
            "lief": {}
        }

    def update(self, binary_info):
        """Collect imported function symbols from the ELF relocation table."""
        if binary_info.is_buffer:
            # cannot reconstruct from shellcode/memory dump at this time
            return
        lief_binary = lief.parse(binary_info.raw_data)
        if not isinstance(lief_binary, lief.ELF.Binary):
            return
        for relocation in lief_binary.relocations:
            if not relocation.has_symbol:
                # doesn't have a name, we won't care about it
                continue
            symbol = relocation.symbol
            if not symbol.imported:
                # only interested in APIs from external sources
                continue
            if not symbol.is_function:
                # only interested in APIs (which are functions)
                continue
            # we can't really say what library the symbol came from, but the
            # GNU version info (e.g. "GLIBC_2.2.5") serves as metadata
            lib = None
            if symbol.has_version and symbol.symbol_version.has_auxiliary_version:
                lib = symbol.symbol_version.symbol_version_auxiliary.name
            self._api_map["lief"][relocation.address] = (lib, symbol.name)

    def isApiProvider(self):
        """Returns whether the get_api(..) function of the AbstractLabelProvider is functional"""
        return True

    def getApi(self, to_addr, absolute_addr):
        """
        If the LabelProvider has any information about a used API for the given address, return (dll, api), else return (None, None).

        May return None for the `dll` if it cannot be determined.
        When it can be determined for ELF files, the `dll` field should be interpreted as the API version rather than shared library name.
        For example: "GLIBC_2.2.5".
        """
        return self._api_map["lief"].get(to_addr, (None, None))
class ElfSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for ELF symbols """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}

    def isSymbolProvider(self):
        return True

    def _parseOep(self, lief_result):
        # register the entry point as a pseudo-symbol
        if lief_result:
            self._func_symbols[lief_result.header.entrypoint] = "original_entry_point"

    def update(self, binary_info):
        """Extract function symbols from an ELF given as file path or raw buffer."""
        self._func_symbols = {}
        data = b""
        if binary_info.file_path:
            with open(binary_info.file_path, "rb") as fin:
                data = fin.read()
            # BUGFIX: a stray `return` here aborted update() right after
            # reading the file, so symbols were never parsed for file-backed
            # binaries.
        elif binary_info.raw_data:
            data = binary_info.raw_data
        else:
            return
        if data[:4] != b"\x7FELF" or lief is None:
            return
        lief_binary = lief.parse(data)
        self._parseOep(lief_binary)
        # TODO split resolution into API/dynamic part and local symbols
        self._parseExports(lief_binary)
        self._parseSymbols(lief_binary.symtab_symbols)
        self._parseSymbols(lief_binary.dynamic_symbols)
        for reloc in lief_binary.relocations:
            if reloc.has_symbol:
                self._func_symbols[reloc.address] = reloc.symbol.name

    def _parseExports(self, binary):
        for function in binary.exported_functions:
            self._func_symbols[function.address] = function.name

    def _parseSymbols(self, symbols):
        for symbol in symbols:
            if symbol.is_function:
                if symbol.value != 0:
                    func_name = ""
                    try:
                        func_name = symbol.demangled_name
                    except Exception:
                        # BUGFIX: narrowed former bare except; fall back to
                        # the mangled name
                        func_name = symbol.name
                    self._func_symbols[symbol.value] = func_name

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols
class GoSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for Go symbols (parsed from the pclntab) """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}

    def update(self, binary_info):
        """Locate the pclntab (via LIEF section data, falling back to a byte
        scan for its header) and parse function symbols from it."""
        binary = binary_info.binary
        pclntab_offset = None
        try:
            lief_binary = lief.parse(binary)
            if lief_binary.format == lief.EXE_FORMATS.ELF:
                pclntab_offset = lief_binary.get_section(".gopclntab").offset
            elif lief_binary.format == lief.EXE_FORMATS.MACHO:
                pclntab_offset = lief_binary.get_section("__gopclntab").offset
            elif lief_binary.format == lief.EXE_FORMATS.PE:
                rdata_offset = lief_binary.get_section(".rdata").offset
                pclntab_offset = rdata_offset + lief_binary.get_symbol("runtime.pclntab").value
        except Exception:
            # BUGFIX: narrowed former bare except; missing sections/symbols
            # simply trigger the scan fallback below
            pass
        if pclntab_offset is None:
            # scan for offset of structure (magic + pad + quantum + ptrsize)
            pclntab_regex = re.compile(b".\xFF\xFF\xFF\x00\x00\x01(\x04|\x08)")
            hits = [match.start() for match in re.finditer(pclntab_regex, binary)]
            if len(hits) > 1:
                # BUGFIX: use the module-level LOGGER instead of the root logger
                LOGGER.error("GoLabelProvider found too many candidates for pclntab")
            elif len(hits) == 1:
                pclntab_offset = hits[0]
        # if we found a valid offset, do the pclntab parsing
        if pclntab_offset:
            try:
                result = self._parse_pclntab(pclntab_offset, binary)
                if result:
                    self._func_symbols = result
            except Exception:
                # parsing is best-effort; keep whatever we had
                return

    def isSymbolProvider(self):
        return True

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols

    def _readUtf8(self, buffer):
        """Read a NUL-terminated UTF-8 string from <buffer>."""
        string_read = ""
        offset = 0
        while buffer[offset] != 0:
            string_read += f"{buffer[offset]:02x}"
            offset += 1
        # need to defang special char(s): Go uses the middle dot in symbol names
        decoded_string = bytearray.fromhex(string_read).decode().replace('\u00b7', ':')
        return decoded_string

    def _parse_pclntab(self, pclntab_offset, binary):
        """Parse the Go pclntab at <pclntab_offset>, returning {addr: name}.

        Supports the header layouts of Go 1.12, 1.16, 1.18 and 1.20.
        """
        pclntab_buffer = binary[pclntab_offset:]

        marker = struct.unpack("I", pclntab_buffer[0:4])[0]
        if marker == 0xfffffffb:
            version = '1.12'
        elif marker == 0xfffffffa:
            version = '1.16'
        elif marker == 0xfffffff0:
            version = '1.18'
        elif marker == 0xfffffff1:
            version = '1.20'
        else:
            # BUGFIX: format marker as hex - message previously printed the
            # decimal value behind a "0x" prefix
            raise ValueError(f"Could not recognize Golang version marker: 0x{marker:x}")

        bitness_indicator = struct.unpack("B", pclntab_buffer[7:8])[0]
        bitness = None
        if bitness_indicator == 8:
            bitness = 64
        elif bitness_indicator == 4:
            bitness = 32
        else:
            # BUGFIX: hex formatting, see above
            raise ValueError(f"Could not recognize Golang bitness marker: 0x{bitness_indicator:x}")

        field_size = 8 if bitness == 64 else 4
        field_indicator = "Q" if bitness == 64 else "I"
        if version == '1.12':
            number_of_functions = struct.unpack("I", pclntab_buffer[8:12])[0]
            function_name_offset = pclntab_offset
            weird_table_offset = pclntab_offset + 16 if bitness == 64 else pclntab_offset + 12
            start_text = 0
        elif version == '1.16':
            parsed_pclntab_fields = struct.unpack(7*field_indicator, pclntab_buffer[8:8+7*field_size])
            number_of_functions = parsed_pclntab_fields[0]
            function_name_offset = pclntab_offset + parsed_pclntab_fields[2]
            file_name_offset = pclntab_offset + parsed_pclntab_fields[3]
            weird_table_offset = pclntab_offset + parsed_pclntab_fields[6]
            start_text = 0
        elif version == '1.18' or version == '1.20':
            parsed_pclntab_fields = struct.unpack(8*field_indicator, pclntab_buffer[8:8+8*field_size])
            number_of_functions = parsed_pclntab_fields[0]
            start_text = parsed_pclntab_fields[2]
            function_name_offset = pclntab_offset + parsed_pclntab_fields[3]
            file_name_offset = pclntab_offset + parsed_pclntab_fields[5]
            weird_table_offset = pclntab_offset + parsed_pclntab_fields[7]

        # first parse function offsets
        offsets = OrderedDict()
        func_info_offsets = {}
        read_offset = 0
        table_buffer = binary[weird_table_offset:]
        for index in range(number_of_functions):
            # need to parse a second table in this case
            if version == '1.12':
                offsets[index] = struct.unpack(field_indicator, table_buffer[read_offset:read_offset+field_size])[0]
                read_offset += field_size
                func_info_offsets[index] = struct.unpack(field_indicator, table_buffer[read_offset:read_offset+field_size])[0]
                read_offset += field_size
            # advance element pointer
            if version == '1.16':
                offsets[index] = struct.unpack(field_indicator, table_buffer[read_offset:read_offset+field_size])[0]
                read_offset += 2 * field_size
            # here we have a more compact structure for both x86/x64, no need to skip
            if version == '1.18' or version == '1.20':
                offsets[index] = struct.unpack("I", table_buffer[read_offset:read_offset+4])[0]
                read_offset += 8

        functions = {}
        offsets2 = offsets.copy()
        function_name_buffer = binary[function_name_offset:]
        if version == '1.12':
            for index, info_offset in func_info_offsets.items():
                function_offset = offsets[index]
                name_offset = struct.unpack(field_indicator, pclntab_buffer[info_offset+field_size:info_offset+2*field_size])[0]
                # only take lower 32bit in case of 64bit binaries.
                name_offset &= 0xFFFFFFFF
                function_name = self._readUtf8(function_name_buffer[name_offset:])
                functions[function_offset + start_text] = function_name
        else:
            # NOTE(review): this loop re-synchronizes read_offset onto each
            # function_offset and appears to drop entries it cannot match
            # (via offsets2) - behavior kept as-is, TODO confirm intent
            delete = False
            for offset, function_offset in offsets.items():
                if delete:
                    offsets2.pop(offset)
                bytes_read = struct.unpack("I", table_buffer[read_offset:read_offset+4])[0]
                read_offset += 4
                try:
                    while bytes_read != function_offset:
                        bytes_read = struct.unpack("I", table_buffer[read_offset:read_offset+4])[0]
                        read_offset += 4
                except ValueError:
                    delete = True
                    offsets2.pop(offset)
                    continue
                if version == '1.16' and bitness == 64:
                    read_offset += 4
                name_offset = struct.unpack('I', table_buffer[read_offset:read_offset+4])[0]
                function_name = self._readUtf8(function_name_buffer[name_offset:])
                read_offset += 4
                functions[function_offset + start_text] = function_name
        return functions
class OrdinalHelper(object):
    """Static mapping of well-known DLL export ordinals to API names."""
    # TODO POC implementation, extend list. ole32.dll and mfc42.dll are candidates here
    ORDINALS = {
        "ws2_32.dll": {
            1: "accept",
            2: "bind",
            3: "closesocket",
            4: "connect",
            97: "freeaddrinfo",
            98: "getaddrinfo",
            99: "getnameinfo",
            51: "gethostbyaddr",
            52: "gethostbyname",
            53: "getprotobyname",
            54: "getprotobynumber",
            55: "getservbyname",
            56: "getservbyport",
            57: "gethostname",
            5: "getpeername",
            6: "getsockname",
            7: "getsockopt",
            8: "htonl",
            9: "htons",
            10: "ioctlsocket",
            11: "inet_addr",
            12: "inet_ntoa",
            13: "listen",
            14: "ntohl",
            15: "ntohs",
            16: "recv",
            17: "recvfrom",
            18: "select",
            19: "send",
            20: "sendto",
            21: "setsockopt",
            22: "shutdown",
            23: "socket"
        }
    }

    @staticmethod
    def resolveOrdinal(dll_name, ordinal):
        """Return the API name for (dll_name, ordinal); "" when unknown.
        The DLL name is matched case-insensitively."""
        return OrdinalHelper.ORDINALS.get(dll_name.lower(), {}).get(ordinal, "")
class DummyOmap(object):
    """Identity stand-in used when the PDB carries no OMAP remapping stream."""

    def remap(self, addr):
        return addr


class PdbSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for PDB symbols """

    def __init__(self, config):
        self._config = config
        self._base_addr = 0
        # maps absolute function address -> symbol name
        self._func_symbols = {}

    def isSymbolProvider(self):
        """This provider yields function symbols."""
        return True

    def _parseOep(self, data):
        # annotate the original entry point if a PE header exposes one
        oep_rva = PeFileLoader.getOEP(data)
        if oep_rva:
            self._func_symbols[self._base_addr + oep_rva] = "original_entry_point"

    def update(self, binary_info):
        """Extract symbols if binary_info.file_path points to a PDB file."""
        self._base_addr = binary_info.base_addr
        if not binary_info.file_path:
            return
        with open(binary_info.file_path, "rb") as fin:
            header = fin.read(16)
        self._parseOep(header)
        # PDB files start with this magic; anything else is not parseable here
        if header[:15] != b"Microsoft C/C++" or pdbparse is None:
            return
        try:
            parsed_pdb = pdbparse.parse(binary_info.file_path)
            self._parseSymbols(parsed_pdb)
        except Exception as exc:
            LOGGER.error("Failed parsing \"%s\" with exception type: %s", binary_info.file_path, type(exc))

    def _parseSymbols(self, pdb):
        """Walk the global symbol stream and record demangled function symbols."""
        try:
            sects = pdb.STREAM_SECT_HDR_ORIG.sections
            omap = pdb.STREAM_OMAP_FROM_SRC
        except AttributeError:
            # no OMAP streams present -> use plain headers with identity remapping
            sects = pdb.STREAM_SECT_HDR.sections
            omap = DummyOmap()
        for sym in pdb.STREAM_GSYM.globals:
            try:
                if len(sects) < sym.segment:
                    continue
                virt_base = sects[sym.segment - 1].VirtualAddress
                function_address = self._base_addr + omap.remap(sym.offset + virt_base)
                demangled_name = undname(sym.name)
                # symtype 2 marks function symbols
                if sym.symtype == 2:
                    self._func_symbols[function_address] = demangled_name
            except AttributeError:
                pass

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols
class PeSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for PE symbols """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}

    def isSymbolProvider(self):
        return True

    def _parseOep(self, lief_result):
        # annotate the entry point reported by LIEF, if parsing succeeded
        if lief_result:
            self._func_symbols[lief_result.entrypoint] = "original_entry_point"

    def update(self, binary_info):
        """Extract OEP, exports and function symbols from a PE file via LIEF."""
        self._func_symbols = {}
        if not binary_info.file_path:
            return
        with open(binary_info.file_path, "rb") as fin:
            data = fin.read(16)
        if data[:2] != b"MZ" or lief is None:
            return
        lief_binary = lief.parse(binary_info.file_path)
        if lief_binary is not None:
            self._parseOep(lief_binary)
            self._parseExports(lief_binary)
            self._parseSymbols(lief_binary)

    def _parseExports(self, binary):
        """Record all printable-named exported functions at their virtual addresses."""
        for function in binary.exported_functions:
            function_name = ""
            try:
                # here may occur a LIEF exception that we want to skip ->
                # UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position 0-3: code point not in range(0x110000)
                function_name = function.name
            except:
                pass
            # FIX: the original tested `c in range(0x20, 0x7f)` on single-character
            # strings, which is always False (int range never contains str), so no
            # export was ever recorded; use ord(c) as _parseSymbols already does.
            if function_name and all(ord(c) in range(0x20, 0x7f) for c in function_name):
                self._func_symbols[binary.imagebase + function.address] = function_name

    def _parseSymbols(self, lief_binary):
        """Record FUNCTION-typed symbols, rebased onto the first code section."""
        # find VA of first code section (0x20000000 = IMAGE_SCN_MEM_EXECUTE)
        code_base_address = None
        for section in lief_binary.sections:
            if section.characteristics & 0x20000000:
                code_base_address = lief_binary.imagebase + section.virtual_address
                break
        if code_base_address is None:
            return
        for symbol in lief_binary.symbols:
            if hasattr(symbol.complex_type, "name") and symbol.complex_type.name == "FUNCTION":
                function_name = ""
                try:
                    # here may occur a LIEF exception that we want to skip ->
                    # UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position 0-3: code point not in range(0x110000)
                    function_name = symbol.name
                except:
                    pass
                if function_name and all(ord(c) in range(0x20, 0x7f) for c in function_name):
                    # for some reason, we need to add the section_offset of .text here
                    function_offset = code_base_address + symbol.value
                    if function_offset not in self._func_symbols:
                        self._func_symbols[function_offset] = function_name

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols
class WinApiResolver(AbstractLabelProvider):
    """ Minimal WinAPI reference resolver, extracted from ApiScout """

    def __init__(self, config):
        self._config = config
        self._has_64bit = False
        # two data sources: per-OS ApiScout DB maps and the LIEF-parsed import table
        self._api_map = {
            "lief": {}
        }
        self._os_name = None
        self._is_buffer = False
        for os_name, db_filepath in self._config.API_COLLECTION_FILES.items():
            self._loadDbFile(os_name, db_filepath)
            self._os_name = os_name

    def update(self, binary_info):
        """When analyzing a file (not a dump), build an IAT address map via LIEF."""
        self._is_buffer = binary_info.is_buffer
        if self._is_buffer:
            return
        lief_binary = lief.parse(binary_info.raw_data)
        if not isinstance(lief_binary, lief.PE.Binary):
            return
        iat_map = self._api_map["lief"]
        for imported_library in lief_binary.imports:
            library_name = imported_library.name.lower()
            for func in imported_library.entries:
                if func.name:
                    iat_map[func.iat_address + binary_info.base_addr] = (library_name, func.name)
                elif func.is_ordinal:
                    resolved_ordinal = OrdinalHelper.resolveOrdinal(library_name, func.ordinal)
                    ordinal_name = resolved_ordinal if resolved_ordinal else "#%s" % func.ordinal
                    iat_map[func.iat_address + binary_info.base_addr] = (library_name, ordinal_name)

    def setOsName(self, os_name):
        self._os_name = os_name

    def _loadDbFile(self, os_name, db_filepath):
        """Load an ApiScout collection file into an address -> (dll, api) map."""
        if not os.path.isfile(db_filepath):
            LOGGER.error("Can't find ApiScout collection file: \"%s\" -- continuing without ApiResolver.", db_filepath)
            return
        with open(db_filepath, "r") as f_json:
            api_db = json.loads(f_json.read())
        num_apis_loaded = 0
        api_map = {}
        for dll_entry in api_db["dlls"]:
            LOGGER.debug(" building address map for: %s", dll_entry)
            for export in api_db["dlls"][dll_entry]["exports"]:
                num_apis_loaded += 1
                api_name = "%s" % (export["name"])
                if api_name == "None":
                    api_name = "None<{}>".format(export["ordinal"])
                # DB keys look like "<os>_<arch>_<dllname>"; keep everything after the 2nd "_"
                dll_name = "_".join(dll_entry.split("_")[2:])
                bitness = api_db["dlls"][dll_entry]["bitness"]
                self._has_64bit |= bitness == 64
                base_address = api_db["dlls"][dll_entry]["base_address"]
                api_map[base_address + export["address"]] = (dll_name, api_name)
        LOGGER.debug("loaded %d exports from %d DLLs (%s).", num_apis_loaded, len(api_db["dlls"]), api_db["os_name"])
        self._api_map[os_name] = api_map

    def isApiProvider(self):
        """Returns whether the get_api(..) function of the AbstractLabelProvider is functional"""
        return True

    def getApi(self, to_addr, absolute_addr):
        """If the LabelProvider has any information about a used API for the given address, return (dll, api), else return (None, None)"""
        # if we work on a dump, use ApiScout method:
        if self._is_buffer:
            if self._os_name and self._os_name in self._api_map:
                return self._api_map[self._os_name].get(absolute_addr, (None, None))
            return (None, None)
        # otherwise take import table info from LIEF
        return self._api_map["lief"].get(to_addr, (None, None))
class BackendInterface(object):
    """Abstract interface that concrete disassembler backends must implement."""

    def __init__(self):
        pass

    def getArchitecture(self):
        raise NotImplementedError

    def getBitness(self):
        raise NotImplementedError

    def getFunctions(self):
        raise NotImplementedError

    def getBlocks(self, function_offset):
        raise NotImplementedError

    def getCodeInRefs(self, offset):
        raise NotImplementedError

    def getCodeOutRefs(self, offset):
        raise NotImplementedError

    def getInstructionBytes(self, offset):
        raise NotImplementedError

    def getFunctionSymbols(self, demangle=False):
        raise NotImplementedError

    def getBaseAddr(self):
        raise NotImplementedError

    def getBinary(self):
        raise NotImplementedError

    def getApiOffsets(self):
        raise NotImplementedError


# --- smda/ida/IdaExporter.py ---
import datetime

from capstone import Cs, CS_ARCH_X86, CS_MODE_32, CS_MODE_64

from smda.DisassemblyResult import DisassemblyResult
from .IdaInterface import IdaInterface


class IdaExporter(object):
    """Collects data from a running IDA session and converts it into a SMDA DisassemblyResult."""

    def __init__(self, config, bitness=None):
        self.config = config
        self.ida_interface = IdaInterface()
        self.bitness = bitness if bitness else self.ida_interface.getBitness()
        self.capstone = None
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = config.VERSION
        self._initCapstone()

    def _initCapstone(self):
        # decode as 32bit unless the IDB is 64bit
        mode = CS_MODE_64 if self.bitness == 64 else CS_MODE_32
        self.capstone = Cs(CS_ARCH_X86, mode)

    def _convertIdaInsToSmda(self, offset, instruction_bytes):
        """Disassemble a single instruction via capstone; emit a placeholder on failure."""
        decoded = list(self.capstone.disasm_lite(instruction_bytes, offset))
        if decoded:
            i_address, i_size, i_mnemonic, i_op_str = decoded[0]
            return (i_address, i_size, i_mnemonic, i_op_str, instruction_bytes)
        # record error and emit placeholder instruction
        bytes_as_hex = bytearray(instruction_bytes).hex()
        print("missing capstone disassembly output at 0x%x (%s)" % (offset, bytes_as_hex))
        self.disassembly.errors[offset] = {
            "type": "capstone disassembly failure",
            "instruction_bytes": bytes_as_hex
        }
        return (offset, len(instruction_bytes), "error", "error", bytearray(instruction_bytes))

    def analyzeBuffer(self, binary_info, cb_analysis_timeout=None):
        """ instead of performing a full analysis, simply collect all data from IDA and convert it into a report """
        self.disassembly.analysis_start_ts = datetime.datetime.now(datetime.timezone.utc)
        self.disassembly.binary_info = binary_info
        self.disassembly.binary_info.architecture = self.ida_interface.getArchitecture()
        if not self.disassembly.binary_info.base_addr:
            self.disassembly.binary_info.base_addr = self.ida_interface.getBaseAddr()
        if not self.disassembly.binary_info.binary:
            self.disassembly.binary_info.binary = self.ida_interface.getBinary()
        if not self.disassembly.binary_info.bitness:
            self.disassembly.binary_info.bitness = self.bitness
        self.disassembly.function_symbols = self.ida_interface.getFunctionSymbols()
        api_map = self.ida_interface.getApiMap()
        for function_offset in self.ida_interface.getFunctions():
            if self.ida_interface.isExternalFunction(function_offset):
                continue
            converted_function = []
            for block in self.ida_interface.getBlocks(function_offset):
                converted_block = []
                for instruction_offset in block:
                    ins_bytes = self.ida_interface.getInstructionBytes(instruction_offset)
                    smda_instruction = self._convertIdaInsToSmda(instruction_offset, ins_bytes)
                    converted_block.append(smda_instruction)
                    self.disassembly.instructions[smda_instruction[0]] = (smda_instruction[2], smda_instruction[1])
                    for in_ref in self.ida_interface.getCodeInRefs(smda_instruction[0]):
                        self.disassembly.addCodeRefs(in_ref[0], in_ref[1])
                    for out_ref in self.ida_interface.getCodeOutRefs(smda_instruction[0]):
                        self.disassembly.addCodeRefs(out_ref[0], out_ref[1])
                        if out_ref[1] in api_map:
                            self.disassembly.addr_to_api[instruction_offset] = api_map[out_ref[1]]
                converted_function.append(converted_block)
            self.disassembly.functions[function_offset] = converted_function
            if self.disassembly.isRecursiveFunction(function_offset):
                self.disassembly.recursive_functions.add(function_offset)
            if self.disassembly.isLeafFunction(function_offset):
                self.disassembly.leaf_functions.add(function_offset)
        self.disassembly.analysis_end_ts = datetime.datetime.now(datetime.timezone.utc)
        return self.disassembly


# --- smda/ida/IdaInterface.py ---
import re

from .BackendInterface import BackendInterface

try:
    import idaapi
    import idautils
except:
    pass

try:
    # we only need these when we are in IDA - IDA 7.4 and above
    import ida_idaapi
    import ida_funcs
    import ida_gdl
    import ida_bytes
    import ida_nalt
    import ida_segment
    import ida_name
except:
    pass

try:
    # we only need these when we are in IDA - IDA 7.3 and below
    import idc
except:
    pass


class IdaInterface(object):
    """Version-dispatching singleton that forwards to the matching concrete interface."""
    # derived from https://python-3-patterns-idioms-test.readthedocs.io/en/latest/Singleton.html
    instance = None

    def __init__(self):
        if not IdaInterface.instance:
            if idaapi.IDA_SDK_VERSION >= 740:
                IdaInterface.instance = Ida74Interface()
            else:
                IdaInterface.instance = Ida73Interface()

    def __getattr__(self, name):
        # delegate everything to the concrete singleton
        return getattr(self.instance, name)

    def getIdbDir(self):
        return idautils.GetIdbDir()


class Ida74Interface(BackendInterface):
    """Backend implementation for IDA Pro 7.4 and above."""

    def __init__(self):
        self.version = "IDA Pro 7.4"
        self._processor_map = {
            "metapc": "intel"
        }
        self._api_map = {}
        self._import_module_name = ""
56 | 57 | def getArchitecture(self): 58 | # https://reverseengineering.stackexchange.com/a/11398 59 | info = ida_idaapi.get_inf_structure() 60 | if idaapi.IDA_SDK_VERSION >= 800: 61 | procname = info.procname 62 | else: 63 | procname = info.procName 64 | if procname in self._processor_map: 65 | return self._processor_map[procname] 66 | else: 67 | raise ValueError("Unsupported Architecture") 68 | 69 | def getBitness(self): 70 | # https://reverseengineering.stackexchange.com/a/11398 71 | bits = None 72 | info = ida_idaapi.get_inf_structure() 73 | if info.is_64bit(): 74 | bits = 64 75 | elif info.is_32bit(): 76 | bits = 32 77 | else: 78 | bits = 16 79 | return bits 80 | 81 | def getFunctions(self): 82 | return sorted([offset for offset in idautils.Functions()]) 83 | 84 | def getBlocks(self, function_offset): 85 | blocks = [] 86 | function_chart = ida_gdl.FlowChart(ida_funcs.get_func(function_offset)) 87 | for block in function_chart: 88 | extracted_block = [] 89 | for instruction in idautils.Heads(block.start_ea, block.end_ea): 90 | if ida_bytes.is_code(ida_bytes.get_flags(instruction)): 91 | extracted_block.append(instruction) 92 | if extracted_block: 93 | blocks.append(extracted_block) 94 | return sorted(blocks) 95 | 96 | def getInstructionBytes(self, offset): 97 | ins = idautils.DecodeInstruction(offset) 98 | ins_bytes = ida_bytes.get_bytes(offset, ins.size) 99 | return ins_bytes 100 | 101 | def getCodeInRefs(self, offset): 102 | return [(ref_from, offset) for ref_from in idautils.CodeRefsTo(offset, True)] 103 | 104 | def getCodeOutRefs(self, offset): 105 | return [(offset, ref_to) for ref_to in idautils.CodeRefsFrom(offset, True)] 106 | 107 | def getFunctionSymbols(self, demangle=False): 108 | function_symbols = {} 109 | function_offsets = self.getFunctions() 110 | for function_offset in function_offsets: 111 | function_name = ida_funcs.get_func_name(function_offset) 112 | # apply demangling if required 113 | if demangle and "@" in function_name: 114 | demangled = 
ida_name.demangle_name(function_name, 0) 115 | if demangled: 116 | function_name = demangled 117 | if not re.match("sub_[0-9a-fA-F]+", function_name): 118 | function_symbols[function_offset] = function_name 119 | return function_symbols 120 | 121 | def getBaseAddr(self): 122 | base_addr = 0 123 | segment_starts = [ea for ea in idautils.Segments()] 124 | if segment_starts: 125 | first_segment_start = segment_starts[0] 126 | # re-align by 0x10000 to reflect typically allocation behaviour for IDA-mapped binaries 127 | first_segment_start = (first_segment_start / 0x10000) * 0x10000 128 | base_addr = int(first_segment_start) 129 | return base_addr 130 | 131 | def getBinary(self): 132 | result = b"" 133 | segment = ida_segment.get_first_seg() 134 | while segment: 135 | result += ida_bytes.get_bytes(segment.start_ea, segment.end_ea - segment.start_ea) 136 | segment = ida_segment.get_next_seg(segment.end_ea) 137 | return result 138 | 139 | def getApiMap(self): 140 | self._api_map = {} 141 | num_imports = ida_nalt.get_import_module_qty() 142 | for i in range(0, num_imports): 143 | self._import_module_name = ida_nalt.get_import_module_name(i) 144 | ida_nalt.enum_import_names(i, self._cbEnumImports) 145 | return self._api_map 146 | 147 | def isExternalFunction(self, function_offset): 148 | function_segment = ida_segment.getseg(function_offset) 149 | function_segment_name = ida_segment.get_segm_name(function_segment) 150 | is_extern = function_segment_name in ["extern", "UNDEF"] 151 | return is_extern 152 | 153 | def makeFunction(self, instruction): 154 | return ida_funcs.add_func(instruction) 155 | 156 | def makeNameEx(self, address, name, warning_level=None): 157 | if warning_level is None: 158 | warning_level=idc.SN_NOWARN 159 | return idc.set_name(address, name, warning_level) 160 | 161 | def _cbEnumImports(self, addr, name, ordinal): 162 | # potentially use: idc.Name(addr) 163 | if self._import_module_name: 164 | self._api_map[addr] = self._import_module_name + "!" 
class Ida73Interface(BackendInterface):
    """Backend implementation for IDA Pro 7.3 and below (legacy idc/idaapi names)."""

    def __init__(self):
        self.version = "IDA Pro 7.3 and below"
        self._processor_map = {
            "metapc": "intel"
        }
        self._api_map = {}
        self._import_module_name = ""

    def getArchitecture(self):
        """Map IDA's processor name to a SMDA architecture string; raise ValueError if unsupported."""
        # https://reverseengineering.stackexchange.com/a/11398
        info = idaapi.get_inf_structure()
        procname = info.procName
        if procname in self._processor_map:
            return self._processor_map[procname]
        else:
            raise ValueError("Unsupported Architecture")

    def getBitness(self):
        """Return 64/32/16 depending on the loaded IDB."""
        # https://reverseengineering.stackexchange.com/a/11398
        bits = None
        info = idaapi.get_inf_structure()
        if info.is_64bit():
            bits = 64
        elif info.is_32bit():
            bits = 32
        else:
            bits = 16
        return bits

    def getFunctions(self):
        """All function start addresses known to IDA, sorted ascending."""
        return sorted([offset for offset in idautils.Functions()])

    def getBlocks(self, function_offset):
        """Return the function's basic blocks as sorted lists of code head addresses."""
        blocks = []
        function_chart = idaapi.FlowChart(idaapi.get_func(function_offset))
        for block in function_chart:
            extracted_block = []
            for instruction in idautils.Heads(block.startEA, block.endEA):
                if idc.isCode(idc.GetFlags(instruction)):
                    extracted_block.append(instruction)
            if extracted_block:
                blocks.append(extracted_block)
        return sorted(blocks)

    def getInstructionBytes(self, offset):
        """Raw bytes of the instruction at offset."""
        ins = idautils.DecodeInstruction(offset)
        ins_bytes = idc.get_bytes(offset, ins.size)
        return ins_bytes

    def getCodeInRefs(self, offset):
        """All (source, offset) code references leading to offset."""
        return [(ref_from, offset) for ref_from in idautils.CodeRefsTo(offset, True)]

    def getCodeOutRefs(self, offset):
        """All (offset, destination) code references originating at offset."""
        return [(offset, ref_to) for ref_to in idautils.CodeRefsFrom(offset, True)]

    def getFunctionSymbols(self, demangle=False):
        """Collect user-defined function names (skipping auto-generated sub_* names).

        FIX: idc.demangle_name() can return None, which previously made
        re.match(..., None) raise TypeError - guard like Ida74Interface does.
        """
        function_symbols = {}
        function_offsets = self.getFunctions()
        for function_offset in function_offsets:
            function_name = idc.GetFunctionName(function_offset)
            # apply demangling if required
            if demangle and "@" in function_name:
                demangled = idc.demangle_name(function_name, 0)
                if demangled:
                    function_name = demangled
            if not re.match("sub_[0-9a-fA-F]+", function_name):
                function_symbols[function_offset] = function_name
        return function_symbols

    def getBaseAddr(self):
        """Derive the image base: first segment start aligned down to 0x10000.

        FIX: use integer floor division instead of float '/' (precision loss for
        addresses above 2**53) and guard against an empty segment list, matching
        Ida74Interface behavior.
        """
        segment_starts = [ea for ea in idautils.Segments()]
        if not segment_starts:
            return 0
        # re-align by 0x10000 to reflect typically allocation behaviour for IDA-mapped binaries
        return (segment_starts[0] // 0x10000) * 0x10000

    def getBinary(self):
        """Concatenate the raw bytes of all segments.

        NOTE: the original also collected (start, offset) bookkeeping into unused
        locals; that dead code was removed.
        """
        result = b""
        for start in idautils.Segments():
            end = idc.SegEnd(start)
            result += idc.get_bytes(start, end - start)
        return result

    def getApiMap(self):
        """Build a map of import thunk address -> 'module!api' label."""
        self._api_map = {}
        num_imports = idaapi.get_import_module_qty()
        for i in range(0, num_imports):
            self._import_module_name = idaapi.get_import_module_name(i)
            idaapi.enum_import_names(i, self._cbEnumImports)
        return self._api_map

    def isExternalFunction(self, function_offset):
        # TODO look up older function names to support this for IDA 7.3- as well
        return False

    def makeFunction(self, instruction):
        return idc.add_func(instruction)

    def makeNameEx(self, address, name, warning_level=None):
        if warning_level is None:
            warning_level = idc.SN_NOWARN
        return idc.set_name(address, name, warning_level)

    def _cbEnumImports(self, addr, name, ordinal):
        # potentially use: idc.Name(addr)
        if self._import_module_name:
            self._api_map[addr] = self._import_module_name + "!" + name
        else:
            self._api_map[addr] = name
        return True


# --- smda/intel/BitnessAnalyzer.py ---
import re
import struct
import logging
from collections import Counter

from .definitions import COMMON_START_BYTES

LOGGER = logging.getLogger(__name__)


class BitnessAnalyzer(object):
    """Heuristic 32/64bit detection based on the first bytes of call destinations."""

    def determineBitnessFromFile(self, filepath):
        LOGGER.debug("Running Bitness test on %s", filepath)
        with open(filepath, "rb") as infile:
            if re.search(r"[0-9a-fA-F]{64}_dump_0x[0-9a-fA-F]+$", filepath):
                return self.determineBitness(binary=infile.read())
        return 0

    def determineBitnessFromDisassembly(self, disassembly):
        LOGGER.debug("Running Bitness test on binary data of DisassemblyResult")
        return self.determineBitness(binary=disassembly.binary_info.binary)

    def determineBitness(self, binary):
        """Score candidate call destinations' first bytes against per-bitness
        prologue statistics and return 32 or 64.

        FIX(perf): the original scanned the whole binary once per bitness although
        the scan itself is bitness-independent - scan once and reuse the counter.
        """
        candidate_first_bytes = Counter()
        # check for potential call instructions (0xE8 rel32) and collect the
        # first byte found at each plausible in-image call destination
        for call_match in re.finditer(b"\xE8", binary):
            if len(binary) - call_match.start() > 5:
                packed_call = binary[call_match.start() + 1:call_match.start() + 5]
                rel_call_offset = struct.unpack("i", packed_call)[0]
                call_destination = (rel_call_offset + call_match.start() + 5)  # & bitmask
                if call_destination > 0 and call_destination < len(binary):
                    candidate_first_bytes[binary[call_destination]] += 1
        score = {"32": 0, "64": 0}
        for bitness in ["32", "64"]:
            # NOTE: as in the original, only the presence of a first byte counts,
            # not how often it occurred
            for candidate_sequence in candidate_first_bytes:
                if isinstance(candidate_sequence, int):
                    candidate_sequence = "%02x" % candidate_sequence
                elif isinstance(candidate_sequence, str):
                    # Python 2 legacy branch; indexing bytes yields int on Python 3
                    candidate_sequence = candidate_sequence.encode("hex")
                for common_sequence, sequence_score in COMMON_START_BYTES[bitness].items():
                    if candidate_sequence == str(common_sequence):
                        score[bitness] += sequence_score * 1.0
        total_score = max(score["32"] + score["64"], 1)
        score["32"] /= total_score
        score["64"] /= total_score
        LOGGER.debug("Bitness scores: %5.2f (32bit), %5.2f (64bit)", score["32"], score["64"])
        return 64 if score["32"] < score["64"] else 32
from binascii import hexlify

# NOTE: COMMON_PROLOGUES is imported from .definitions at the top of this module.


class FunctionCandidate(object):
    """Scoring container for a potential function start address.

    Aggregates evidence (call references, prologue bytes, symbols, language
    specifics, alignment) into a score and a confidence estimate.
    """

    def __init__(self, binary_info, addr):
        self.bitness = binary_info.bitness
        self.addr = addr
        rel_start_addr = addr - binary_info.base_addr
        # first 5 bytes at the candidate address, used for prologue matching
        self.bytes = binary_info.binary[rel_start_addr:rel_start_addr + 5]
        self.lang_spec = None
        self.call_ref_sources = []
        self.finished = False
        self.is_symbol = False
        self.is_gap_candidate = False
        self.is_tailcall = False
        # FIX: check the stricter 16-byte alignment first; the original tested
        # addr % 4 first, which made the 16-alignment branch unreachable
        # (every 16-aligned address is also 4-aligned).
        self.alignment = 0
        if addr % 16 == 0:
            self.alignment = 16
        elif addr % 4 == 0:
            self.alignment = 4
        self.analysis_aborted = False
        self.abortion_reason = ""
        self._score = None
        self._tfidf_score = None
        self._confidence = None
        self.function_start_score = None
        self.is_stub = False
        self.is_initial_candidate = False
        self.is_exception_handler = False

    def setTfIdf(self, tfidf_score):
        self._tfidf_score = tfidf_score

    def getTfIdf(self):
        """Return the rounded TF-IDF score, or None if it was never set.

        FIX: previously round(None, 3) raised TypeError when no score was set.
        """
        if self._tfidf_score is None:
            return None
        return round(self._tfidf_score, 3)

    def getConfidence(self):
        """Lazily compute a confidence in [0, 1] that this candidate is a function."""
        if self._confidence is None:
            # based on evaluation over Andriesse, Bao, and Plohmann data sets
            weighted_confidence = 0.298 * (1 if self.hasCommonFunctionStart() else 0)
            if self._tfidf_score is not None:
                weighted_confidence += (
                    0.321 * (1 if self._tfidf_score < 0 else 0) +
                    0.124 * (1 if self._tfidf_score < -2 else 0) +
                    0.120 * (1 if self._tfidf_score < -4 else 0) +
                    0.101 * (1 if self._tfidf_score < -1 else 0) +
                    0.025 * (1 if self._tfidf_score < -8 else 0)
                )
            # above experiments show that multiple inbound call references are basically always indeed functions
            if len(self.call_ref_sources) > 1:
                self._confidence = 1.0
            # initially recognized candidates are also almost always functions as they follow this heuristic
            elif self.is_initial_candidate:
                self._confidence = round(0.5 + 0.5 * (weighted_confidence), 3)
            else:
                self._confidence = round(weighted_confidence, 3)
        return self._confidence

    def hasCommonFunctionStart(self):
        """True if the candidate bytes begin with a known prologue for this bitness."""
        for length in sorted([int(l) for l in COMMON_PROLOGUES], reverse=True):
            byte_sequence = self.bytes[:length]
            if byte_sequence in COMMON_PROLOGUES["%d" % length][self.bitness]:
                return True
        return False

    def getFunctionStartScore(self):
        """Lazily compute the prologue score (0 if no common prologue matches)."""
        if self.function_start_score is None:
            for length in sorted([int(l) for l in COMMON_PROLOGUES], reverse=True):
                byte_sequence = self.bytes[:length]
                if byte_sequence in COMMON_PROLOGUES["%d" % length][self.bitness]:
                    self.function_start_score = COMMON_PROLOGUES["%d" % length][self.bitness][byte_sequence]
                    break
            self.function_start_score = self.function_start_score if self.function_start_score else 0
        return self.function_start_score

    def addCallRef(self, source_addr):
        """Register an inbound call reference (deduplicated); invalidates the score."""
        if source_addr not in self.call_ref_sources:
            self.call_ref_sources.append(source_addr)
        self._score = None

    def removeCallRefs(self, source_addrs):
        """Remove the given inbound call references; invalidates the score."""
        for addr in source_addrs:
            if addr in self.call_ref_sources:
                self.call_ref_sources.remove(addr)
        self._score = None

    def setIsTailcallCandidate(self, is_tailcall):
        self.is_tailcall = is_tailcall

    def setInitialCandidate(self, initial):
        self.is_initial_candidate = initial

    def setIsGapCandidate(self, gap):
        self.is_gap_candidate = gap

    def setLanguageSpec(self, lang_spec):
        self.lang_spec = lang_spec
        self._score = None

    def setIsSymbol(self, is_symbol):
        self.is_symbol = is_symbol
        self._score = None

    def setIsExceptionHandler(self, is_exception_handler):
        self.is_exception_handler = is_exception_handler
        self._score = None

    def setIsStub(self, is_stub):
        self.is_stub = is_stub
        self._score = None

    def setAnalysisAborted(self, reason):
        self.finished = True
        self.analysis_aborted = True
        self.abortion_reason = reason

    def setAnalysisCompleted(self):
        self.finished = True

    def isFinished(self):
        return self.finished

    def calculateScore(self):
        """Combine all evidence into an integer score (higher = more likely a function)."""
        score = 0
        score += 10000 if self.is_symbol else 0
        score += 5000 if self.is_exception_handler else 0
        score += 1000 if self.is_stub else 0
        score += 100 if self.lang_spec is not None else 0
        score += self.getFunctionStartScore()
        num_call_refs = len(self.call_ref_sources)
        # cap the influence of many references, they dominate quickly otherwise
        if num_call_refs >= 10:
            call_ref_score = 10 + int(num_call_refs / 10)
        else:
            call_ref_score = num_call_refs
        score += 10 * call_ref_score
        score += 1 if self.alignment else 0
        return score

    def getScore(self):
        if self._score is None:
            self._score = self.calculateScore()
        return self._score

    def __lt__(self, other):
        # order by score; ties are broken by preferring the lower address
        own_score = self.getScore()
        other_score = other.getScore()
        if own_score == other_score:
            return self.addr > other.addr
        return own_score < other_score

    def getCharacteristics(self):
        """Compact flag string summarizing all candidate properties."""
        is_aligned = "a" if self.alignment else "-"
        is_finished = "f" if self.finished else "-"
        is_gap = "g" if self.is_gap_candidate else "-"
        is_initial = "i" if self.is_initial_candidate else "-"
        is_lang_spec = "l" if self.lang_spec is not None else "-"
        is_prologue = "p" if self.hasCommonFunctionStart() else "-"
        is_ref = "r" if self.call_ref_sources else "-"
        is_symbol = "s" if self.is_symbol else "-"
        is_tailcall = "t" if self.is_tailcall else "-"
        is_stub = "u" if self.is_stub else "-"
        is_aborted = "x" if self.analysis_aborted else "-"
        characteristics = is_initial + is_symbol + is_stub + is_aligned + is_lang_spec + is_prologue + is_ref + is_tailcall + is_gap + is_finished + is_aborted
        return characteristics

    def __str__(self):
        characteristics = self.getCharacteristics()
        prologue_score = "%d" % self.getFunctionStartScore()
        ref_summary = "{}".format(len(self.call_ref_sources)) if len(self.call_ref_sources) != 1 else "{}: 0x{:x}".format(len(self.call_ref_sources), self.call_ref_sources[0])
        return "0x{:x}: {} -> {} (total score: {}), inref: {} | {}".format(self.addr, hexlify(self.bytes), prologue_score, self.getScore(), ref_summary, characteristics)

    def toJson(self):
        """Serialize the candidate's evidence and scores into a plain dict."""
        return {
            "addr": self.addr,
            "bytes": self.bytes.hex(),
            "alignment": self.alignment,
            "reason": self.abortion_reason,
            "num_refs": len(self.call_ref_sources),
            "characteristics": self.getCharacteristics(),
            "prologue_score": self.getFunctionStartScore(),
            "score": self.calculateScore(),
            "confidence": self.getConfidence()
        }
# --- smda/intel/IndirectCallAnalyzer.py ---
import struct
import re
import logging

LOGGER = logging.getLogger(__name__)


class IndirectCallAnalyzer(object):
    """ Perform basic dataflow analysis to resolve indirect call targets """

    def __init__(self, disassembler):
        # disassembler drives the analysis and owns the shared DisassemblyResult
        self.disassembler = disassembler
        self.disassembly = self.disassembler.disassembly
        # address of the "call <register>" instruction currently being resolved
        self.current_calling_addr = 0
        # FunctionAnalysisState of the function currently being analyzed
        self.state = None

    def searchBlock(self, analysis_state, address):
        """Return the basic block (list of instruction tuples) containing `address`, or [] if none found."""
        for block in analysis_state.getBlocks():
            if address in [i[0] for i in block]:
                return block
        return []

    def getDword(self, addr):
        """Read a little-endian unsigned 32bit value at `addr` from the mapped image, or None if out of range."""
        if not self.disassembly.isAddrWithinMemoryImage(addr):
            return None
        return struct.unpack("I", self.disassembly.getBytes(addr, 4))[0]

    def processBlock(self, analysis_state, block, registers, register_name, processed, depth):
        """
        Walk `block` backwards, tracking which absolute value ends up in `register_name`.

        Instruction tuples are (address, size, mnemonic, operand_string) — grounded by
        the `rip = ins[0] + ins[1]` computation below and the mnemonic/operand usage.

        :param registers: dict mapping register name -> tracked absolute value
        :param processed: blocks already visited (cycle protection); mutated in place
        :param depth: remaining number of predecessor-block levels to recurse into
        :return: True once a call target was derived for the current call site,
                 False for empty/already-seen blocks (None if nothing was found).

        NOTE(review): all operand regexes use `[a-z]{3}`, i.e. only three-letter
        register names (eax, rbx, ...) are tracked; r8-r15 and 16-bit registers
        never match — confirm whether this is intentional.
        """
        if not block:
            return False
        if block in processed:
            LOGGER.debug("already processed block 0x%08x; skipping", block[0][0])
            return False
        processed.append(block)
        LOGGER.debug("start processing block: 0x%08x\nlooking for register %s", block[0][0], register_name)
        abs_value_found = False
        for ins in reversed(block):
            LOGGER.debug("0x%08x: %s %s", ins[0], ins[2], ins[3])
            if ins[2] == "mov":
                # mov <reg>, <reg> -- the value we track was copied from another register
                match1 = re.match(r"(?P<reg1>[a-z]{3}), (?P<reg2>[a-z]{3})$", ins[3])
                if match1:
                    if match1.group("reg1") == register_name:
                        # continue the backwards search for the source register instead
                        register_name = match1.group("reg2")
                # mov <reg>, <immediate value>
                match2 = re.match(r"(?P<reg>[a-z]{3}), (?P<val>0x[0-9a-f]{,8})$", ins[3])
                if match2:
                    registers[match2.group("reg")] = int(match2.group("val"), 16)
                    LOGGER.debug("**moved value 0x%08x to register %s", int(match2.group("val"), 16), match2.group("reg"))
                    if match2.group("reg") == register_name:
                        abs_value_found = True
                # mov <reg>, dword ptr [<absolute addr>]
                match3 = re.match(r"(?P<reg>[a-z]{3}), dword ptr \[(?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match3:
                    # HACK: test to see if the address points to a import and
                    # use that instead of the actual memory value
                    addr = int(match3.group("addr"), 16)
                    dll, api = self.disassembler.resolveApi(addr, addr)
                    if dll or api:
                        registers[match3.group("reg")] = addr
                        LOGGER.debug("**moved API ref (%s:%s) @0x%08x to register %s", dll, api, addr, match3.group("reg"))
                        if match3.group("reg") == register_name:
                            abs_value_found = True
                    else:
                        # not an import: dereference the memory location instead
                        dword = self.getDword(addr)
                        if dword:
                            registers[match3.group("reg")] = dword
                            LOGGER.debug("**moved value 0x%08x to register %s", dword, match3.group("reg"))
                            if match3.group("reg") == register_name:
                                abs_value_found = True
                # mov <reg>, qword ptr [rip + <offset>] -- x64 RIP-relative load
                match4 = re.match(r"(?P<reg>[a-z]{3}), qword ptr \[rip \+ (?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match4:
                    # rip points past the current instruction: address + size
                    rip = ins[0] + ins[1]
                    dword = self.getDword(rip + int(match4.group("addr"), 16))
                    if dword:
                        registers[match4.group("reg")] = rip + dword
                        LOGGER.debug("**moved value 0x%08x + 0x%08x == 0x%08x to register %s", rip, dword, rip + dword, match4.group("reg"))
                        if match4.group("reg") == register_name:
                            abs_value_found = True
            elif ins[2] == "lea":
                LOGGER.debug("*checking %s %s", ins[2], ins[3])
                # lea <reg>, dword ptr [<absolute addr>]
                match1 = re.match(r"(?P<reg>[a-z]{3}), dword ptr \[(?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match1:
                    dword = self.getDword(int(match1.group("addr"), 16))
                    if dword:
                        registers[match1.group("reg")] = dword
                        LOGGER.debug("**moved value 0x%08x to register %s", dword, match1.group("reg"))
                        if match1.group("reg") == register_name:
                            abs_value_found = True
                # lea <reg>, [<absolute addr>]
                match1 = re.match(r"(?P<reg>[a-z]{3}), \[(?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match1:
                    dword = self.getDword(int(match1.group("addr"), 16))
                    if dword:
                        registers[match1.group("reg")] = dword
                        LOGGER.debug("**moved value 0x%08x to register %s", dword, match1.group("reg"))
                        if match1.group("reg") == register_name:
                            abs_value_found = True
                # not handled: lea <reg>, dword ptr [<reg> +- <offset>]
                # requires state-keeping of multiple registers
                # there exist potentially many more way how the register being called can be calculated
                # for now we ignore them
            elif ins[2] == "other instruction":
                # placeholder branch: no real mnemonic equals this literal, so it never fires
                pass
            # if the absolute value was found for the call instruction, detect API
            if abs_value_found:
                candidate = registers[register_name] if register_name in registers else None
                self.state.setLeaf(False)
                if candidate:
                    LOGGER.debug("candidate: 0x%x - %s, register: %s", candidate, ins[3], register_name)
                    dll, api = self.disassembler.resolveApi(candidate, candidate)
                    if dll or api:
                        LOGGER.debug("successfully resolved: %s %s", dll, api)
                        api_entry = {"referencing_addr": [], "dll_name": dll, "api_name": api}
                        if candidate in self.disassembly.apis:
                            api_entry = self.disassembly.apis[candidate]
                        if self.current_calling_addr not in api_entry["referencing_addr"]:
                            api_entry["referencing_addr"].append(self.current_calling_addr)
                        self.disassembly.apis[candidate] = api_entry
                    elif self.disassembly.isAddrWithinMemoryImage(candidate):
                        # in-image target: treat it as a new function candidate
                        LOGGER.debug("successfully resolved: 0x%x", candidate)
                        self.disassembler.fc_manager.addCandidate(candidate, reference_source=self.current_calling_addr)
                    else:
                        LOGGER.debug("candidate not resolved")
                else:
                    LOGGER.debug("no candidate to resolved")

                return True
        # process previous blocks
        if depth >= 0:
            # predecessor blocks we have not visited yet (by their first-instruction address)
            refs_in = [
                fr for (fr, to) in analysis_state.code_refs
                if to == block[0][0] and
                fr not in [ins[0] for block in processed for ins in block]
            ]
            LOGGER.debug("start processing previous blocks, searching in %d in_refs with remaining depth: %d", len(refs_in), depth - 1)
            # recurse with a copy of the register state so siblings do not interfere
            if any(self.processBlock(analysis_state, b, registers.copy(), register_name, processed, depth - 1) for b in [self.searchBlock(analysis_state, i) for i in refs_in]):
                return True

    def resolveRegisterCalls(self, analysis_state, block_depth=3):
        # after block reconstruction do simple data flow analysis to resolve open cases like "call <reg>" as stored in self.call_register_ins
        if analysis_state.call_register_ins:
            LOGGER.debug("Trying to resolve %d register calls in function: 0x%x", len(analysis_state.call_register_ins), analysis_state.start_addr)
            # NOTE(review): max_calls_per_block is only used in the debug message below;
            # the effective cap is `max_calls` (config value or 50) — the two can disagree.
            max_calls_per_block = 10
            calls_per_block = {}
            for calling_addr in analysis_state.call_register_ins:
                LOGGER.debug("#" * 20)
                self.current_calling_addr = calling_addr
                self.state = analysis_state
                # truncate the block to instructions at or before the call site
                start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
                if not start_block:
                    # NOTE(review): this `return` abandons ALL remaining register calls of the
                    # function when one lookup fails — `continue` may have been intended; confirm.
                    return
                # we only process at most 10 register-calls per block to avoid extreme cases
                # found one Go sample with 130k register calls.
                if start_block[0] not in calls_per_block:
                    calls_per_block[start_block[0]] = 0
                calls_per_block[start_block[0]] += 1
                # if we have an old config, default to 50
                max_calls = self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK if hasattr(self.disassembler.config, 'MAX_INDIRECT_CALLS_PER_BASIC_BLOCK') else 50
                if calls_per_block[start_block[0]] > max_calls:
                    break
                LOGGER.debug("For this block, we can still analyze %d indirect calls.", max_calls_per_block - calls_per_block[start_block[0]])
                if start_block:
                    # start_block[-1][3] is the operand string of the call itself, i.e. the register name
                    self.processBlock(analysis_state, start_block, dict(), start_block[-1][3], list(), block_depth)


# --- smda/intel/__init__.py ---
# -*- coding: utf-8 -*-


# --- smda/intel/definitions.py ---
# some mnemonics as specific to capstone
CJMP_INS = ["je", "jne", "js", "jns", "jp", "jnp", "jo", "jno", "jl", "jle", "jg", "jge", "jb", "jbe", "ja", "jae", "jcxz", "jecxz", "jrcxz"]
LOOP_INS = ["loop", "loopne", "loope"]
JMP_INS = ["jmp", "ljmp"]
CALL_INS = ["call", "lcall"]
RET_INS = ["ret", "retn", "retf", "iret"]
END_INS = ["ret", "retn", "retf", "iret", "int3", "hlt"]
REGS_32BIT = ["eax", "ebx", "ecx", "edx", "esi", "edi", "ebp", "esp"]
REGS_64BIT = ["rax", "rbx", "rcx", "rdx", "rsp", "rbp", "rsi", "rdi", "rip", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"]
DOUBLE_ZERO = bytearray(b"\x00\x00")

DEFAULT_PROLOGUES = [
    b"\x8B\xFF\x55\x8B\xEC",
    b"\x89\xFF\x55\x8B\xEC",
    b"\x55\x8B\xEC",
    b"\x55\x89\xE5"
]

# these cover 99% of confirmed function starts in the reference data set
# (dict literal continues on the following lines of the file)
COMMON_PROLOGUES = {
"5": { 23 | 32: { 24 | b"\x8B\xFF\x55\x8B\xEC": 50, # mov edi, edi, push ebp, mov ebp, esp 25 | b"\x89\xFF\x55\x8B\xEC": 50, # mov edi, edi, push ebp, mov ebp, esp 26 | }, 27 | 64: {} 28 | }, 29 | "3": { 30 | 32: { 31 | b"\x55\x8B\xEC": 50, # push ebp, mov ebp, esp 32 | }, 33 | 64: {} 34 | }, 35 | "1": { 36 | 32: { 37 | b"\x55": 51, # 311150 (51.09%) -- cumulative: 51.09% 38 | b"\x8b": 10, # 62878 (10.32%) -- cumulative: 61.41% 39 | b"\x83": 7, # 46477 (7.63%) -- cumulative: 69.05% 40 | b"\x53": 6, # 38773 (6.37%) -- cumulative: 75.41% 41 | b"\x57": 5, # 36048 (5.92%) -- cumulative: 81.33% 42 | b"\x56": 5, # 31955 (5.25%) -- cumulative: 86.58% 43 | b"\xff": 4, # 24444 (4.01%) -- cumulative: 90.59% 44 | b"\xe9": 2, # 16420 (2.70%) -- cumulative: 93.29% 45 | b"\xb8": 1, # 6577 (1.08%) -- cumulative: 94.37% 46 | b"\xc3": 1, # 5638 (0.93%) -- cumulative: 95.29% 47 | b"\xa1": 1, # 4168 (0.68%) -- cumulative: 95.98% 48 | b"\x6a": 1, # 3815 (0.63%) -- cumulative: 96.60% 49 | b"\x51": 1, # 2753 (0.45%) -- cumulative: 97.06% 50 | b"\x31": 1, # 2514 (0.41%) -- cumulative: 97.47% 51 | b"\xf3": 1, # 2109 (0.35%) -- cumulative: 97.82% 52 | b"\x33": 1, # 1279 (0.21%) -- cumulative: 98.03% 53 | b"\x81": 1, # 1261 (0.21%) -- cumulative: 98.23% 54 | b"\x85": 1, # 1045 (0.17%) -- cumulative: 98.40% 55 | b"\xe8": 1, # 1005 (0.17%) -- cumulative: 98.57% 56 | b"\x8d": 1, # 896 (0.15%) -- cumulative: 98.72% 57 | b"\x68": 1, # 749 (0.12%) -- cumulative: 98.84% 58 | b"\x80": 1, # 703 (0.12%) -- cumulative: 98.95% 59 | }, 60 | 64: { 61 | b"\x55": 33, # 196922 (33.40%) -- cumulative: 33.40% 62 | b"\x48": 21, # 124360 (21.09%) -- cumulative: 54.49% 63 | b"\x41": 15, # 91785 (15.57%) -- cumulative: 70.06% 64 | b"\x53": 6, # 37559 (6.37%) -- cumulative: 76.43% 65 | b"\xff": 3, # 22877 (3.88%) -- cumulative: 80.31% 66 | b"\x40": 3, # 18018 (3.06%) -- cumulative: 83.36% 67 | b"\xe9": 2, # 15434 (2.62%) -- cumulative: 85.98% 68 | b"\x50": 1, # 11713 (1.99%) -- cumulative: 87.97% 69 | b"\x8b": 1, 
# 9130 (1.55%) -- cumulative: 89.52% 70 | b"\x4c": 1, # 6737 (1.14%) -- cumulative: 90.66% 71 | b"\xc3": 1, # 5978 (1.01%) -- cumulative: 91.67% 72 | b"\x89": 1, # 5852 (0.99%) -- cumulative: 92.66% 73 | b"\xb8": 1, # 5073 (0.86%) -- cumulative: 93.52% 74 | b"\x31": 1, # 4902 (0.83%) -- cumulative: 94.36% 75 | b"\x44": 1, # 4504 (0.76%) -- cumulative: 95.12% 76 | b"\x0f": 1, # 3196 (0.54%) -- cumulative: 95.66% 77 | b"\x83": 1, # 3120 (0.53%) -- cumulative: 96.19% 78 | b"\xf3": 1, # 2363 (0.40%) -- cumulative: 96.59% 79 | b"\xf2": 1, # 2349 (0.40%) -- cumulative: 96.99% 80 | b"\x85": 1, # 1806 (0.31%) -- cumulative: 97.30% 81 | b"\x33": 1, # 1605 (0.27%) -- cumulative: 97.57% 82 | b"\x66": 1, # 1370 (0.23%) -- cumulative: 97.80% 83 | b"\xba": 1, # 1235 (0.21%) -- cumulative: 98.01% 84 | b"\x45": 1, # 1227 (0.21%) -- cumulative: 98.22% 85 | b"\x80": 1, # 1197 (0.20%) -- cumulative: 98.42% 86 | b"\xc7": 1, # 1034 (0.18%) -- cumulative: 98.60% 87 | b"\xb0": 1, # 911 (0.15%) -- cumulative: 98.75% 88 | b"\xbf": 1, # 894 (0.15%) -- cumulative: 98.90% 89 | } 90 | } 91 | } 92 | 93 | #TODO: 2018-06-27 expand the coverage in this list 94 | # https://stackoverflow.com/questions/25545470/long-multi-byte-nops-commonly-understood-macros-or-other-notation 95 | GAP_SEQUENCES = { 96 | 1: [ 97 | b"\x90", # NOP1_OVERRIDE_NOP - AMD / nop - INTEL 98 | b"\xCC", # int3 99 | b"\x00", # pass over sequences of null bytes 100 | ], 101 | 2: [ 102 | b"\x66\x90", # NOP2_OVERRIDE_NOP - AMD / nop - INTEL 103 | b"\x8b\xc0", 104 | b"\x8b\xff", # mov edi, edi 105 | b"\x8d\x00", # lea eax, dword ptr [eax] 106 | b"\x86\xc0", # xchg al, al 107 | b"\x66\x2e", # NOP2_OVERRIDE_NOP - AMD / nop - INTEL 108 | ], 109 | 3: [ 110 | b"\x0f\x1f\x00", # NOP3_OVERRIDE_NOP - AMD / nop - INTEL 111 | b"\x8d\x40\x00", # lea eax, dword ptr [eax] 112 | b"\x8d\x00\x00", # lea eax, dword ptr [eax] 113 | b"\x8d\x49\x00", # lea ecx, dword ptr [ecx] 114 | b"\x8d\x64\x24", # lea esp, dword ptr [esp] 115 | b"\x8d\x76\x00", 116 
| b"\x66\x66\x90" 117 | ], 118 | 4: [ 119 | b"\x0f\x1f\x40\x00", # NOP4_OVERRIDE_NOP - AMD / nop - INTEL 120 | b"\x8d\x74\x26\x00", 121 | b"\x66\x66\x66\x90" 122 | ], 123 | 5: [ 124 | b"\x0f\x1f\x44\x00\x00", # NOP5_OVERRIDE_NOP - AMD / nop - INTEL 125 | b"\x90\x8d\x74\x26\x00" 126 | ], 127 | 6: [ 128 | b"\x66\x0f\x1f\x44\x00\x00", # NOP6_OVERRIDE_NOP - AMD / nop - INTEL 129 | b"\x8d\xb6\x00\x00\x00\x00" 130 | ], 131 | 7: [ 132 | b"\x0f\x1f\x80\x00\x00\x00\x00", # NOP7_OVERRIDE_NOP - AMD / nop - INTEL, 133 | b"\x8d\xb4\x26\x00\x00\x00\x00", 134 | b"\x8D\xBC\x27\x00\x00\x00\x00" 135 | ], 136 | 8: [ 137 | b"\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP8_OVERRIDE_NOP - AMD / nop - INTEL 138 | b"\x90\x8d\xb4\x26\x00\x00\x00\x00" 139 | ], 140 | 9: [ 141 | b"\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP9_OVERRIDE_NOP - AMD / nop - INTEL 142 | b"\x89\xf6\x8d\xbc\x27\x00\x00\x00\x00" 143 | ], 144 | 10: [ 145 | b"\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP10_OVERRIDE_NOP - AMD 146 | b"\x8d\x76\x00\x8d\xbc\x27\x00\x00\x00\x00", 147 | b"\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 148 | ], 149 | 11: [ 150 | b"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP11_OVERRIDE_NOP - AMD 151 | b"\x8d\x74\x26\x00\x8d\xbc\x27\x00\x00\x00\x00", 152 | b"\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 153 | ], 154 | 12: [ 155 | b"\x8d\xb6\x00\x00\x00\x00\x8d\xbf\x00\x00\x00\x00", 156 | b"\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 157 | ], 158 | 13: [ 159 | b"\x8d\xb6\x00\x00\x00\x00\x8d\xbc\x27\x00\x00\x00\x00", 160 | b"\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 161 | ], 162 | 14: [ 163 | b"\x8d\xb4\x26\x00\x00\x00\x00\x8d\xbc\x27\x00\x00\x00\x00", 164 | b"\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 165 | ], 166 | 15: [ 167 | b"\x66\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 168 | ] 169 | } 170 | 171 | 172 | COMMON_START_BYTES = { 173 | "32": { 174 | "55": 8334, 175 | "6a": 758, 176 | "56": 756, 177 | "51": 312, 178 | "8d": 566, 179 | "83": 558, 
180 | "53": 548 181 | }, 182 | "64": { 183 | "48": 1341, 184 | "40": 349, 185 | "4c": 59, 186 | "33": 56, 187 | "44": 18, 188 | "45": 17, 189 | "e9": 16, 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /smda/utility/BracketQueue.py: -------------------------------------------------------------------------------- 1 | class BracketQueue(object): 2 | """ 3 | This queue is tailored based on our research rsults regarding function entry point identification 4 | """ 5 | def __init__(self, candidates=None, initial_brackets=None): 6 | self.update_count = 0 7 | self.update_shift_count = 0 8 | self.brackets = { 9 | 0: {}, 10 | 1: {}, 11 | 2: {} 12 | } 13 | if candidates is not None: 14 | for candidate in candidates: 15 | self.add(candidate) 16 | self.ensure_order() 17 | elif initial_brackets is not None: 18 | self.brackets = initial_brackets 19 | self.ensure_order() 20 | 21 | def __iter__(self): 22 | return self 23 | 24 | def __next__(self): 25 | return self.next() 26 | 27 | def next(self): 28 | if all(len(self.brackets[i]) == 0 for i in range(3)): 29 | raise StopIteration 30 | for bracket_index in range(2, -1, -1): 31 | if self.brackets[bracket_index]: 32 | offset, candidate = self.brackets[bracket_index].popitem() 33 | return candidate 34 | 35 | def add(self, candidate): 36 | bracket_index = min(2, len(candidate.call_ref_sources)) 37 | self.brackets[bracket_index][candidate.addr] = candidate 38 | 39 | def update(self, target_candidate=None): 40 | if target_candidate: 41 | updated_bracket_index = min(2, len(target_candidate.call_ref_sources)) 42 | # check if the element is still in the same bracket, otherwise shift to next bracket 43 | self.update_count += 1 44 | for bracket_index in range(2, -1, -1): 45 | if target_candidate.addr in self.brackets[bracket_index] and bracket_index != updated_bracket_index: 46 | self.update_shift_count += 1 47 | self.brackets[bracket_index].pop(target_candidate.addr) 48 | 
                    self.brackets[updated_bracket_index][target_candidate.addr] = target_candidate
                    break

    def ensure_order(self):
        """Sort each bracket ascending by candidate score, so popitem() yields the best candidate first."""
        for bracket_index in range(2, -1, -1):
            if self.brackets[bracket_index]:
                self.brackets[bracket_index] = {offset: candidate for offset, candidate in sorted(self.brackets[bracket_index].items(), key=lambda x: x[1].getScore())}

    def __str__(self):
        return f"BracketQueue | 2: {len(self.brackets[2])} candidates, 1: {len(self.brackets[1])} candidates, 0: {len(self.brackets[0])} candidates,"


# --- smda/utility/DelphiKbFileLoader.py ---
import logging

LOGGER = logging.getLogger(__name__)


class DelphiKbFileLoader(object):
    """Loader for IDR (Interactive Delphi Reconstructor) knowledge base files."""

    @staticmethod
    def isCompatible(data):
        # identified by the fixed magic string at file start
        return data[:23] == b"IDR Knowledge Base File"

    @staticmethod
    def getBaseAddress(binary):
        # return fixed base address that will allow instruction escaping
        return 0x400000

    @staticmethod
    def mapBinary(binary):
        # KB files are used as-is, no section mapping required
        return binary

    @staticmethod
    def getBitness(binary):
        # we only support 32bit for now
        return 32

    @staticmethod
    def getCodeAreas(binary):
        # no dedicated code areas are derived from KB files
        return []


# --- smda/utility/FileLoader.py ---
import os
from smda.utility.PeFileLoader import PeFileLoader
from smda.utility.ElfFileLoader import ElfFileLoader
from smda.utility.MachoFileLoader import MachoFileLoader
from smda.utility.DelphiKbFileLoader import DelphiKbFileLoader

class FileLoader(object):
    """Dispatches a file to the first compatible format-specific loader and caches the results."""

    # NOTE(review): these are class-level attributes; the mutable default
    # _code_areas = [] is shared between instances until reassigned in _loadFile.
    _file_path = None
    _map_file = False
    _data = b""
    _raw_data = b""
    _base_addr = 0
    _bitness = 0
    _architecture = ""
    _code_areas = []
    # probed in order; first loader whose isCompatible() accepts the data wins
    file_loaders = [PeFileLoader, ElfFileLoader, MachoFileLoader, DelphiKbFileLoader]

    def __init__(self, file_path, load_file=True, map_file=False):
        self._file_path = file_path
        self._map_file = map_file
        if load_file:
            self._loadFile()

    def _loadRawFileContent(self):
        """Read the file from disk as bytes.

        NOTE(review): returns the str "" (not b"") when the path is missing,
        while the happy path returns bytes — confirm callers tolerate this.
        """
        binary = ""
        if os.path.isfile(self._file_path):
            with open(self._file_path, "rb") as inf:
                binary = inf.read()
        return binary

    def _loadFile(self, buffer=None):
        """Populate raw/mapped data and metadata, optionally from an in-memory buffer."""
        self._raw_data = buffer if buffer is not None else self._loadRawFileContent()
        if self._map_file:
            for loader in self.file_loaders:
                if loader.isCompatible(self._raw_data):
                    self._data = loader.mapBinary(self._raw_data)
                    self._base_addr = loader.getBaseAddress(self._raw_data)
                    self._bitness = loader.getBitness(self._raw_data)
                    self._code_areas = loader.getCodeAreas(self._raw_data)
                    self._architecture = loader.getArchitecture(self._raw_data)
                    break
        else:
            # no mapping requested: expose the raw bytes directly
            self._data = self._raw_data

    def getData(self):
        return self._data

    def getRawData(self):
        return self._raw_data

    def getBaseAddress(self):
        return self._base_addr

    def getArchitecture(self):
        return self._architecture

    def getBitness(self):
        return self._bitness

    def getCodeAreas(self):
        return self._code_areas


# --- smda/utility/MachoFileLoader.py ---
import logging

LOGGER = logging.getLogger(__name__)

# LIEF is an optional dependency; without it MachO parsing is disabled entirely
LIEF_AVAILABLE = False
try:
    import lief
    lief.logging.disable()
    LIEF_AVAILABLE = True
except:
    LOGGER.warning("LIEF not available, will not be able to parse data from MachO files.")


def align(v, alignment):
    """Round v up to the next multiple of alignment."""
    remainder = v % alignment
    if remainder == 0:
        return v
def align(v, alignment):
    """Round v up to the next multiple of alignment.

    Restated in full here so the unit is self-contained.
    """
    remainder = v % alignment
    if remainder == 0:
        return v
    return v + (alignment - remainder)


class MachoFileLoader(object):
    """Parses MachO binaries via LIEF: base address, flat memory mapping, and basic metadata."""

    @staticmethod
    def isCompatible(data):
        """Return True if data starts with a (32 or 64 bit, little-endian) MachO magic and LIEF is usable."""
        if not LIEF_AVAILABLE:
            return False
        # check for MachO magic
        return data[:4] == b"\xCE\xFA\xED\xFE" or data[:4] == b"\xCF\xFA\xED\xFE"

    @staticmethod
    def getBaseAddress(binary):
        """Determine the base address as the minimum of imagebase and all section (va - offset) deltas."""
        macho_file = lief.parse(binary)
        base_addr = 0
        candidates = [0xFFFFFFFFFFFFFFFF, macho_file.imagebase]
        for section in macho_file.sections:
            if section.virtual_address:
                candidates.append(section.virtual_address - section.offset)
        if len(candidates) > 1:
            base_addr = min(candidates)
        return base_addr

    @staticmethod
    def mapBinary(binary):
        """
        map the MachO file sections and segments into a contiguous bytearray
        as if into virtual memory with the given base address.
        """
        macho_file = lief.parse(binary)
        base_addr = MachoFileLoader.getBaseAddress(binary)

        LOGGER.debug("MachO: base address: 0x%x", base_addr)

        # a segment may contain 0 or more sections.
        # ref: https://stackoverflow.com/a/14382477/87207
        #
        # i'm not sure if a section may be found outside of a segment.
        # therefore, lets load segments first, and then load sections over them.
        # we expect the section data to overwrite the segment data; however,
        # it should be exactly the same data.

        # find min and max virtual addresses.
        max_virtual_address = 0
        min_virtual_address = 0xFFFFFFFFFFFFFFFF
        min_raw_offset = 0xFFFFFFFFFFFFFFFF

        # find begin of the first section/segment and end of the last section/segment.
        for section in sorted(macho_file.sections, key=lambda section: section.size, reverse=True):
            if not section.virtual_address:
                continue
            max_virtual_address = max(max_virtual_address, section.size + section.virtual_address)
            min_virtual_address = min(min_virtual_address, section.virtual_address)
            min_raw_offset = min(min_raw_offset, section.offset)

        for segment in macho_file.segments:
            if not segment.virtual_address:
                continue
            max_virtual_address = max(max_virtual_address, segment.virtual_size + segment.virtual_address)
            min_virtual_address = min(min_virtual_address, segment.virtual_address)
            min_raw_offset = min(min_raw_offset, segment.file_offset)

        if not max_virtual_address:
            LOGGER.debug("MachO: no section or segment data")
            return bytes()

        # create mapped region.
        # offset 0x0 corresponds to the MachO base address
        virtual_size = max_virtual_address - base_addr
        LOGGER.debug("MachO: max virtual section offset: 0x%x", max_virtual_address)
        LOGGER.debug("MachO: mapped size: 0x%x", virtual_size)
        LOGGER.debug("MachO: min raw offset: 0x%x", min_raw_offset)
        mapped_binary = bytearray(align(virtual_size, 0x1000))

        # map segments first (they may contain sections), largest first
        # because some segments may overlap.
        # technically, we should only have to load PT_LOAD segments,
        # but we do all of them here.
        for segment in sorted(macho_file.segments, key=lambda segment: segment.file_size, reverse=True):
            if not segment.virtual_address:
                continue
            rva = segment.virtual_address - base_addr
            LOGGER.debug("MachO: mapping segment of 0x%04x bytes at 0x%08x-0x%08x (0x%08x)", segment.file_size, rva, rva + segment.file_size, segment.virtual_address)
            assert len(segment.content) == segment.file_size
            mapped_binary[rva:rva + segment.file_size] = segment.content

        # map sections.
        # may overwrite some segment data, but we expect the content to be identical.
        for section in sorted(macho_file.sections, key=lambda section: section.size, reverse=True):
            if not section.virtual_address:
                continue
            rva = section.virtual_address - base_addr
            LOGGER.debug("MachO: mapping section of 0x%04x bytes at 0x%08x-0x%08x (0x%08x)", section.size, rva, rva + section.size, section.virtual_address)
            # section may be empty or smaller, so we may not always copy data here
            if len(section.content) == section.size:
                mapped_binary[rva:rva + section.size] = section.content

        # map header.
        # we consider the headers to be any data found before the first section/segment
        if min_raw_offset != 0:
            LOGGER.debug("MachO: mapping 0x%x bytes of header at 0x0 (0x%x)", min_raw_offset, base_addr)
            mapped_binary[0:min_raw_offset] = binary[0:min_raw_offset]

        LOGGER.debug("MachO: final mapped size: 0x%x", len(mapped_binary))
        return bytes(mapped_binary)

    @staticmethod
    def getArchitecture(binary):
        """Return "intel" or "arm" based on the MachO CPU type; raises NotImplementedError otherwise."""
        # TODO add machine types whenever we add more architectures
        macho_file = lief.parse(binary)
        machine_type = macho_file.header.cpu_type
        if machine_type in [lief.MachO.Header.CPU_TYPE.X86_64, lief.MachO.Header.CPU_TYPE.X86]:
            return "intel"
        # BUGFIX: this branch used `machine_type == [ARM64, ARM]`, comparing an enum
        # value against a list — always False, so ARM binaries unconditionally fell
        # through to NotImplementedError. `in` mirrors the intel branch above.
        elif machine_type in [lief.MachO.Header.CPU_TYPE.ARM64, lief.MachO.Header.CPU_TYPE.ARM]:
            return "arm"
        raise NotImplementedError("SMDA does not support this architecture yet.")

    @staticmethod
    def getBitness(binary):
        """Return 64/32 for x86 CPU types; raises for ARM64; 0 for anything else."""
        # TODO add machine types whenever we add more architectures
        macho_file = lief.parse(binary)
        machine_type = macho_file.header.cpu_type
        if machine_type == lief.MachO.Header.CPU_TYPE.X86_64:
            return 64
        elif machine_type == lief.MachO.Header.CPU_TYPE.X86:
            return 32
        elif machine_type == lief.MachO.Header.CPU_TYPE.ARM64:
            raise NotImplementedError("SMDA does not support ARM yet.")
        return 0

    @staticmethod
    def mergeCodeAreas(code_areas):
        """Merge adjacent [start, end] areas (where one area's end equals the next one's start)."""
        merged_code_areas = sorted(code_areas)
        index = 0
        while index < len(merged_code_areas) - 1:
            this_area = merged_code_areas[index]
            next_area = merged_code_areas[index + 1]
            if this_area[1] != next_area[0]:
                # disjoint: advance (the dead `result` accumulator of the
                # original was removed; the merged list itself is returned)
                index += 1
            else:
                # adjacent: fuse the pair and re-examine from the same index
                merged_code_areas = merged_code_areas[:index] + [[this_area[0], next_area[1]]] + merged_code_areas[index + 2:]
        return merged_code_areas

    @staticmethod
    def getCodeAreas(binary):
        """Collect [start, end) ranges of executable sections, padded to section alignment and merged."""
        # TODO add machine types whenever we add more architectures
        macho_file = lief.parse(binary)
        ins_flags = (
            lief.MachO.Section.FLAGS.PURE_INSTRUCTIONS.value +
            lief.MachO.Section.FLAGS.SELF_MODIFYING_CODE.value +
            lief.MachO.Section.FLAGS.SOME_INSTRUCTIONS.value
        )
        code_areas = []
        for section in macho_file.sections:
            if section.flags.value & ins_flags:
                section_start = section.virtual_address
                section_size = section.size
                # pad to the section alignment so areas line up with the mapped image
                if section.alignment and section_size % section.alignment != 0:
                    section_size += section.alignment - (section_size % section.alignment)
                section_end = section_start + section_size
                code_areas.append([section_start, section_end])
        return MachoFileLoader.mergeCodeAreas(code_areas)
# --- smda/utility/MemoryFileLoader.py ---
from smda.utility.FileLoader import FileLoader


class MemoryFileLoader(FileLoader):
    """FileLoader variant that is fed from an in-memory buffer instead of a file path."""

    def __init__(self, buffer, load_file=True, map_file=False):
        # disable path-based loading in the parent, then load from the buffer directly
        super().__init__("", load_file=False, map_file=map_file)
        self._loadFile(buffer=buffer)


# --- smda/utility/PeFileLoader.py ---
import struct
import logging

import lief
lief.logging.disable()

LOG = logging.getLogger(__name__)


class PeFileLoader(object):
    """Parses PE headers with plain struct reads (lief only for architecture/code areas)."""

    # COFF machine id -> bitness
    BITNESS_MAP = {0x14c: 32, 0x8664: 64}

    @staticmethod
    def isCompatible(data):
        """Cheap check for the DOS 'MZ' magic."""
        return data[:2] == b"MZ"

    @staticmethod
    def mapBinary(binary):
        """Map the PE sections into a flat bytes object laid out by virtual offsets.

        This is a pretty rough implementation but does the job for now.
        Returns b"" when the file has no parseable sections or is larger than the cap.
        """
        mapped_binary = bytearray([])
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset:
            num_sections = 0
            bitness = 0
            section_infos = []
            # 0xF8 = PE signature + COFF header + PE32 optional header
            optional_header_size = 0xF8
            if pe_offset and len(binary) >= pe_offset + 0x8:
                num_sections = struct.unpack("H", binary[pe_offset + 0x6:pe_offset + 0x8])[0]
                bitness = PeFileLoader.getBitness(binary)
                if bitness == 64:
                    # PE32+ optional header is 0x10 bytes larger
                    optional_header_size = 0x108
            if pe_offset and num_sections and len(binary) >= pe_offset + optional_header_size + num_sections * 0x28:
                for section_index in range(num_sections):
                    section_offset = section_index * 0x28
                    # VirtualSize/VirtualAddress/SizeOfRawData/PointerToRawData of the section header
                    slice_start = pe_offset + optional_header_size + section_offset + 0x8
                    slice_end = pe_offset + optional_header_size + section_offset + 0x8 + 0x10
                    virt_size, virt_offset, raw_size, raw_offset = struct.unpack("IIII", binary[slice_start:slice_end])
                    section_info = {
                        "section_index": section_index,
                        "virt_size": virt_size,
                        "virt_offset": virt_offset,
                        "raw_size": raw_size,
                        "raw_offset": raw_offset,
                    }
                    section_infos.append(section_info)
            max_virt_section_offset = 0
            min_raw_section_offset = 0xFFFFFFFF
            if section_infos:
                for section_info in section_infos:
                    max_virt_section_offset = max(max_virt_section_offset, section_info["virt_size"] + section_info["virt_offset"])
                    max_virt_section_offset = max(max_virt_section_offset, section_info["raw_size"] + section_info["virt_offset"])
                    if section_info["raw_offset"] > 0x200:
                        min_raw_section_offset = min(min_raw_section_offset, section_info["raw_offset"])
                # NOTE(review): if no section has raw_offset > 0x200, min_raw_section_offset
                # stays 0xFFFFFFFF and the header copy below copies the whole file — confirm.
            # support up to 100MB for now.
            if max_virt_section_offset and max_virt_section_offset < 100 * 1024 * 1024:
                mapped_binary = bytearray([0] * max_virt_section_offset)
                # everything before the first "real" section is treated as header data
                mapped_binary[0:min_raw_section_offset] = binary[0:min_raw_section_offset]
                for section_info in section_infos:
                    mapped_from = section_info["virt_offset"]
                    mapped_to = section_info["virt_offset"] + section_info["raw_size"]
                    mapped_binary[mapped_from:mapped_to] = binary[section_info["raw_offset"]:section_info["raw_offset"] + section_info["raw_size"]]
                    LOG.debug("Mapping %d: raw 0x%x (0x%x bytes) -> virtual 0x%x (0x%x bytes)",
                              section_info["section_index"],
                              section_info["raw_offset"],
                              section_info["raw_size"],
                              section_info["virt_offset"],
                              section_info["virt_size"])
        LOG.debug("Mapped binary of size %d bytes (%d sections) to memory view of size %d bytes", len(binary), num_sections, len(mapped_binary))
        return bytes(mapped_binary)

    @staticmethod
    def getBitness(binary):
        """Return 32/64 from the COFF machine field, or 0 when unknown/unreadable."""
        bitness_id = 0
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset:
            if pe_offset and len(binary) >= pe_offset + 0x6:
                bitness_id = struct.unpack("H", binary[pe_offset + 0x4:pe_offset + 0x6])[0]
        return PeFileLoader.BITNESS_MAP.get(bitness_id, 0)

    @staticmethod
    def getBaseAddress(binary):
        """Return ImageBase from the optional header (PE32: +0x34 DWORD, PE32+: +0x30 QWORD), or 0."""
        base_addr = 0
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset and len(binary) >= pe_offset + 0x38:
            if PeFileLoader.getBitness(binary) == 32:
                base_addr = struct.unpack("I", binary[pe_offset + 0x34:pe_offset + 0x38])[0]
            elif PeFileLoader.getBitness(binary) == 64:
                base_addr = struct.unpack("Q", binary[pe_offset + 0x30:pe_offset + 0x38])[0]
        if base_addr:
            LOG.debug("Changing base address from 0 to: 0x%x for inference of reference counts (based on PE header)", base_addr)
        return base_addr

    @staticmethod
    def getPeOffset(binary):
        """Return e_lfanew (offset of the PE header) from the DOS header, or 0 if too short."""
        if len(binary) >= 0x40:
            # BUGFIX: e_lfanew at 0x3C is a 4-byte field per the PE/COFF spec; reading
            # it as a 2-byte "H" truncated offsets > 0xFFFF. The existing length guard
            # (>= 0x40) already covers all four bytes.
            pe_offset = struct.unpack("I", binary[0x3c:0x3c + 4])[0]
            return pe_offset
        return 0

    @staticmethod
    def getOEP(binary):
        """Return AddressOfEntryPoint (RVA) from the optional header, or 0."""
        oep_rva = 0
        if PeFileLoader.checkPe(binary):
            pe_offset = PeFileLoader.getPeOffset(binary)
            if pe_offset and len(binary) >= pe_offset + 0x2c:
                oep_rva = struct.unpack("I", binary[pe_offset + 0x28:pe_offset + 0x2C])[0]
        return oep_rva

    @staticmethod
    def getArchitecture(binary):
        """Return "cil" when a non-empty CLR runtime header directory exists, else "intel"."""
        architecture = "intel"
        pefile = lief.parse(binary)
        if pefile:
            for d in pefile.data_directories:
                if d.type == lief.PE.DataDirectory.TYPES.CLR_RUNTIME_HEADER:
                    if d.size > 0:
                        architecture = "cil"
        return architecture

    @staticmethod
    def checkPe(binary):
        """Return True if the file has a PE header with a known COFF machine id."""
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset and len(binary) >= pe_offset + 6:
            bitness = struct.unpack("H", binary[pe_offset + 4:pe_offset + 4 + 2])[0]
            return bitness in PeFileLoader.BITNESS_MAP
        return False

    @staticmethod
    def getCodeAreas(binary):
        """Collect absolute [start, end) ranges of MEM_EXECUTE sections, padded to 0x1000 and merged."""
        pefile = lief.parse(binary)
        code_areas = []
        base_address = PeFileLoader.getBaseAddress(binary)
        if pefile and pefile.sections:
            for section in pefile.sections:
                # MEM_EXECUTE
                if section.characteristics & 0x20000000:
                    section_start = base_address + section.virtual_address
                    section_size = section.virtual_size
                    if section_size % 0x1000 != 0:
                        section_size += 0x1000 - (section_size % 0x1000)
                    section_end = section_start + section_size
                    code_areas.append([section_start, section_end])
        return PeFileLoader.mergeCodeAreas(code_areas)

    @staticmethod
    def mergeCodeAreas(code_areas):
        """Merge adjacent [start, end] areas (end of one equals start of the next)."""
        merged_code_areas = sorted(code_areas)
        index = 0
        while index < len(merged_code_areas) - 1:
            this_area = merged_code_areas[index]
            next_area = merged_code_areas[index + 1]
            if this_area[1] != next_area[0]:
                # disjoint: advance (dead `result` accumulator of the original removed)
                index += 1
            else:
                merged_code_areas = merged_code_areas[:index] + [[this_area[0], next_area[1]]] + merged_code_areas[index + 2:]
        return merged_code_areas


# --- smda/utility/PriorityQueue.py ---
import heapq

class PriorityQueue(object):
    """Max-priority queue built on CPython's private max-heap helpers.

    NOTE(review): heapq._heapify_max / _siftup_max / _siftdown_max are private
    CPython functions without a stability guarantee — confirm availability on
    target Python versions.
    """
    def __init__(self, content=None):
        if content is None:
            content = []
        self.heap = content
        if self.heap:
            self.update()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Pop and return the maximum element; raises StopIteration when empty."""
        if not self.heap:
            raise StopIteration
        if len(self.heap) == 1:
            return self.heap.pop()
        # classic heap pop: move the last leaf to the root, then sift it down
        last_item = self.heap.pop()
        result = self.heap[0]
        self.heap[0] = last_item
        heapq._siftup_max(self.heap, 0)
        return result

    def add(self, element):
        """Insert an element, restoring the max-heap invariant."""
        self.heap.append(element)
        heapq._siftdown_max(self.heap, 0, len(self.heap)-1)

    def update(self, target_candidate=None):
        """Re-heapify the whole queue; the target_candidate argument is currently unused here."""
        if target_candidate is None:
            heapq._heapify_max(self.heap)

    def __str__(self):
        return str(self.heap)
14 | """ 15 | 16 | rva = va - smda_report.base_addr 17 | if smda_report.buffer is None: 18 | raise ValueError("buffer is empty") 19 | buffer_end = len(smda_report.buffer) 20 | max_bytes = num_bytes if num_bytes is not None else 0x100 21 | if rva + max_bytes > buffer_end: 22 | return smda_report.buffer[rva:] 23 | else: 24 | return smda_report.buffer[rva : rva + max_bytes] 25 | 26 | 27 | def derefs(smda_report, p): 28 | """ 29 | recursively follow the given pointer, yielding the valid memory addresses along the way. 30 | useful when you may have a pointer to string, or pointer to pointer to string, etc. 31 | 32 | this is a "do what i mean" type of helper function. 33 | 34 | based on the implementation in viv/insn.py 35 | """ 36 | depth = 0 37 | while True: 38 | if not smda_report.isAddrWithinMemoryImage(p): 39 | return 40 | yield p 41 | 42 | bytes_ = read_bytes(smda_report, p, num_bytes=4) 43 | val = struct.unpack("I", bytes_)[0] 44 | 45 | # sanity: pointer points to self 46 | if val == p: 47 | return 48 | 49 | # sanity: avoid chains of pointers that are unreasonably deep 50 | depth += 1 51 | if depth > 10: 52 | return 53 | 54 | p = val 55 | 56 | 57 | def detect_ascii_len(smda_report, offset): 58 | if smda_report.buffer is None: 59 | return 0 60 | ascii_len = 0 61 | rva = offset - smda_report.base_addr 62 | char = smda_report.buffer[rva] 63 | while char < 127 and chr(char) in string.printable: 64 | ascii_len += 1 65 | rva += 1 66 | char = smda_report.buffer[rva] 67 | if char == 0: 68 | return ascii_len 69 | return 0 70 | 71 | 72 | def detect_unicode_len(smda_report, offset): 73 | if smda_report.buffer is None: 74 | return 0 75 | unicode_len = 0 76 | rva = offset - smda_report.base_addr 77 | char = smda_report.buffer[rva] 78 | second_char = smda_report.buffer[rva + 1] 79 | while char < 127 and chr(char) in string.printable and second_char == 0: 80 | unicode_len += 2 81 | rva += 2 82 | char = smda_report.buffer[rva] 83 | second_char = smda_report.buffer[rva + 1] 84 | 
if char == 0 and second_char == 0: 85 | return unicode_len 86 | return 0 87 | 88 | 89 | def read_string(smda_report, offset): 90 | alen = detect_ascii_len(smda_report, offset) 91 | if alen > 1: 92 | return read_bytes(smda_report, offset, alen).decode("utf-8") 93 | ulen = detect_unicode_len(smda_report, offset) 94 | if ulen > 2: 95 | return read_bytes(smda_report, offset, ulen).decode("utf-16") 96 | 97 | 98 | def extract_strings(f: SmdaFunction) -> Iterator[Tuple[str, int]]: 99 | """parse string features from the given instruction.""" 100 | for insn in f.getInstructions(): 101 | for data_ref in insn.getDataRefs(): 102 | for v in derefs(f.smda_report, data_ref): 103 | string_read = read_string(f.smda_report, v) 104 | if string_read: 105 | yield string_read.rstrip("\x00"), insn.offset 106 | -------------------------------------------------------------------------------- /smda/utility/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/__init__.py -------------------------------------------------------------------------------- /tests/asprox_0x008D0000_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/asprox_0x008D0000_xored -------------------------------------------------------------------------------- /tests/bashlite_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/bashlite_xored 
-------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | 7 | import smda 8 | from smda.SmdaConfig import SmdaConfig 9 | config = SmdaConfig() 10 | config.API_COLLECTION_FILES = {"winxp": config.PROJECT_ROOT + os.sep + "data" + os.sep + "apiscout_winxp_prof_sp3.json"} 11 | -------------------------------------------------------------------------------- /tests/cutwail_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/cutwail_xored -------------------------------------------------------------------------------- /tests/komplex_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/komplex_xored -------------------------------------------------------------------------------- /tests/njrat_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/njrat_xored -------------------------------------------------------------------------------- /tests/testEscaper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import unittest 6 | 7 | from smda.common.SmdaReport import SmdaReport 8 | from smda.common.SmdaFunction import SmdaFunction 9 | from smda.common.SmdaInstruction import SmdaInstruction 10 | from smda.intel.IntelInstructionEscaper import IntelInstructionEscaper 11 | from smda.cil.CilInstructionEscaper import 
CilInstructionEscaper 12 | 13 | from .context import config 14 | 15 | LOG = logging.getLogger(__name__) 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 17 | logging.disable(logging.CRITICAL) 18 | 19 | 20 | class DisassemblyTestSuite(unittest.TestCase): 21 | """Run a full example on a memory dump""" 22 | 23 | def testInstructionEscaping(self): 24 | test_data = [ 25 | {"ins": (0, "55", "push", "ebp"), "mnemonic_group": "S", "escaped_operands": "REG"}, 26 | {"ins": (1, "8365fc00", "and", "dword ptr [ebp - 4], 0"), "mnemonic_group": "A", "escaped_operands": "PTR, CONST"}, 27 | {"ins": (2, "f30f1efa", "endbr64", ""), "mnemonic_group": "C", "escaped_operands": ""}, 28 | {"ins": (3, "c58e5ad3", "vcvtss2sd", "xmm2, xmm14, xmm3"), "mnemonic_group": "X", "escaped_operands": "XREG, XREG, XREG"}, 29 | ] 30 | for data in test_data: 31 | smda_ins = SmdaInstruction(data["ins"]) 32 | self.assertEqual(smda_ins.getMnemonicGroup(IntelInstructionEscaper), data["mnemonic_group"]) 33 | self.assertEqual(smda_ins.getEscapedOperands(IntelInstructionEscaper), data["escaped_operands"]) 34 | 35 | def testIntelInstructionWildcarding(self): 36 | test_data = [ 37 | # simple mov with IMM outside of address space 38 | {"ins": (0, "b803400080", "mov", "eax, 0x80004003"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "b803400080", "bitness": 32, "expected_opc": "b8????????"}, 39 | # simple mov with IMM within address space 40 | {"ins": (0, "ba2c893863", "mov", "edx, 0x6338892c"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "ba????????", "bitness": 32, "expected_opc": "ba????????"}, 41 | # mov with with address calc within address space 42 | {"ins": (0, "0fb681808f3b63", "mov", "eax, byte ptr [ecx + 0x633b8f80]"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "0fb681????????", "bitness": 32, "expected_opc": "0fb6??????????"}, 43 | # jump table calculation 44 | {"ins": (0, "ff2485788f3b63", "jmp", "dword ptr [eax*4 + 
0x633b8f78]"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "ff2485????????", "bitness": 32, "expected_opc": "ff????????????"}, 45 | # should only wildcard last part as escaper doesn't know address space 46 | {"ins": (0, "c705ac974a00ac974a00", "mov", "dword ptr [0x4a97ac], 0x4a97ac"), "lower": None, "upper": None, "expected_bin": "c705ac974a00????????", "bitness": 32, "expected_opc": "c7??????????????????"}, 47 | # should escape both operands 48 | {"ins": (0, "c705ac974a00ac974a00", "mov", "dword ptr [0x4a97ac], 0x4a97ac"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "c705????????????????", "bitness": 32, "expected_opc": "c7??????????????????"}, 49 | # should escape from the right side and only blank out one, despite finding two matches for the pattern 50 | {"ins": (0, "010505050505", "add", "dword ptr [0x5050505], eax"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "0105????????", "bitness": 32, "expected_opc": "01??????????"}, 51 | # should escape from the right side and only blank out one, despite finding two matches for the pattern 52 | {"ins": (0, "0f101515151515", "movups", "xmm2, xmmword ptr [0x15151515]"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "0f1015????????", "bitness": 32, "expected_opc": "0f10??????????"}, 53 | # should ignore prefixes while wildcarding 54 | {"ins": (0, "666666660f008000224000", "sldt", "word ptr [rax + 0x402200]"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "666666660f0080????????", "bitness": 32, "expected_opc": "666666660f00??????????"}, 55 | # should ignore prefixes and REX while wildcarding 56 | {"ins": (0, "66666666480f008000224000", "sldt", "word ptr [rax + 0x402200]"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "66666666480f0080????????", "bitness": 64, "expected_opc": "66666666480f00??????????"}, 57 | ] 58 | for data in test_data: 59 | smda_report = SmdaReport() 60 | smda_report.bitness = data["bitness"] 61 | smda_function = 
SmdaFunction(smda_report=smda_report) 62 | smda_ins = SmdaInstruction(data["ins"], smda_function=smda_function) 63 | self.assertEqual(smda_ins.getEscapedBinary(IntelInstructionEscaper, lower_addr=data["lower"], upper_addr=data["upper"]), data["expected_bin"]) 64 | self.assertEqual(smda_ins.getEscapedToOpcodeOnly(IntelInstructionEscaper), data["expected_opc"]) 65 | 66 | def testCilInstructionWildcarding(self): 67 | test_data = [ 68 | # call MemberRef 69 | {"ins": (0, "280a000006", "call", "SomeFunc"), "expected_bin": "28??????06", "expected_bin_intraprocedural": "28??????06", "expected_opc": "28????????", "bitness": 32}, 70 | {"ins": (0, "6fbb00000a", "callvirt", "SomeFunc"), "expected_bin": "6f??????0a", "expected_bin_intraprocedural": "6f??????0a", "expected_opc": "6f????????", "bitness": 32}, 71 | {"ins": (0, "2d3a", "brtrue.s", "0x5994"), "expected_bin": "2d3a", "expected_bin_intraprocedural": "2d??", "expected_opc": "2d??", "bitness": 32}, 72 | {"ins": (0, "450300000002000000060000000a000000", "switch", "[(0D50), (0D54), (0D58)]"), "expected_bin": "450300000002000000060000000a000000", "expected_bin_intraprocedural": "45????????????????????????????????", "expected_opc": "45????????????????????????????????", "bitness": 32}, 73 | {"ins": (0, "20c48efb0e", "ldc.i4", "0xefb8ec4"), "expected_bin": "20c48efb0e", "expected_bin_intraprocedural": "20c48efb0e", "expected_opc": "20????????", "bitness": 32}, 74 | ] 75 | for data in test_data: 76 | smda_report = SmdaReport() 77 | smda_report.bitness = data["bitness"] 78 | smda_function = SmdaFunction(smda_report=smda_report) 79 | smda_ins = SmdaInstruction(data["ins"], smda_function=smda_function) 80 | self.assertEqual(CilInstructionEscaper.escapeToOpcodeOnly(smda_ins), data["expected_opc"]) 81 | self.assertEqual(CilInstructionEscaper.escapeBinary(smda_ins), data["expected_bin"]) 82 | self.assertEqual(CilInstructionEscaper.escapeBinary(smda_ins, escape_intraprocedural_jumps=True), data["expected_bin_intraprocedural"]) 83 | 
84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/testFileFormatParsers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import lief 6 | import unittest 7 | 8 | from smda.utility.FileLoader import FileLoader 9 | from smda.common.BinaryInfo import BinaryInfo 10 | from smda.Disassembler import Disassembler 11 | from smda.common.SmdaReport import SmdaReport 12 | from smda.common.SmdaFunction import SmdaFunction 13 | from .context import config 14 | 15 | LOG = logging.getLogger(__name__) 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 17 | logging.disable(logging.CRITICAL) 18 | 19 | 20 | class SmdaIntegrationTestSuite(unittest.TestCase): 21 | """Run a full example on a memory dump""" 22 | 23 | @classmethod 24 | def setUpClass(cls): 25 | super(SmdaIntegrationTestSuite, cls).setUpClass() 26 | 27 | def testPeParsingWithCutwail(self): 28 | disasm = Disassembler(config, backend="intel") 29 | # load encrypted malicious win.cutwail 30 | with open(os.path.join(config.PROJECT_ROOT, "tests", "cutwail_xored"), "rb") as f_binary: 31 | binary = f_binary.read() 32 | decrypted_cutwail = bytearray() 33 | for index, byte in enumerate(binary): 34 | if isinstance(byte, str): 35 | byte = ord(byte) 36 | decrypted_cutwail.append(byte ^ (index % 256)) 37 | cutwail_binary = bytes(decrypted_cutwail) 38 | # run FileLoader and disassemble as file 39 | loader = FileLoader("/", map_file=True) 40 | loader._loadFile(cutwail_binary) 41 | file_content = loader.getData() 42 | binary_info = BinaryInfo(file_content) 43 | binary_info.raw_data = loader.getRawData() 44 | binary_info.file_path = "" 45 | binary_info.base_addr = loader.getBaseAddress() 46 | binary_info.bitness = loader.getBitness() 47 | binary_info.code_areas = loader.getCodeAreas() 48 | binary_info.oep = 
binary_info.getOep() 49 | cutwail_binary_info = binary_info 50 | # parse bytes of 0x400 truncated PE header 51 | pe_header = lief.parse(binary_info.getHeaderBytes()) 52 | assert pe_header.dos_header.magic == 0x5A4D 53 | assert pe_header.header.machine == 0x14C 54 | cutwail_disassembly = disasm._disassemble(binary_info) 55 | cutwail_unmapped_disassembly = disasm.disassembleUnmappedBuffer(cutwail_binary) 56 | assert cutwail_unmapped_disassembly.num_functions == 33 57 | # TODO test label extraction for PE, add another binary for testing 58 | 59 | def testElfParsingWithBashlite(self): 60 | disasm = Disassembler(config, backend="intel") 61 | # load encrypted benign /bin/cat 62 | with open(os.path.join(config.PROJECT_ROOT, "tests", "bashlite_xored"), "rb") as f_binary: 63 | binary = f_binary.read() 64 | decrypted_bashlite = bytearray() 65 | for index, byte in enumerate(binary): 66 | if isinstance(byte, str): 67 | byte = ord(byte) 68 | decrypted_bashlite.append(byte ^ (index % 256)) 69 | bashlite_binary = bytes(decrypted_bashlite) 70 | # run FileLoader and disassemble as file 71 | loader = FileLoader("/", map_file=True) 72 | loader._loadFile(bashlite_binary) 73 | file_content = loader.getData() 74 | binary_info = BinaryInfo(file_content) 75 | binary_info.raw_data = loader.getRawData() 76 | binary_info.file_path = "" 77 | binary_info.base_addr = loader.getBaseAddress() 78 | binary_info.bitness = loader.getBitness() 79 | binary_info.code_areas = loader.getCodeAreas() 80 | binary_info.oep = binary_info.getOep() 81 | bashlite_binary_info = binary_info 82 | bashlite_disassembly = disasm._disassemble(binary_info) 83 | bashlite_unmapped_disassembly = disasm.disassembleUnmappedBuffer(bashlite_binary) 84 | assert bashlite_unmapped_disassembly.num_functions == 177 85 | assert len([f.function_name for f in bashlite_unmapped_disassembly.getFunctions() if f.function_name]) == 174 86 | 87 | def testDotnetParsingWithNjRAT(self): 88 | disasm = Disassembler(config, backend="cil") 89 | # 
load encrypted malicious win.cutwail 90 | with open(os.path.join(config.PROJECT_ROOT, "tests", "njrat_xored"), "rb") as f_binary: 91 | binary = f_binary.read() 92 | decrypted_njrat = bytearray() 93 | for index, byte in enumerate(binary): 94 | if isinstance(byte, str): 95 | byte = ord(byte) 96 | decrypted_njrat.append(byte ^ (index % 256)) 97 | njrat_binary = bytes(decrypted_njrat) 98 | # run FileLoader and disassemble as file 99 | njrat_unmapped_disassembly = disasm.disassembleUnmappedBuffer(njrat_binary) 100 | assert njrat_unmapped_disassembly.num_functions == 64 101 | assert len([f.function_name for f in njrat_unmapped_disassembly.getFunctions() if f.function_name]) == 64 102 | 103 | def testMacOsParsingWithKomplex(self): 104 | disasm = Disassembler(config, backend="intel") 105 | # load encrypted malicious osx.komplex 106 | with open(os.path.join(config.PROJECT_ROOT, "tests", "komplex_xored"), "rb") as f_binary: 107 | binary = f_binary.read() 108 | decrypted_komplex = bytearray() 109 | for index, byte in enumerate(binary): 110 | if isinstance(byte, str): 111 | byte = ord(byte) 112 | decrypted_komplex.append(byte ^ (index % 256)) 113 | komplex_binary = bytes(decrypted_komplex) 114 | # run FileLoader and disassemble as file 115 | loader = FileLoader("/", map_file=True) 116 | loader._loadFile(komplex_binary) 117 | file_content = loader.getData() 118 | binary_info = BinaryInfo(file_content) 119 | binary_info.raw_data = loader.getRawData() 120 | binary_info.file_path = "" 121 | binary_info.base_addr = loader.getBaseAddress() 122 | binary_info.bitness = loader.getBitness() 123 | binary_info.code_areas = loader.getCodeAreas() 124 | binary_info.oep = binary_info.getOep() 125 | komplex_binary_info = binary_info 126 | komplex_disassembly = disasm._disassemble(binary_info) 127 | komplex_unmapped_disassembly = disasm.disassembleUnmappedBuffer(komplex_binary) 128 | komplex_unmapped_disassembly.num_functions == 208 129 | 130 | 131 | if __name__ == '__main__': 132 | 
unittest.main() 133 | -------------------------------------------------------------------------------- /tests/testIntegration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import unittest 6 | 7 | from smda.utility.FileLoader import FileLoader 8 | from smda.common.BinaryInfo import BinaryInfo 9 | from smda.Disassembler import Disassembler 10 | from smda.common.SmdaReport import SmdaReport 11 | from smda.common.SmdaFunction import SmdaFunction 12 | from .context import config 13 | 14 | LOG = logging.getLogger(__name__) 15 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 16 | logging.disable(logging.CRITICAL) 17 | 18 | 19 | class SmdaIntegrationTestSuite(unittest.TestCase): 20 | """Run a full example on a memory dump""" 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | super(SmdaIntegrationTestSuite, cls).setUpClass() 25 | config.WITH_STRINGS = True 26 | disasm = Disassembler(config) 27 | # load encrypted Asprox 28 | with open(os.path.join(config.PROJECT_ROOT, "tests", "asprox_0x008D0000_xored"), "rb") as f_binary: 29 | binary = f_binary.read() 30 | decrypted_asprox = bytearray() 31 | for index, byte in enumerate(binary): 32 | if isinstance(byte, str): 33 | byte = ord(byte) 34 | decrypted_asprox.append(byte ^ (index % 256)) 35 | cls.asprox_binary = decrypted_asprox 36 | cls.asprox_disassembly = disasm.disassembleBuffer(bytes(decrypted_asprox), 0x8D0000) 37 | # load encrypted Cutwail 38 | with open(os.path.join(config.PROJECT_ROOT, "tests", "cutwail_xored"), "rb") as f_binary: 39 | binary = f_binary.read() 40 | decrypted_cutwail = bytearray() 41 | for index, byte in enumerate(binary): 42 | if isinstance(byte, str): 43 | byte = ord(byte) 44 | decrypted_cutwail.append(byte ^ (index % 256)) 45 | cls.cutwail_binary = bytes(decrypted_cutwail) 46 | # run FileLoader and disassemble as file 47 | loader = FileLoader("/", map_file=True) 48 | 
loader._loadFile(cls.cutwail_binary) 49 | file_content = loader.getData() 50 | binary_info = BinaryInfo(file_content) 51 | binary_info.raw_data = loader.getRawData() 52 | binary_info.file_path = "" 53 | binary_info.base_addr = loader.getBaseAddress() 54 | binary_info.bitness = loader.getBitness() 55 | binary_info.code_areas = loader.getCodeAreas() 56 | binary_info.oep = binary_info.getOep() 57 | cls.cutwail_binary_info = binary_info 58 | cls.cutwail_disassembly = disasm._disassemble(binary_info) 59 | cls.cutwail_unmapped_disassembly = disasm.disassembleUnmappedBuffer(cls.cutwail_binary) 60 | 61 | def testAsproxDisassemblyCoverage(self): 62 | assert len([fn for fn in self.asprox_disassembly.getFunctions()]) == 105 63 | 64 | def testOep(self): 65 | # PE header from buffers are not parsed, so we don't get header infos 66 | assert self.asprox_disassembly.oep == None 67 | # PE headers are parsed for regularly processed files (PE+ELF) 68 | assert self.cutwail_unmapped_disassembly.oep == 0x1730 69 | 70 | def testCodeXrefCreation(self): 71 | example_function = self.asprox_disassembly.getFunction(0x008d8292) 72 | # should be initialized on demand only 73 | assert example_function.code_inrefs == None 74 | # example function has inrefs and outrefs 75 | inrefs = [code_inref for code_inref in example_function.getCodeInrefs()] 76 | assert len(inrefs) == 1 77 | for xref in example_function.getCodeInrefs(): 78 | print(xref.from_function, xref.from_instruction, xref.to_function, xref.to_instruction) 79 | outrefs = [code_outref for code_outref in example_function.getCodeOutrefs()] 80 | assert len(outrefs) == 10 81 | 82 | def testAsproxStringRefs(self): 83 | function_with_strings = self.asprox_disassembly.getFunction(0x008d2850) 84 | assert function_with_strings.stringrefs[9251000] == "Software" 85 | marshalled = function_with_strings.toDict() 86 | unmarshalled = SmdaFunction.fromDict(marshalled) 87 | assert unmarshalled.stringrefs[9251000] == "Software" 88 | 89 | def 
testAsproxApiCoverage(self): 90 | num_api_ref_srcs = 0 91 | api_ref_dsts = set() 92 | for fn in self.asprox_disassembly.getFunctions(): 93 | num_api_ref_srcs += len(fn.apirefs) 94 | api_ref_dsts.update(fn.apirefs.values()) 95 | assert num_api_ref_srcs == 546 96 | assert len(api_ref_dsts) == 95 97 | 98 | def testAsproxMarshalling(self): 99 | report_as_dict = self.asprox_disassembly.toDict() 100 | assert report_as_dict["status"] == "ok" 101 | assert report_as_dict["base_addr"] == 0x8D0000 102 | assert report_as_dict["statistics"]["num_instructions"] == 15706 103 | assert report_as_dict["sha256"] == "db8a133fed1b706608a4492079b702ded6b70369a980d2b5ae355a6adc78ef00" 104 | reimported_report = SmdaReport.fromDict(report_as_dict) 105 | 106 | def testCutwailMarshalling(self): 107 | report_as_dict = self.cutwail_disassembly.toDict() 108 | assert report_as_dict["status"] == "ok" 109 | assert report_as_dict["base_addr"] == 0x4000000 110 | assert report_as_dict["statistics"]["num_instructions"] == 1611 111 | assert report_as_dict["sha256"] == "a348a0ddfab135d152b684d561a3215ab6c472570facd3d75aa2c7ee845a8e2b" 112 | # compare our manual file loading with unmapped buffer 113 | assert self.cutwail_disassembly.num_instructions == self.cutwail_unmapped_disassembly.num_instructions 114 | reimported_report = SmdaReport.fromDict(report_as_dict) 115 | 116 | def testBlockLocator(self): 117 | # test with a function start 118 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x008d8292) 119 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x008d8292) 120 | assert found_function.offset == 0x008d8292 121 | assert found_block.offset == 0x008d8292 122 | # test with an instruction in a block a bit deeper in the function 123 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x008d82a6) 124 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x008d82a6) 125 | assert found_function.offset == 0x008d8292 126 | assert 
found_block.offset == 0x008d82a4 127 | # test with an offset that is not start of an instruction 128 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x008d82a7) 129 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x008d82a7) 130 | assert found_function.offset == 0x008d8292 131 | assert found_block.offset == 0x008d82a4 132 | # test with offsets beyond image base and binary size 133 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x100) 134 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x100) 135 | assert found_function is None 136 | assert found_block is None 137 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0xFFFFFF00) 138 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0xFFFFFF00) 139 | assert found_function is None 140 | assert found_block is None 141 | 142 | 143 | if __name__ == '__main__': 144 | unittest.main() 145 | -------------------------------------------------------------------------------- /tests/testTarjan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import unittest 6 | 7 | from smda.common.Tarjan import Tarjan 8 | 9 | from .context import config 10 | 11 | LOG = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 13 | logging.disable(logging.CRITICAL) 14 | 15 | 16 | class TarjanTestSuite(unittest.TestCase): 17 | """Provoke recursion""" 18 | 19 | def testInstructionEscaping(self): 20 | test_data = {i: [] for i in range(1000)} 21 | for i in range(1, 1000): 22 | for j in range(i + 1, 1000, 1): 23 | test_data[i].append(j) 24 | test_data[1000] = [] 25 | 26 | tarjan = Tarjan(test_data) 27 | tarjan.calculateScc() 28 | sccs = tarjan.getResult() 29 | self.assertEqual(1001, len(sccs)) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | 
-------------------------------------------------------------------------------- /version_history.md: -------------------------------------------------------------------------------- 1 | # Full Version History 2 | 3 | * 2022-11-18: v1.9.16- Fixed a bug where handling of inrefs in SmdaReport could lead to crashes (THX to @1337-42!). 4 | * 2022-09-27: v1.9.15- Fixed a bug where recognition of code areas would not incorporate virtual addressing (infinite loops while Delphi VMT parsing). 5 | * 2022-09-20: v1.9.13- Fixed a bug for listing unreachable basic block refs pointing outside of function boundaries (exception handling). 6 | * 2022-09-19: v1.9.12- Fixed a logic binding bug in IntelInstructionEscaper (THX to @1337-42!). 7 | * 2022-09-08: v1.9.11- Exposed masking of intraprocedural jmps/calls in SmdaInstruction. 8 | * 2022-08-31: v1.9.9 - Better handling of colliding code due to tailjumps. 9 | * 2022-08-30: v1.9.8 - Improved accuracy for references around tailcalls. 10 | * 2022-08-25: v1.9.6 - Fixed bug in delphi knowledge base handling and improved performance. 11 | * 2022-08-23: v1.9.4 - Fixed bug in section padding for ELF files. 12 | * 2022-08-22: v1.9.3 - Added parsing for Delphi knowledge base files (THX to @danielenders1!). 13 | * 2022-08-22: v1.9.2 - Improved structural parsing of Delphi binaries (THX to @danielenders1!). 14 | * 2022-08-22: v1.9.3 - Added parsing for Delphi knowledge base files (THX to @danielenders1!). 15 | * 2022-08-22: v1.9.2 - Improved structural parsing of Delphi binaries (THX to @danielenders1!). 16 | * 2022-08-12: v1.9.1 - Added support for parsing intel MachO files, including Go parsing. 17 | * 2022-08-10: v1.8.5 - Fixed Go 64bit label parsing for v1.12 binaries. 18 | * 2022-08-04: v1.8.4 - Dot export now uses hex formatted addresses in node names. 19 | * 2022-08-03: v1.8.3 - Added support for producing a Dot export for SmdaFunction. 20 | * 2022-08-01: v1.8.1 - Added support for parsing 32bit Go binaries as well.
21 | * 2022-08-01: v1.8.0 - Added support for parsing Go function information (THX to @danielenders1!). 22 | * 2022-07-22: v1.7.4 - Bugfix for marshalling of reports. 23 | * 2022-07-08: v1.7.2 - Excluded overly aggressive tailcall recognition heuristics when processing Golang binaries. 24 | * 2022-01-27: v1.7.0 - SmdaReports now contains a field `oep`; SmdaFunctions now indicate `is_exported` and can provide CodeXrefs via `getCodeInrefs()` and `getCodeOutrefs()`. (THX for the ideas: @mr-tz) 25 | * 2021-08-20: v1.6.1 - Bugfix for alignment calculation of binary mappings. (THX: @williballenthin) 26 | * 2021-08-19: v1.6.0 - Bugfix for truncation during ELF segment/section loading. API usage in ELF files is now resolved as well! (THX: @williballenthin) 27 | * 2021-07-22: v1.5.19 - Now also parsing plt.sec structures to identify functions. 28 | * 2021-06-07: v1.5.18 - Bugfix for struct.pack 8byte conversion using L instead of Q (works on Linux, not on Windows). 29 | * 2021-05-21: v1.5.17 - Bugfix for MemoryError when having LIEF try to process section data. 30 | * 2021-05-20: v1.5.16 - Bugfix for formatting exceptions in report output (THX: @BonusPlay) 31 | * 2021-05-18: v1.5.15 - Changed SHA256 in SmdaReports for unmapped files (was hash of memory-mapped image, now it's the input file's hash). 32 | * 2021-04-07: v1.5.14 - Bugfix when processing Exception handler addresses as function entry point candidates (THX: capa team). 33 | * 2021-01-20: v1.5.13 - Now using LIEF 0.11 and moved some print output to logging. 34 | * 2021-01-15: v1.5.11 - Disassembler now offers `disassembleUnmappedBuffer(buffer)` to load and process unmapped files directly from memory. 35 | * 2020-12-11: v1.5.10 - Pinned LIEF to 0.10.1. 36 | * 2020-12-01: v1.5.9 - Bugfix for section names. again. :) 37 | * 2020-11-25: v1.5.6 - Now considering segments for content when ELF file has no sections (THX: @jcrussell). 38 | * 2020-11-10: v1.5.5 - Unmarshalling setting default value for older reports.
39 | * 2020-11-06: v1.5.4 - Minor fix on PE header parsing. 40 | * 2020-11-05: v1.5.3 - Adjusted API thunk identification. 41 | * 2020-10-30: v1.5.2 - One bugfix, also removed one print and reduced logging priority for the message in case the PDB parser module is missing. 42 | * 2020-10-30: v1.5.1 - PE section table now contained in SmdaReport and added `SmdaReport.getSection(offset)`. 43 | * 2020-10-30: v1.4.12 - Bugfix in IndirectCallHandler (THX: @jcrussell). 44 | * 2020-10-29: v1.4.11 - Populate exception handlers specified in PE64 `.pdata` section as FEPs. 45 | * 2020-10-29: v1.4.10 - Resolves 64bit API calls of style `call qword ptr [rip + offset]` and more register-based API calls in general (THX: @jcrussell). 46 | * 2020-10-29: v1.4.8 - Bugfixes. Verbose mode added (THX: @jcrussell). 47 | * 2020-10-28: v1.4.6 - WinApiResolver now tries to resolve imports by ordinal to their names if they are known - can be extended in the database of OrdinalHelper. 48 | * 2020-10-28: v1.4.5 - Store the (mapped) buffer that was used to do disassembly along inside a SmdaReport - goal: enable to read strings/bytes at offsets at a later time. 49 | * 2020-10-27: v1.4.4 - SmdaInstructions can now provide potential data references via `SmdaInstruction.getDataRefs()`. 50 | * 2020-10-27: v1.4.3 - SmdaInstructions can now on demand provide the detailed capstone instruction representation via `SmdaInstruction.getDetailed()`. 51 | * 2020-10-27: v1.4.1 - 10-20% gain in processing speed by switching to `capstone.disasm_lite()`. 52 | * 2020-10-26: v1.4.0 - Adding SmdaBasicBlock. Some convenience code to ease integration with capa. (GeekWeek edition!) 53 | * 2020-09-07: v1.3.11 - Summarizable DisassemblyStatistics. 54 | * 2020-09-02: v1.3.10 - Fixed a bug where IDA Pro would crash when failing to demangle a function name while exporting a SMDA report. 55 | * 2020-08-31: v1.3.9 - Adjusted Logging to avoid interference with other loggers configured outside of SMDA (THX: @BonusPlay). 
56 | * 2020-08-25: v1.3.6 - PicHash no longer stored as list. 57 | * 2020-08-17: v1.3.5 - Bugfix for import parsing (ELF files). 58 | * 2020-08-17: v1.3.4 - Recalculate PIC hash and nesting depth for older (v1.2.x) reports on import for compatibility. 59 | * 2020-08-17: v1.3.3 - Added binary variation of `push ebp;mov ebp, esp` to list of default prologues and added exception handling for DominatorTrees (THX: @fxb). 60 | * 2020-07-13: v1.3.2 - Use LIEF to parse Import Table for WinAPI usage data when processing unmapped files. 61 | * 2020-07-13: v1.3.1 - Fixed `setup.py` to properly specify dependencies (THX: @BonusPlay). 62 | * 2020-06-22: v1.3.0 - Added DominatorTree (Implementation by Armin Rigo) to calculate function nesting depth, shortened PIC hash to 8 bytes, added some missing instructions for the InstructionEscaper, IdaInterface now demangles names. 63 | * 2020-05-28: v1.2.15 - Bugfixes in IntelInstructionEscaper (handling of negative RIP-relative offsets), SmdaReport (datetime handling), PeFileParser (handling of empty pefile.sections); SCC calculation changed to iterative algorithm (using @bwesterb's implementation) and activated by default again. 64 | * 2020-05-14: v1.2.10 - Bug in IdaInterface fixed. 65 | * 2020-05-13: v1.2.9 - Bugfix in code gap identification in FunctionCandidateManager, SCC calculation is now optional. 66 | * 2020-05-12: v1.2.7 - Added additional default metadata field "component" to SmdaReport. 67 | * 2020-05-11: v1.2.6 - Export from IDA to SMDA data format is now supported (IDA 7.4). 68 | * 2020-05-09: v1.2.5 - Fixed off-by-one that affected wildcarding of instructions (THX to Viviane Zwanger). 69 | * 2020-05-04: v1.2.4 - Various minor fixes. 70 | * 2020-04-29: v1.2.0 - Restructured config.py into smda/SmdaConfig.py to simplify usage and now available via PyPI! 
The smda/Disassembler.py now emits a report object (smda.common.SmdaReport) that allows direct (pythonic) interaction with the results - a JSON can still be easily generated by using toDict() on the report. 71 | * 2020-04-28: v1.1.0 - Several improvements, including: x64 jump table handling, better data flow handling for calls using registers and tailcalls, extended list of common prologues based on much more groundtruth data, extended padding instruction list for gap function discovery, adjusted weights in candidate priority score, filtering code areas based on section tables, using exported symbols as candidates, new function output metadata: confidence score based on instruction mnemonic histogram, PIC hash based on escaped binary instruction sequence 72 | * 2020-03-10: Various minor fixes and QoL improvements. 73 | * 2019-08-20: IdaExporter is now handling failed instruction conversion via capstone properly. 74 | * 2019-08-19: Minor fix for crashes caused by PDB parser. 75 | * 2019-08-05: v1.0.3 - SMDA can now export reports from IDA Pro (requires capstone to be available for idapython). 76 | * 2019-06-13: PDB symbols for functions are now resolved if given a PDB file using parameter "-d" (THX to @VPaulV). 77 | * 2019-05-15: Fixed a bug in PE mapper where buffer would be shortened because of misinterpretation of section sizes. 78 | * 2019-02-14: v1.0.2 - ELF symbols for functions are now resolved, if present in the file. Also "-m" parameter changed to "-p" to imply parsing instead of just mapping (THX: @VPaulV). 79 | * 2018-12-12: all gcc jump table styles are now parsed correctly. 80 | * 2018-11-26: Better handling of multibyte NOPs, ELF loader now provides base addr. 81 | * 2018-09-28: We now have functional PE/ELF loaders. 82 | * 2018-07-09: v1.0.1 - Performance improvements. 83 | * 2018-07-01: v1.0.0 - Initial Release. --------------------------------------------------------------------------------