├── .coveragerc ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .pylintrc ├── LICENSE ├── Makefile ├── README.md ├── analyze.py ├── data ├── apiscout_win7_prof-n_sp1.json └── apiscout_winxp_prof_sp3.json ├── export.py ├── ida_analyze.py ├── requirements.txt ├── setup.py ├── smda ├── Disassembler.py ├── DisassemblyResult.py ├── DisassemblyStatistics.py ├── SmdaConfig.py ├── __init__.py ├── cil │ ├── CilDisassembler.py │ ├── CilInstructionEscaper.py │ ├── FunctionAnalysisState.py │ └── __init__.py ├── common │ ├── BasicBlock.py │ ├── BinaryInfo.py │ ├── BlockLocator.py │ ├── CodeXref.py │ ├── DominatorTree.py │ ├── SmdaBasicBlock.py │ ├── SmdaFunction.py │ ├── SmdaInstruction.py │ ├── SmdaReport.py │ ├── TailcallAnalyzer.py │ ├── Tarjan.py │ ├── __init__.py │ └── labelprovider │ │ ├── AbstractLabelProvider.py │ │ ├── CilSymbolProvider.py │ │ ├── DelphiKbSymbolProvider.py │ │ ├── ElfApiResolver.py │ │ ├── ElfSymbolProvider.py │ │ ├── GoLabelProvider.py │ │ ├── OrdinalHelper.py │ │ ├── PdbSymbolProvider.py │ │ ├── PeSymbolProvider.py │ │ ├── WinApiResolver.py │ │ └── __init__.py ├── ida │ ├── BackendInterface.py │ ├── IdaExporter.py │ ├── IdaInterface.py │ └── __init__.py ├── intel │ ├── BitnessAnalyzer.py │ ├── FunctionAnalysisState.py │ ├── FunctionCandidate.py │ ├── FunctionCandidateManager.py │ ├── IndirectCallAnalyzer.py │ ├── IntelDisassembler.py │ ├── IntelInstructionEscaper.py │ ├── JumpTableAnalyzer.py │ ├── LanguageAnalyzer.py │ ├── MnemonicTfIdf.py │ ├── __init__.py │ └── definitions.py └── utility │ ├── BracketQueue.py │ ├── DelphiKbFileLoader.py │ ├── ElfFileLoader.py │ ├── FileLoader.py │ ├── MachoFileLoader.py │ ├── MemoryFileLoader.py │ ├── PeFileLoader.py │ ├── PriorityQueue.py │ ├── StringExtractor.py │ └── __init__.py ├── tests ├── __init__.py ├── asprox_0x008D0000_xored ├── bashlite_xored ├── context.py ├── cutwail_xored ├── komplex_xored ├── njrat_xored ├── testBracketQueue.py ├── testEscaper.py ├── testFileFormatParsers.py ├── 
testIntegration.py └── testTarjan.py └── version_history.md /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | branch = True 4 | source = 5 | *smda* 6 | include = 7 | *smda* 8 | *tests* 9 | omit = 10 | *lib* 11 | capstone/* 12 | *distutils/* 13 | ctypes/* 14 | 15 | [report] 16 | # Regexes for lines to exclude from consideration 17 | exclude_lines = 18 | # Have to re-enable the standard pragma 19 | pragma: no cover 20 | 21 | # Don't complain about missing debug-only code: 22 | def __repr__ 23 | if self\.debug 24 | 25 | # Don't complain if tests don't hit defensive assertion code: 26 | raise AssertionError 27 | raise NotImplementedError 28 | 29 | # Don't complain if non-runnable code isn't run: 30 | if 0: 31 | if __name__ == "__main__": 32 | def main(argv): 33 | 34 | ignore_errors = True 35 | 36 | [html] 37 | directory = coverage_html_report 38 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Python Tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -r requirements.txt 28 | pip install -e . 
29 | 30 | - name: Run tests 31 | run: make test -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .coverage.* 34 | .cache 35 | .noseids 36 | nosetests.xml 37 | coverage.xml 38 | *,cover 39 | .hypothesis/ 40 | 41 | 42 | # pyenv 43 | .python-version 44 | 45 | # dotenv 46 | .env 47 | 48 | # virtualenv 49 | .venv/ 50 | venv/ 51 | ENV/ 52 | 53 | # Spyder project settings 54 | .spyderproject 55 | 56 | # Rope project settings 57 | .ropeproject 58 | 59 | # more IDE settings 60 | .idea 61 | .vscode 62 | 63 | 64 | # project files 65 | config.ini 66 | figures 67 | reports 68 | analyze_*.py 69 | coverage-html -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018-2020, Daniel Plohmann and Steffen Enders 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
# Project task runner for smda.
init:
	pip install -r requirements.txt
package:
	rm -rf dist/*
	python setup.py sdist
publish:
	python -m twine upload dist/* -u __token__
pylint:
	python -m pylint --rcfile=.pylintrc smda
test:
	pytest tests/test*
test-coverage:
	python -m nose --with-coverage --cover-erase --cover-html-dir=./coverage-html --cover-html --cover-package=smda
clean:
	# NOTE: "$$" is required so make passes a literal "$" (regex end-anchor) to grep;
	# the previous "\.pyo$\)" left the group unterminated and broke the pattern.
	find . | grep -E "(__pycache__|\.pyc|\.pyo$$)" | xargs rm -rf
	rm -rf .coverage
	rm -rf coverage-html
	rm -rf dist/*
6 | As input, arbitrary memory dumps (ideally with known base address) can be processed. 7 | The output is a collection of functions, basic blocks, and instructions with their respective edges between blocks and functions (in/out). 8 | Optionally, references to the Windows API can be inferred by using the ApiScout method. 9 | 10 | ## Installation 11 | 12 | With version 1.2.0, we have finally simplified things by moving to [PyPI](https://pypi.org/project/smda/)! 13 | So installation now is as easy as: 14 | 15 | ``` 16 | $ pip install smda 17 | ``` 18 | 19 | ## Usage 20 | 21 | A typical workflow using SMDA could like this: 22 | 23 | ``` 24 | >>> from smda.Disassembler import Disassembler 25 | >>> disassembler = Disassembler() 26 | >>> report = disassembler.disassembleFile("/bin/cat") 27 | >>> print(report) 28 | 0.777s -> (architecture: intel.64bit, base_addr: 0x00000000): 143 functions 29 | >>> for fn in report.getFunctions(): 30 | ... print(fn) 31 | ... for ins in fn.getInstructions(): 32 | ... print(ins) 33 | ... 34 | 0x00001720: (-> 1, 1->) 3 blocks, 7 instructions. 35 | 0x00001720: ( 4883ec08) - sub rsp, 8 36 | 0x00001724: (488b05bd682000) - mov rax, qword ptr [rip + 0x2068bd] 37 | 0x0000172b: ( 4885c0) - test rax, rax 38 | 0x0000172e: ( 7402) - je 0x1732 39 | 0x00001730: ( ffd0) - call rax 40 | 0x00001732: ( 4883c408) - add rsp, 8 41 | 0x00001736: ( c3) - ret 42 | 0x00001ad0: (-> 1, 4->) 1 blocks, 12 instructions. 43 | [...] 44 | >>> json_report = report.toDict() 45 | ``` 46 | 47 | There is also a demo script: 48 | 49 | * analyze.py -- example usage: perform disassembly on a file or memory dump and optionally store results in JSON to a given output path. 50 | 51 | The code should be fully compatible with Python 3.8+. 52 | Further explanation on the innerworkings follow in separate publications but will be referenced here. 
53 | 54 | To take full advantage of SMDA's capabilities, make sure to (optionally) install: 55 | * lief 56 | * pdbparse (currently as fork from https://github.com/VPaulV/pdbparse to support Python3) 57 | 58 | ## Version History 59 | * 2025-02-26: v2.0.2 - Adjusting relative import, adding init file. 60 | * 2025-02-25: v2.0.0 - Initial experimental support for CIL (.NET) disassembly. 61 | * 2025-02-24: v1.14.3 - PicHashing can now be disabled via SmdaConfig to save some processing time. (THX to @Nalexander-hanel!) 62 | * 2025-02-24: v1.14.2 - We are Python 3.8+ compatible again (changed UTC usage) and (DWARF) PE symbols for PE files should be extracted again (THX to @N0fix for the update!) 63 | * 2025-02-21: v1.14.1 - Fixed changed field names in LIEF usage that broke ELF parsing, added tests for ELF+macOS parsing (THX to @N0fix for the update!) 64 | * 2025-01-29: v1.14.0 - Bump to LIEF 0.16.0+ (THX to @huettenhain for the ping!). Migrated tests to `pytest`, UTC datetime handling fixes. 65 | * 2025-01-26: v1.13.24 - Added functionality to import and export SMDA reports as JSON. Fixed byte patterns matching special regex chars (THX to @alexander-hanel!). 66 | * 2024-07-26: v1.13.23 - Now using OEP as symbol function candidate when available (THX to @alexander-hanel for reporting!). 67 | * 2024-05-10: v1.13.22 - Handled odd case where disassembly with capstone and IDA would return different results (THX to @r0ny123 for reporting!). 68 | * 2024-04-17: v1.13.21 - Fixed handling of Go binaries for version 1.20+ (THX to @Manny684!). 69 | * 2024-04-08: v1.13.20 - Fixed handling of bnd prefix in CFG instructions to help with parsing PLT (THX to @Manny684!). 70 | * 2024-04-02: v1.13.19 - Fixed bug in string parsing, added tests, strings now no longer are hex-encoded as they are always printable anyway. 71 | * 2024-03-12: v1.13.18 - Added functionality to extract and store all referenced strings along SmdaFunctions (has to be enabled via SmdaConfig). 
72 | * 2024-03-12: v1.13.17 - Extended disassembleBuffer() to now take additional arguments `code_areas` and `oep`. 73 | * 2024-02-21: v1.13.16 - BREAKING IntelInstructionEscaper.escapeMnemonic: Escaper now handles another 200 instruction names found in other capstone source files (THX for reporting @malwarefrank!). 74 | * 2024-02-15: v1.13.15 - Fixed issues with version recognition in SmdaFunction which cause issues in MCRIT (THX to @ 75 | malwarefrank!) 76 | * 2024-02-02: v1.13.12 - Versions might be non-numerical, addressed that in SmdaFunction. 77 | * 2024-01-23: v1.13.11 - Introduced indicator in SmdaConfig for compatibility of instruction escaping. 78 | * 2024-01-23: v1.13.10 - Parsing of PE files should work again with lief >=0.14.0. 79 | * 2024-01-23: v1.13.9 - Improved parsing robustness for section/segment tables in ELF files, also now padding with zeroes when finding less content than expected physical size in a segment (THX for reporting @schrodyn!). 80 | * 2024-01-23: v1.13.8 - BREAKING adjustments to IntelInstructionEscaper.escapeMnemonic: Escaper now is capable of handling all known x86/x64 instructions in capstone (THX for reporting @schrodyn!). 81 | * 2023-12-01: v1.13.7 - Skip processing of Delphi structs for large files, workaround until this is properly reimplemented. 82 | * 2023-11-29: v1.13.6 - Made OpcodeHash an attribute with on-demand calculation to save processing time. 83 | * 2023-11-29: v1.13.3 - Implemented an alternative queue working with reference count based brackets in pursuit of accelerated processing. 84 | * 2023-11-28: v1.13.2 - IndirectCallAnalyzer will now analyze at most a configurable amount of calls per basic block, default 50. 85 | * 2023-11-21: v1.13.1 - SmdaBasicBlock now has `getPredecessors()` and `getSuccessors()`. 86 | * 2023-11-21: v1.13.0 - BREAKING adjustments to PicHashing (now wildcarding intraprocedural jumps in functions, additionally more immediates if within address space). 
Introduction of OpcodeHash (OpcHash), which wildcards all but prefixes and opcode bytes. 87 | * 2023-10-12: v1.12.7 - Bugfix for parsing Delphi structs. 88 | * 2023-09-15: v1.12.6 - Bugfix in BlockLocator (THX to @cccs-ay!). 89 | * 2023-08-28: v1.12.5 - Bugfix for address dereferencing where buffer sizes were not properly checked (THX to @yankovs!). 90 | * 2023-08-08: v1.12.4 - SmdaBasicBlock can now do getPicBlockHash(). 91 | * 2023-05-23: v1.12.3 - Fixed bugs in PE parser and Go parser. 92 | * 2023-05-08: v1.12.1 - Get rid of deprecation warning in IDA 8.0+. 93 | * 2023-03-24: v1.12.0 - SMDA now parses PE export directories for symbols, as well as MinGW DWARF information if available. 94 | * 2023-03-14: v1.11.2 - SMDA report now also contains SHA1 and MD5. 95 | * 2023-03-14: v1.11.1 - rendering dotGraph can now include API references instead of plain calls. 96 | * 2023-02-06: v1.11.0 - SmdaReport now has functionality to find a function/block by a given offset contained within in (THX to @cccs-ay!). 97 | * 2023-02-06: v1.10.0 - Adjusted to LIEF 0.12.3 API for binary parsing (THX to @lainswork!). 98 | * 2022-08-12: v1.9.1 - Added support for parsing intel MachO files, including Go parsing. 99 | * 2022-08-01: v1.8.0 - Added support for parsing Go function information (THX to @danielenders1!). 100 | * 2022-01-27: v1.7.0 - SmdaReports now contains a field `oep`; SmdaFunctions now indicate `is_exported` and can provide CodeXrefs via `getCodeInrefs()` and `getCodeOutrefs()`. (THX for the ideas: @mr-tz) 101 | * 2021-08-20: v1.6.0 - Bugfix for alignment calculation of binary mappings. (THX: @williballenthin) 102 | * 2021-08-19: v1.6.0 - Bugfix for truncation during ELF segment/section loading. API usage in ELF files is now resolved as well! (THX: @williballenthin) 103 | * 2020-10-30: v1.5.0 - PE section table now contained in SmdaReport and added `SmdaReport.getSection(offset)`. 104 | * 2020-10-26: v1.4.0 - Adding SmdaBasicBlock. 
Some convenience code to ease integration with capa.
"""Demo script: use SMDA to disassemble a file (loaded memory view), optionally
map it first and/or write the resulting report to a JSON file."""
import argparse
import json
import logging
import os
import re
import sys

# Pattern for a base address embedded in a file name, e.g. "dump_0x00400000".
# Fixed: the group definition was missing its name ("(?P[...]" is invalid and
# raises re.error); .group("base_addr") below requires the named group.
BASE_ADDR_PATTERN = re.compile("_0x(?P<base_addr>[0-9a-fA-F]{8,16})")


def parseBaseAddrFromArgs(args):
    """Return the base address to use for analysis.

    Precedence: explicit --base_addr (decimal or 0x-hex), then a
    "_0x<hexaddr>" token in the input file name, then 0.
    """
    if args.base_addr:
        parsed_base_addr = int(args.base_addr, 16) if args.base_addr.startswith("0x") else int(args.base_addr)
        logging.info("using provided base address: 0x%08x", parsed_base_addr)
        return parsed_base_addr
    # try to infer base addr from filename:
    baddr_match = BASE_ADDR_PATTERN.search(args.input_path)
    if baddr_match:
        parsed_base_addr = int(baddr_match.group("base_addr"), 16)
        logging.info("Parsed base address from file name: 0x%08x %d", parsed_base_addr, parsed_base_addr)
        return parsed_base_addr
    logging.warning("No base address recognized, using 0.")
    return 0


def parseOepFromArgs(args):
    """Return the OEP (given as RVA, decimal or 0x-hex) or None if not provided."""
    if args.oep:
        parsed_oep = int(args.oep, 16) if args.oep.startswith("0x") else int(args.oep)
        logging.info("using provided OEP(RVA): 0x%08x", parsed_oep)
        return parsed_oep
    logging.warning("No OEP recognized, skipping.")
    return None


def readFileContent(file_path):
    """Read and return the raw bytes of the file at file_path."""
    with open(file_path, "rb") as fin:
        return fin.read()


if __name__ == "__main__":
    # smda imports are deferred into the script body so the helper functions
    # above can be imported (and unit-tested) without smda being installed.
    from smda.SmdaConfig import SmdaConfig
    from smda.Disassembler import Disassembler

    PARSER = argparse.ArgumentParser(description='Demo: Use SMDA to disassemble a given file (loaded memory view), optionally map it first and/or write the output to a file.')
    PARSER.add_argument('-p', '--parse_header', action='store_true', default=False, help='Parse header/symbols and perform mapping of the file as normalization.')
    PARSER.add_argument('-d', '--pdb_path', type=str, default='', help='If available, use a PDB file to enhance disassembly (function offsets and names).')
    PARSER.add_argument('-r', '--architecture', type=str, default='', help='Use the disassembler for the following architecture if available (default:auto, options: [intel, cil]).')
    PARSER.add_argument('-a', '--base_addr', type=str, default='', help='When analyzing a buffer, set base address to given value (int or 0x-hex format).')
    PARSER.add_argument('-b', '--bitness', type=int, default=0, help='Optionally force bitness to [32, 64] when processing dumps.')
    PARSER.add_argument('-i', '--oep', type=str, default='', help='Force OEP for buffers, defined as RVA.')
    PARSER.add_argument('-o', '--output_path', type=str, default='', help='Optionally write the output to a file (JSON format).')
    PARSER.add_argument('-s', '--strings', action='store_true', default=False, help='Enable string extraction.')
    PARSER.add_argument('-v', '--verbose', action='store_true', default=False, help='Enable debug logging.')
    PARSER.add_argument('input_path', type=str, default='', help='Path to file to analyze.')

    ARGS = PARSER.parse_args()

    if not ARGS.input_path:
        PARSER.print_help()
        sys.exit(1)

    # optionally create and set up a config, e.g. when using ApiScout profiles for WinAPI import usage discovery
    config = SmdaConfig()
    if ARGS.verbose:
        config.LOG_LEVEL = logging.DEBUG
    if ARGS.strings:
        config.WITH_STRINGS = True
    logging.basicConfig(level=config.LOG_LEVEL, format=config.LOG_FORMAT)

    SMDA_REPORT = None
    INPUT_FILENAME = ""
    BITNESS = ARGS.bitness if ARGS.bitness in [32, 64] else None
    if os.path.isfile(ARGS.input_path):
        print("now analyzing {}".format(ARGS.input_path))
        INPUT_FILENAME = os.path.basename(ARGS.input_path)
        if ARGS.parse_header:
            DISASSEMBLER = Disassembler(config, backend=ARGS.architecture)
            SMDA_REPORT = DISASSEMBLER.disassembleFile(ARGS.input_path, pdb_path=ARGS.pdb_path)
        else:
            BUFFER = readFileContent(ARGS.input_path)
            BASE_ADDR = parseBaseAddrFromArgs(ARGS)
            OEP = parseOepFromArgs(ARGS)
            config.API_COLLECTION_FILES = {"win_7": os.sep.join([config.PROJECT_ROOT, "data", "apiscout_win7_prof-n_sp1.json"])}
            DISASSEMBLER = Disassembler(config, backend=ARGS.architecture)
            SMDA_REPORT = DISASSEMBLER.disassembleBuffer(BUFFER, BASE_ADDR, BITNESS, oep=OEP)
        SMDA_REPORT.filename = os.path.basename(ARGS.input_path)
        print(SMDA_REPORT)
    if SMDA_REPORT and os.path.isdir(ARGS.output_path):
        with open(ARGS.output_path + os.sep + INPUT_FILENAME + ".smda", "w") as fout:
            json.dump(SMDA_REPORT.toDict(), fout, indent=1, sort_keys=True)
def detectBackend():
    """Detect whether this script is running inside a supported disassembler.

    Returns:
        tuple: (backend, version) -- ("IDA", <IDA_SDK_VERSION>) when the IDA
        Python modules are importable, otherwise ("", "").
    """
    backend = ""
    version = ""
    try:
        import idaapi
        import idautils  # imported only to verify the IDA environment is complete
        backend = "IDA"
        version = idaapi.IDA_SDK_VERSION
    except ImportError:
        # narrowed from a bare "except:", which also swallowed
        # SystemExit/KeyboardInterrupt; absence of the modules simply
        # means we are not running inside IDA.
        pass
    return (backend, version)
"""IDA script: disassemble the loaded binary with SMDA and re-apply the
recovered functions and names to the open IDB."""
from smda.SmdaConfig import SmdaConfig
from smda.Disassembler import Disassembler

from export import detectBackend


if __name__ == "__main__":
    BACKEND, VERSION = detectBackend()
    if BACKEND != "IDA":
        raise Exception("Run this script from within IDA.")
    # only import the IDA bridge once we know we are inside IDA
    from smda.ida.IdaInterface import IdaInterface
    ida_interface = IdaInterface()
    binary = ida_interface.getBinary()
    base_addr = ida_interface.getBaseAddr()
    DISASSEMBLER = Disassembler(SmdaConfig())
    REPORT = DISASSEMBLER.disassembleBuffer(binary, base_addr)
    smda_function_count = 0
    smda_name_count = 0
    for smda_function in REPORT.getFunctions():
        # makeFunction()/makeNameEx() report success, which we tally up
        smda_function_count += ida_interface.makeFunction(smda_function.offset)
        if smda_function.function_name != "":
            smda_name_count += ida_interface.makeNameEx(smda_function.offset, smda_function.function_name)
    print(f"Defined {smda_function_count} functions and assigned {smda_name_count} function names.")
# -*- coding: utf-8 -*-
"""Package setup for smda."""
import sys
from setuptools import setup, find_packages

with open("README.md", "r") as fh:
    long_description = fh.read()


requirements = ["capstone", "dncil", "dnfile"]

if sys.version_info >= (3, 0):
    # py3
    requirements.append("lief>=0.16.0")
else:
    # py2 - newer LIEF is Python3 only (legacy branch; the project targets 3.8+)
    requirements.append("lief==0.9.0")


setup(
    name='smda',
    # note to self: always change this in config as well.
    version='2.0.2',
    # fixed typo: "disassmbler" -> "disassembler" in the PyPI description
    description='A recursive disassembler optimized for CFG recovery from memory dumps. Based on capstone.',
    long_description_content_type="text/markdown",
    long_description=long_description,
    author='Daniel Plohmann',
    author_email='daniel.plohmann@mailbox.org',
    url='https://github.com/danielplohmann/smda',
    license="BSD 2-Clause",
    packages=find_packages(exclude=('tests', 'docs')),
    install_requires=requirements,
    data_files=[
        ('', ['LICENSE']),
    ],
    classifiers=[
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Topic :: Security",
        "Topic :: Software Development :: Disassemblers",
    ],
)
LOGGER = logging.getLogger(__name__)


class Disassembler(object):
    """Entry point for SMDA analysis.

    Dispatches files/buffers to an architecture-specific backend (intel, cil,
    or the IDA exporter) and wraps the results in a SmdaReport.
    """

    def __init__(self, config=None, backend=None):
        """
        Args:
            config: optional SmdaConfig; a default config is created if omitted.
            backend: optional backend selector ("intel", "cil", "IDA"); when
                None, the backend is chosen lazily per input architecture.
        """
        if config is None:
            config = SmdaConfig()
        self.config = config
        self.disassembler = None
        if backend == "intel":
            self.disassembler = IntelDisassembler(self.config)
        elif backend == "cil":
            self.disassembler = CilDisassembler(self.config)
        elif backend == "IDA":
            self.disassembler = IdaExporter(self.config)
        self._start_time = None
        self._timeout = 0
        # cache the last DisassemblyResult
        self.disassembly = None

    def initDisassembler(self, architecture="intel"):
        """ Initialize disassembler backend to given architecture, if not initialized yet, default: intel """
        if self.disassembler is None:
            if architecture == "intel":
                self.disassembler = IntelDisassembler(self.config)
            elif architecture == "cil":
                self.disassembler = CilDisassembler(self.config)

    def _getDurationInSeconds(self, start_ts, end_ts):
        """Return (end_ts - start_ts) as a float of seconds.

        Uses total_seconds() so that durations of a day or more are not
        truncated (timedelta.seconds alone silently drops the day component).
        """
        return (end_ts - start_ts).total_seconds()

    def _callbackAnalysisTimeout(self):
        """Return True when the configured analysis timeout has been exceeded."""
        if not self._timeout:
            return False
        time_diff = datetime.datetime.now(datetime.timezone.utc) - self._start_time
        LOGGER.debug("Current analysis callback time %s", time_diff)
        # total_seconds() instead of .seconds: correct even for >1 day runtimes
        return time_diff.total_seconds() >= self._timeout

    def _addStringsToReport(self, smda_report, buffer):
        """Attach the buffer and per-function string references to the report."""
        smda_report.buffer = buffer
        for smda_function in smda_report.getFunctions():
            smda_function.stringrefs = {addr: string for string, addr in extract_strings(smda_function)}

    def _initBinaryInfoFromLoader(self, loader, file_path):
        """Build a BinaryInfo from a loader (shared by file/unmapped-buffer paths).

        Hashes are computed over the raw (unmapped) data, not over how the
        loader mapped it to memory.
        """
        binary_info = BinaryInfo(loader.getData())
        binary_info.raw_data = loader.getRawData()
        binary_info.sha256 = hashlib.sha256(binary_info.raw_data).hexdigest()
        binary_info.sha1 = hashlib.sha1(binary_info.raw_data).hexdigest()
        binary_info.md5 = hashlib.md5(binary_info.raw_data).hexdigest()
        binary_info.file_path = file_path
        binary_info.base_addr = loader.getBaseAddress()
        binary_info.bitness = loader.getBitness()
        binary_info.architecture = loader.getArchitecture()
        binary_info.code_areas = loader.getCodeAreas()
        return binary_info

    def disassembleFile(self, file_path, pdb_path=""):
        """Disassemble the file at file_path; optionally use a PDB for symbols.

        Returns a SmdaReport; on failure, a report with status "error".
        """
        loader = FileLoader(file_path, map_file=True)
        file_content = loader.getData()
        binary_info = self._initBinaryInfoFromLoader(loader, file_path)
        self.initDisassembler(binary_info.architecture)
        start = datetime.datetime.now(datetime.timezone.utc)
        try:
            self.disassembler.addPdbFile(binary_info, pdb_path)
            smda_report = self._disassemble(binary_info, timeout=self.config.TIMEOUT)
            if self.config.WITH_STRINGS:
                self._addStringsToReport(smda_report, file_content)
            if self.config.STORE_BUFFER:
                smda_report.buffer = file_content
        except Exception as exc:
            LOGGER.error("An error occurred while disassembling file.")
            smda_report = self._createErrorReport(start, exc)
        return smda_report

    def disassembleUnmappedBuffer(self, file_content):
        """Map a raw file buffer and disassemble it.

        Returns a SmdaReport; on failure, a report with status "error".
        """
        loader = MemoryFileLoader(file_content, map_file=True)
        mapped_content = loader.getData()
        binary_info = self._initBinaryInfoFromLoader(loader, "")
        self.initDisassembler(binary_info.architecture)
        start = datetime.datetime.now(datetime.timezone.utc)
        try:
            smda_report = self._disassemble(binary_info, timeout=self.config.TIMEOUT)
            if self.config.WITH_STRINGS:
                self._addStringsToReport(smda_report, mapped_content)
            if self.config.STORE_BUFFER:
                smda_report.buffer = mapped_content
        except Exception as exc:
            LOGGER.error("An error occurred while disassembling unmapped buffer.")
            smda_report = self._createErrorReport(start, exc)
        return smda_report

    def disassembleBuffer(self, file_content, base_addr, bitness=None, code_areas=None, oep=None, architecture="intel"):
        """
        Disassemble a given buffer (file_content), with given base_addr.
        Optionally specify bitness, the areas to which disassembly should be limited to (code_areas) and an entry point (oep)
        """
        binary_info = BinaryInfo(file_content)
        binary_info.base_addr = base_addr
        binary_info.bitness = bitness
        binary_info.is_buffer = True
        binary_info.code_areas = code_areas
        binary_info.architecture = architecture
        binary_info.oep = oep
        self.initDisassembler(binary_info.architecture)
        start = datetime.datetime.now(datetime.timezone.utc)
        try:
            smda_report = self._disassemble(binary_info, timeout=self.config.TIMEOUT)
            if self.config.WITH_STRINGS:
                self._addStringsToReport(smda_report, file_content)
            if self.config.STORE_BUFFER:
                smda_report.buffer = file_content
        except Exception as exc:
            LOGGER.error("An error occurred while disassembling buffer.")
            smda_report = self._createErrorReport(start, exc)
        return smda_report

    def _disassemble(self, binary_info, timeout=0):
        """Run the backend on binary_info, caching the DisassemblyResult."""
        self._start_time = datetime.datetime.now(datetime.timezone.utc)
        self._timeout = timeout
        self.disassembly = self.disassembler.analyzeBuffer(binary_info, self._callbackAnalysisTimeout)
        return SmdaReport(self.disassembly, config=self.config)

    def _createErrorReport(self, start, exception):
        """Produce a SmdaReport describing a failed analysis (status "error")."""
        report = SmdaReport(config=self.config)
        report.smda_version = self.config.VERSION
        report.status = "error"
        report.execution_time = self._getDurationInSeconds(start, datetime.datetime.now(datetime.timezone.utc))
        report.message = traceback.format_exc()
        return report
num_recursive_functions = None 7 | num_leaf_functions = None 8 | num_basic_blocks = None 9 | num_instructions = None 10 | num_api_calls = None 11 | num_function_calls = None 12 | num_failed_functions = None 13 | num_failed_instructions = None 14 | 15 | def __init__(self, disassembly_result=None): 16 | if disassembly_result is not None: 17 | self.num_functions = len(disassembly_result.functions) 18 | self.num_recursive_functions = len(disassembly_result.recursive_functions) 19 | self.num_leaf_functions = len(disassembly_result.leaf_functions) 20 | self.num_basic_blocks = self._countBlocks(disassembly_result) 21 | self.num_instructions = self._countInstructions(disassembly_result) 22 | self.num_api_calls = self._countApiCalls(disassembly_result) 23 | self.num_function_calls = self._countFunctionCalls(disassembly_result) 24 | self.num_failed_functions = len(disassembly_result.failed_analysis_addr) 25 | self.num_failed_instructions = len(disassembly_result.errors) 26 | 27 | def _countBlocks(self, disassembly_result): 28 | num_blocks = 0 29 | for _, blocks in disassembly_result.functions.items(): 30 | num_blocks += len(blocks) 31 | return num_blocks 32 | 33 | def _countApiCalls(self, disassembly_result): 34 | return len(disassembly_result.getAllApiRefs()) 35 | 36 | def _countInstructions(self, disassembly_result): 37 | num_ins = 0 38 | for function_offset in sorted(disassembly_result.functions): 39 | for block in disassembly_result.functions[function_offset]: 40 | num_ins += len(block) 41 | return num_ins 42 | 43 | def _countFunctionCalls(self, disassembly_result): 44 | num_calls = 0 45 | for function_start in disassembly_result.functions: 46 | if function_start in disassembly_result.code_refs_to: 47 | num_calls += len(disassembly_result.code_refs_to[function_start]) 48 | return num_calls 49 | 50 | @classmethod 51 | def fromDict(cls, statistics_dict): 52 | statistics = cls(None) 53 | statistics.num_functions = statistics_dict["num_functions"] 54 | 
statistics.num_recursive_functions = statistics_dict["num_recursive_functions"] 55 | statistics.num_leaf_functions = statistics_dict["num_leaf_functions"] 56 | statistics.num_basic_blocks = statistics_dict["num_basic_blocks"] 57 | statistics.num_instructions = statistics_dict["num_instructions"] 58 | statistics.num_api_calls = statistics_dict["num_api_calls"] 59 | statistics.num_function_calls = statistics_dict["num_function_calls"] 60 | statistics.num_failed_functions = statistics_dict["num_failed_functions"] 61 | statistics.num_failed_instructions = statistics_dict["num_failed_instructions"] 62 | return statistics 63 | 64 | def toDict(self): 65 | return { 66 | "num_functions": self.num_functions, 67 | "num_recursive_functions": self.num_recursive_functions, 68 | "num_leaf_functions": self.num_leaf_functions, 69 | "num_basic_blocks": self.num_basic_blocks, 70 | "num_instructions": self.num_instructions, 71 | "num_api_calls": self.num_api_calls, 72 | "num_function_calls": self.num_function_calls, 73 | "num_failed_functions": self.num_failed_functions, 74 | "num_failed_instructions": self.num_failed_instructions 75 | } 76 | 77 | def __add__(self, other): 78 | if not isinstance(other, DisassemblyStatistics): 79 | raise ValueError("Needs another DisassemblyStatistics to perform addition of values") 80 | self.num_functions += other.num_functions 81 | self.num_recursive_functions += other.num_recursive_functions 82 | self.num_leaf_functions += other.num_leaf_functions 83 | self.num_basic_blocks += other.num_basic_blocks 84 | self.num_instructions += other.num_instructions 85 | self.num_api_calls += other.num_api_calls 86 | self.num_function_calls += other.num_function_calls 87 | self.num_failed_functions += other.num_failed_functions 88 | self.num_failed_instructions += other.num_failed_instructions 89 | return self 90 | -------------------------------------------------------------------------------- /smda/SmdaConfig.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | 5 | class SmdaConfig(object): 6 | 7 | # note to self: always change this in setup.py as well! 8 | VERSION = "2.0.2" 9 | ESCAPER_DOWNWARD_COMPATIBILITY = "1.13.16" 10 | CONFIG_FILE_PATH = str(os.path.abspath(__file__)) 11 | PROJECT_ROOT = str(os.path.abspath(os.sep.join([CONFIG_FILE_PATH, "..", ".."]))) 12 | 13 | ### An (optional) WinAPI database as generated by ApiScout (https://github.com/danielplohmann/apiscout) 14 | API_COLLECTION_FILES = {} 15 | ### global logging-config setup 16 | # Only do basicConfig if no handlers have been configured 17 | LOG_PATH = "./" 18 | LOG_LEVEL = logging.INFO 19 | LOG_FORMAT = "%(asctime)-15s: %(name)-32s - %(message)s" 20 | 21 | ### SMDA disassembler config 22 | # maximum time in seconds for disassembly to complete 23 | TIMEOUT = 300 24 | # maximum number of bytes to allocate while loading 25 | MAX_IMAGE_SIZE = 100 * 1024 * 1024 26 | # store raw binary buffer in SmdaReport to enable carving data from refs 27 | STORE_BUFFER = False 28 | # extract strings during disassembly 29 | WITH_STRINGS = False 30 | # the queue to use for candidate management 31 | CANDIDATE_QUEUE = "PriorityQueue" # choose from: ["BracketQueue", "PriorityQueue"] 32 | # improve disassembly by resolving references through data flows 33 | USE_ALIGNMENT = True 34 | USE_SYMBOLS_AS_CANDIDATES = True 35 | RESOLVE_REGISTER_CALLS = True 36 | # limit this to avoid blowing up analysis time for weird samples 37 | MAX_INDIRECT_CALLS_PER_BASIC_BLOCK = 50 38 | HIGH_ACCURACY = True 39 | RESOLVE_TAILCALLS = False 40 | # optional metadata generation options 41 | CALCULATE_SCC = True 42 | CALCULATE_NESTING = True 43 | CALCULATE_HASHING = True 44 | # confidence score to use for filtering functions before including them in the output 45 | CONFIDENCE_THRESHOLD = 0.0 46 | -------------------------------------------------------------------------------- 
import logging

LOGGER = logging.getLogger(__name__)

# CIL mnemonics that transfer control to another function (do not end a block's fallthrough)
CALL_INS = ["call", "calli", "callvirt"]
# CIL mnemonics that terminate a function/basic block
END_INS = ["ret"]

class FunctionAnalysisState(object):
    """Mutable working state while disassembling one CIL function.

    Collects instructions, intra-function code/data references and flags, and
    on finalizeAnalysis() commits them into the shared disassembly result.
    """

    def __init__(self, start_addr, code_start_addr, disassembly):
        self.start_addr = start_addr
        self.code_start_addr = code_start_addr
        self.disassembly = disassembly
        self.block_queue = [start_addr]
        self.current_block = []
        self.blocks = []
        self.num_blocks_analyzed = 0
        # instructions as tuples: (address, size, mnemonic, op_str, bytes)
        self.instructions = []
        self.instruction_start_bytes = set([])
        self.processed_blocks = set([])
        self.processed_bytes = set([])
        # addresses that are targets of CFG redirections (used as block starts)
        self.jump_targets = set([])
        self.call_register_ins = []
        self.block_start = 0xFFFFFFFF
        self.data_bytes = set([])
        self.data_refs = set([])
        self.code_refs = set([])
        self.code_refs_from = {}
        self.code_refs_to = {}
        self.prev_opcode = ""
        self.suspicious_ins_count = 0
        self.is_jmp = False
        self.is_next_instruction_reachable = True
        self.is_block_ending_instruction = False
        self.is_sanely_ending = False
        self.has_collision = False
        self.colliding_addresses = set()
        # set a flag that this tailcall has already been resolved so it does not have to be reanalyzed several times
        self.is_tailcall_function = False
        self.is_leaf_function = True
        self.is_recursive = False
        self.is_thunk_call = False
        self.label = ""

    def addInstruction(self, i_address, i_size, i_mnemonic, i_op_str, i_bytes):
        """Record one instruction and, if reachable, a fallthrough code ref to its successor."""
        ins = (i_address, i_size, i_mnemonic, i_op_str, i_bytes)
        self.instructions.append(ins)
        self.instruction_start_bytes.add(ins[0])
        # mark every byte the instruction occupies as processed
        for byte in range(i_size):
            self.processed_bytes.add(i_address + byte)
        if self.is_next_instruction_reachable:
            self.addCodeRef(i_address, i_address + i_size, self.is_jmp)
        self.is_jmp = False

    def addCodeRef(self, addr_from, addr_to, by_jump=False):
        """Record a directed code reference; jump targets additionally become block starts."""
        self.code_refs.update([(addr_from, addr_to)])
        refs_from = self.code_refs_from.get(addr_from, set([]))
        refs_from.update([addr_to])
        self.code_refs_from[addr_from] = refs_from
        refs_to = self.code_refs_to.get(addr_to, set([]))
        refs_to.update([addr_from])
        self.code_refs_to[addr_to] = refs_to
        if by_jump:
            self.is_jmp = True
            self.jump_targets.update([addr_to])

    def removeCodeRef(self, addr_from, addr_to):
        """Undo addCodeRef(): drop the edge from all indexes and the jump-target set."""
        if (addr_from, addr_to) in self.code_refs:
            self.code_refs.remove((addr_from, addr_to))
        if addr_from in self.code_refs_from and addr_to in self.code_refs_from[addr_from]:
            self.code_refs_from[addr_from].remove(addr_to)
        if addr_to in self.code_refs_to and addr_from in self.code_refs_to[addr_to]:
            self.code_refs_to[addr_to].remove(addr_from)
        if addr_to in self.jump_targets:
            self.jump_targets.remove(addr_to)

    def addDataRef(self, addr_from, addr_to, size=1):
        """Record a data reference and mark the referenced byte range as data."""
        self.data_refs.update([(addr_from, addr_to)])
        for i in range(size):
            self.data_bytes.update([addr_to + i])

    def finalizeAnalysis(self, as_gap=False):
        """Commit the collected per-function state into the shared disassembly result."""
        fn_min = min([ins[0] for ins in self.instructions])
        fn_max = max([ins[0] + ins[1] for ins in self.instructions])

        self.disassembly.function_symbols[self.start_addr] = self.label
        self.disassembly.function_borders[self.start_addr] = (fn_min, fn_max)
        for ins in self.instructions:
            self.disassembly.instructions[ins[0]] = (ins[2], ins[1])
            # map every covered byte back to its instruction and owning function
            for offset in range(ins[1]):
                self.disassembly.code_map[ins[0] + offset] = ins[0]
                self.disassembly.ins2fn[ins[0] + offset] = self.start_addr
        self.disassembly.data_map.update(self.data_bytes)
        self.disassembly.functions[self.start_addr] = self.getBlocks()
        for cref in self.code_refs:
            self.disassembly.addCodeRefs(cref[0], cref[1])
        for dref in self.data_refs:
            self.disassembly.addDataRefs(dref[0], dref[1])
        if self.is_recursive:
            self.disassembly.recursive_functions.add(self.start_addr)
        if self.is_leaf_function:
            self.disassembly.leaf_functions.add(self.start_addr)
        if self.is_thunk_call:
            self.disassembly.thunk_functions.add(self.start_addr)
        return True

    def getBlocks(self):
        """
        block derivation strategy:
        walk over all potential block starts, which are the start_addr + all "jump" targets (i.e. CFG redirection targets)
        then, for consecutive instructions, break if
        * they have more than 1 outgoing edge
        * the following instruction has more than 1 incoming edge
        """
        if self.blocks:
            # cached from a previous derivation
            return self.blocks
        self.instructions.sort()
        # map instruction address -> index into the sorted instruction list
        ins = {i[0]:ind for ind, i in enumerate(self.instructions)}
        potential_starts = set([self.code_start_addr])
        potential_starts.update(list(self.jump_targets))
        blocks = []
        for start in sorted(potential_starts):
            if not start in ins:
                continue
            block = []
            for i in range(ins[start], len(self.instructions)):
                current = self.instructions[i]
                block.append(current)
                # if one code reference is to another address than the next
                if current[0] in self.code_refs_from:
                    if not current[2] in CALL_INS and not i == len(self.instructions) - 1:
                        if any([r != self.instructions[i+1][0] for r in self.code_refs_from[current[0]]]):
                            break
                    # if we can reach a colliding address from here, the block is broken and should end.
                    reachable_collisions = self.code_refs_from[current[0]].intersection(self.colliding_addresses)
                    next_addr = current[0] + current[1]
                    is_next_addr = next_addr in reachable_collisions
                    if reachable_collisions and is_next_addr:
                        # we should remove the from/to code references for this collision as there should be no non CFG instruction references between instructions of different functions
                        self.removeCodeRef(current[0], next_addr)
                        break
                # break before an instruction with >1 inbound edge or that is itself a block start
                if not i == len(self.instructions) - 1 and self.instructions[i+1][0] in self.code_refs_to:
                    if len(self.code_refs_to[self.instructions[i+1][0]]) > 1 or self.instructions[i+1][0] in potential_starts:
                        break
                if current[2] in END_INS:
                    break
            if block:
                blocks.append(block)
        self.blocks = blocks
        return self.blocks

    def isProcessed(self, addr):
        """True if addr lies within a byte already consumed by some instruction."""
        return addr in self.processed_bytes

    def isProcessedFunction(self):
        """True if this function's entry point was already disassembled globally."""
        return self.start_addr in self.disassembly.code_map

    def isNextInstructionReachable(self):
        return self.is_next_instruction_reachable

    def setNextInstructionReachable(self, is_reachable):
        self.is_next_instruction_reachable = is_reachable

    def __str__(self):
        # compact one-line debug summary of the analysis state
        result = "0x{:x} | current: 0x{:x} | blocks: {} | queue: {} | processed: {} | crefs: {} | drefs: {} | suspicious: {} | ending: {}".format(
            self.start_addr,
            self.block_start,
            len(self.getBlocks()),
            ",".join(["0x%x" % b for b in sorted(self.block_queue)]),
            ",".join(["0x%x" % b for b in sorted(list(self.processed_blocks))]),
            len(self.code_refs),
            len(self.data_refs),
            self.suspicious_ins_count,
            self.is_sanely_ending
        )
        return result
https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/smda/cil/__init__.py -------------------------------------------------------------------------------- /smda/common/BasicBlock.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class BasicBlock(object): 4 | 5 | def __init__(self): 6 | self.start_addr = 0 7 | self.end_addr = 0 8 | self.instructions = [] 9 | self.successors = [] 10 | 11 | def __str__(self): 12 | return "0x%x - 0x%x (%d) -> [%s]" % (self.start_addr, self.end_addr, len(self.instructions), ", ".join(["0x%x" % ref for ref in self.successors])) 13 | -------------------------------------------------------------------------------- /smda/common/BinaryInfo.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import lief 4 | lief.logging.disable() 5 | 6 | 7 | class BinaryInfo(object): 8 | """ simple DTO to contain most information related to the binary/buffer to be analyzed """ 9 | 10 | architecture = "" 11 | base_addr = 0 12 | binary = b"" 13 | raw_data = b"" 14 | binary_size = 0 15 | bitness = None 16 | code_areas = [] 17 | component = "" 18 | family = "" 19 | file_path = "" 20 | is_library = False 21 | is_buffer = False 22 | sha256 = "" 23 | sha1 = "" 24 | md5 = "" 25 | version = "" 26 | exported_functions = None 27 | oep = None 28 | 29 | def __init__(self, binary): 30 | self.binary = binary 31 | self.raw_data = binary 32 | self.binary_size = len(binary) 33 | self.sha256 = hashlib.sha256(binary).hexdigest() 34 | self.sha1 = hashlib.sha1(binary).hexdigest() 35 | self.md5 = hashlib.md5(binary).hexdigest() 36 | 37 | def getOep(self): 38 | if self.oep is None: 39 | lief_result = lief.parse(self.raw_data) 40 | if isinstance(lief_result, lief.PE.Binary): 41 | self.oep = lief_result.optional_header.addressof_entrypoint 42 | elif isinstance(lief_result, lief.ELF.Binary): 43 | self.oep = lief_result.header.entrypoint 44 
| return self.oep 45 | 46 | def getExportedFunctions(self): 47 | if self.exported_functions is None: 48 | lief_result = lief.parse(self.raw_data) 49 | if isinstance(lief_result, lief.PE.Binary) or isinstance(lief_result, lief.ELF.Binary): 50 | self.exported_functions = {} 51 | for function in lief_result.exported_functions: 52 | self.exported_functions[function.address] = function.name 53 | return self.exported_functions 54 | 55 | def getSections(self): 56 | pefile = lief.parse(self.raw_data) 57 | # TODO 20201030 might want to add ELF sections as well 58 | if not isinstance(pefile, lief.PE.Binary): 59 | return 60 | if pefile and pefile.sections: 61 | for section in pefile.sections: 62 | section_start = self.base_addr + section.virtual_address 63 | section_size = section.virtual_size 64 | if section_size % 0x1000 != 0: 65 | section_size += 0x1000 - (section_size % 0x1000) 66 | section_end = section_start + section_size 67 | yield section.name, section_start, section_end 68 | 69 | def isInCodeAreas(self, address): 70 | is_inside = False 71 | # if no code areas found, assume the whole image is code and calculate according to base address and size 72 | if len(self.code_areas) == 0: 73 | if self.base_addr <= address <= self.base_addr + self.binary_size: 74 | is_inside = True 75 | else: 76 | is_inside = any([a[0] <= address < a[1] for a in self.code_areas]) 77 | return is_inside 78 | 79 | def getHeaderBytes(self): 80 | if self.raw_data: 81 | lief_result = lief.parse(self.raw_data) 82 | if isinstance(lief_result, lief.PE.Binary): 83 | return self.raw_data[:0x400] 84 | elif isinstance(lief_result, lief.ELF.Binary): 85 | return self.raw_data[:0x40] 86 | return None 87 | -------------------------------------------------------------------------------- /smda/common/BlockLocator.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import bisect 3 | 4 | 5 | class BlockLocator(): 6 | """ Class that finds a block by any 
address within. 7 | When instantiated, creates the required data structures. 8 | """ 9 | 10 | sorted_blocks_addresses = None 11 | blocks_dict = None 12 | 13 | def __init__(self, functions): 14 | # Instantiate the datastructures required : 15 | # 1. get a flat list of all the blocks in all the functions 16 | blocks = list(itertools.chain(*[f.getBlocks() for f in functions])) 17 | self.sorted_blocks_addresses = sorted(b.offset for b in blocks) 18 | 19 | # 2 a dict of blocks by addresses 20 | self.blocks_dict = {b.offset:b for b in blocks} 21 | 22 | def _get_block_end(self, block): 23 | last_ins = block.instructions[-1] 24 | return last_ins.offset + len(last_ins.bytes) // 2 # bytes is actuall a hex string 25 | 26 | def findBlockByContainedAddress(self, inner_address): 27 | # do a binary search to find the closest address to the left of inner_address 28 | block_num = bisect.bisect(self.sorted_blocks_addresses, inner_address) - 1 29 | 30 | if block_num == -1: 31 | # target address is smaller than first block. 
return none 32 | return None 33 | 34 | block_start = self.sorted_blocks_addresses[block_num] 35 | block = self.blocks_dict[block_start] 36 | block_end = self._get_block_end(block) 37 | 38 | # make sure inner_address falls within the selected block 39 | if block.offset <= inner_address < block_end: 40 | return block 41 | 42 | return None 43 | -------------------------------------------------------------------------------- /smda/common/CodeXref.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class CodeXref(object): 4 | 5 | def __init__(self, smda_ins_from, smda_ins_to): 6 | self.smda_ins_from = smda_ins_from 7 | self.smda_ins_to = smda_ins_to 8 | 9 | @property 10 | def from_function(self): 11 | return self.smda_ins_from.smda_function 12 | 13 | @property 14 | def to_function(self): 15 | return self.smda_ins_to.smda_function 16 | 17 | @property 18 | def from_instruction(self): 19 | return self.smda_ins_from 20 | 21 | @property 22 | def to_instruction(self): 23 | return self.smda_ins_to 24 | 25 | def __str__(self): 26 | return "0x%x (0x%x) -> 0x%x (0x%x)" % (self.smda_ins_from.offset, self.smda_ins_from.smda_function.offset, self.smda_ins_to.offset, self.smda_ins_to.smda_function.offset) 27 | 28 | def __repr__(self): 29 | return "".format(self.smda_ins_from.offset, self.smda_ins_to.offset) 30 | -------------------------------------------------------------------------------- /smda/common/DominatorTree.py: -------------------------------------------------------------------------------- 1 | # Implementation by Armin Rigo 2 | # source: https://bitbucket.org/arigo/arigo/src/default/hack/pypy-hack/heapstats/dominator.py 3 | 4 | # Implementation following: 5 | # 6 | # Lengauer, Thomas; and Tarjan, Robert Endre (July 1979). 7 | # "A fast algorithm for finding dominators in a flowgraph". 8 | # ACM Transactions on Programming Languages and Systems (TOPLAS) 1 (1): 9 | # 121-141. 
# Dominator computation following Lengauer & Tarjan (TOPLAS 1979),
# implementation originally by Armin Rigo.
# http://portal.acm.org/ft_gateway.cfm?id=357071

import logging

LOGGER = logging.getLogger(__name__)

class DominatorTree(object):
    """Lengauer-Tarjan dominator computation over a successor-map graph G rooted at r.

    G maps node -> list of successor nodes; after compute(), self.dom maps each
    reachable node (except the root) to its immediate dominator.
    """

    def __init__(self, G, r):
        assert r in G
        self.succ = G
        self.r = r

    def init_variables(self):
        """Reset all working structures used by the algorithm."""
        self.parent = {}
        self.pred = {}
        self.semi = {}
        self.vertex = []
        self.bucket = {}
        self.dom = {}
        self.ancestor = {}
        self.label = {}
        for v in self.succ:
            self.pred[v] = set()
            self.bucket[v] = set()

    def depth_first_search(self, v):
        """Iterative DFS: numbers vertices (semi), records DFS parents and predecessors."""
        stack = [v]
        while stack:
            v = stack.pop()
            n = len(self.vertex)
            self.semi[v] = n
            self.vertex.append(v)
            for w in self.succ[v]:
                self.pred[w].add(v)
                if w not in self.semi:
                    self.parent[w] = v
                    self.semi[w] = None  # temporarily
                    stack.append(w)

    def LINK(self, v, w):
        """Add edge (v, w) to the virtual forest used by EVAL."""
        self.ancestor[w] = v

    def EVAL(self, v):
        """Return the vertex with minimal semidominator on the forest path to v."""
        if v not in self.ancestor:
            return v
        else:
            self.COMPRESS(v)
            return self.label.get(v, v)

    def COMPRESS(self, v):
        """Path compression on the forest, propagating minimal-semi labels."""
        if self.ancestor[v] in self.ancestor:
            self.COMPRESS(self.ancestor[v])
            w = self.ancestor[v]
            if self.semi[self.label.get(w, w)] < self.semi[self.label.get(v, v)]:
                self.label[v] = self.label.get(w, w)
            self.ancestor[v] = self.ancestor[w]

    def steps_2_3(self):
        """Compute semidominators (step 2) and implicit immediate dominators (step 3)."""
        for w in self.vertex[:0:-1]:
            # step 2
            for v in self.pred[w]:
                u = self.EVAL(v)
                if self.semi[u] < self.semi[w]:
                    self.semi[w] = self.semi[u]
            self.bucket[self.vertex[self.semi[w]]].add(w)
            self.LINK(self.parent[w], w)
            # step 3
            for v in list(self.bucket[self.parent[w]]):
                self.bucket[self.parent[w]].remove(v)
                u = self.EVAL(v)
                if self.semi[u] < self.semi[v]:
                    self.dom[v] = u
                else:
                    self.dom[v] = self.parent[w]

    def step_4(self):
        """Finalize immediate dominators in DFS order."""
        for w in self.vertex[1:]:
            if self.dom[w] != self.vertex[self.semi[w]]:
                self.dom[w] = self.dom[self.dom[w]]

    def compute(self):
        """Run the full four-step Lengauer-Tarjan algorithm."""
        self.init_variables()
        self.depth_first_search(self.r)
        self.steps_2_3()
        self.step_4()


def fix_graph(graph):
    """Return a copy of graph where every referenced node also has a (possibly empty) successor list."""
    expanded_graph = {}
    for key, values in graph.items():
        expanded_graph[key] = values
        for value in values:
            if value not in expanded_graph:
                expanded_graph[value] = []
    return expanded_graph


# Calculation of Nesting Depth by walking down dominators and summarizing weights
# Implementation by Steffen Enders and Daniel Plohmann

def build_dominator_tree(G, r):
    """Build the dominator tree of G rooted at r as {dominator: [dominated, ...]}; None if r is unknown."""
    expanded_graph = fix_graph(G)
    if not r in expanded_graph:
        LOGGER.debug("r not in G: %s %s", r, G)
        return None
    domtree = DominatorTree(expanded_graph, r)
    domtree.compute()
    # invert idom mapping into child lists
    inverted = {}
    for key, value in domtree.dom.items():
        if value not in inverted:
            inverted[value] = []
        inverted[value].append(key)
    return inverted

def get_nesting_depth(graph, domtree, root):
    """Maximum count of branching ("significant") nodes along any root-to-leaf path of the dominator tree."""
    expanded_graph = fix_graph(graph)
    # nodes reached by a branching predecessor (>1 successor)
    significant_nodes = set.union(*([set(v) for v in expanded_graph.values() if len(v) > 1] + [set()]))
    def maximum_costs(cn):
        if cn not in domtree or not domtree[cn]:
            return (1 if cn in significant_nodes else 0)
        val = max(maximum_costs(n) for n in domtree[cn]) + (1 if cn in significant_nodes else 0)
        return val
    try:
        return maximum_costs(root)
    # FIX: was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt.
    # `Exception` still covers the intended failure modes (e.g. RecursionError on
    # very deep trees, TypeError when domtree is None).
    except Exception:
        return 0
10240: [10244, 10246], 10244: [10246], 10246: [10240, 10253], 10253: [10229, 10261]}, 150 | "smda_function": 10208, 151 | "fixed": {10208: [10229], 10229: [10240, 10253], 10240: [10244, 10246], 10253: [10229, 10261], 10244: [10246], 10246: [10240, 10253], 10261: []}, 152 | "dt": {10240: [10244, 10246], 10229: [10240, 10253], 10253: [10261], 10208: [10229]}, 153 | "nd": 3 154 | }, { 155 | "smda": {1: [2], 2: [3, 4, 6], 3: [5], 4: [5], 5: [2]}, 156 | "smda_function": 1, 157 | "fixed": {1: [2], 2: [3, 4, 6], 3: [5], 4: [5], 6: [], 5: [2]}, 158 | "dt": {2: [3, 4, 5, 6], 1: [2]}, 159 | "nd": 1 160 | }, { 161 | "smda": {1: [2], 2: [3, 6], 3: [41, 42], 41: [5], 42: [5], 5: [2]}, 162 | "smda_function": 1, 163 | "fixed": {1: [2], 2: [3, 6], 3: [41, 42], 6: [], 41: [5], 42: [5], 5: [2]}, 164 | "dt": {3: [41, 42, 5], 2: [3, 6], 1: [2]}, 165 | "nd": 2 166 | }, 167 | 168 | ] 169 | for data in test_data: 170 | print("*" * 80) 171 | print("Running Test Case: ", data["smda_function"]) 172 | print("*" * 80) 173 | print("smda", data["smda"]) 174 | fixed_smda = {} 175 | for key, values in data["smda"].items(): 176 | fixed_smda[key] = values 177 | for value in values: 178 | if value not in fixed_smda: 179 | fixed_smda[value] = [] 180 | print("fixed_smda", fixed_smda) 181 | assert fixed_smda == data["fixed"] 182 | dt = build_dominator_tree(data["smda"], data["smda_function"]) 183 | print("dominator tree", dt) 184 | assert dt == data["dt"] 185 | nd = get_nesting_depth(fixed_smda, dt, data["smda_function"]) 186 | print("nd", nd) 187 | assert nd == data["nd"] 188 | -------------------------------------------------------------------------------- /smda/common/SmdaBasicBlock.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import hashlib 3 | from typing import Iterator 4 | 5 | from smda.common.SmdaInstruction import SmdaInstruction 6 | 7 | 8 | class SmdaBasicBlock: 9 | 10 | smda_function = None 11 | instructions = None 12 | 
picblockhash = None 13 | opcblockhash = None 14 | offset = None 15 | length = None 16 | 17 | def __init__(self, instructions, smda_function=None): 18 | assert isinstance(instructions, list) 19 | self.smda_function = smda_function 20 | if instructions: 21 | self.instructions = instructions 22 | self.offset = instructions[0].offset 23 | self.length = len(instructions) 24 | self.picblockhash = self.getPicBlockHash() 25 | self.opcblockhash = self.getOpcBlockHash() 26 | 27 | def getInstructions(self) -> Iterator["SmdaInstruction"]: 28 | for instruction in self.instructions: 29 | yield instruction 30 | 31 | def getPicBlockHash(self): 32 | if self.picblockhash is not None: 33 | return self.picblockhash 34 | picblockhash_sequence = self.getPicBlockHashSequence() 35 | if picblockhash_sequence is not None: 36 | self.picblockhash = struct.unpack("Q", hashlib.sha256(picblockhash_sequence).digest()[:8])[0] 37 | return self.picblockhash 38 | 39 | def getPicBlockHashSequence(self): 40 | """ if we have a SmdaFunction as parent, we can try to generate the PicBlockHash ad-hoc """ 41 | # check all the prerequisites 42 | if self.smda_function and self.smda_function.smda_report and self.smda_function._escaper and self.smda_function.smda_report.base_addr is not None and self.smda_function.smda_report.binary_size: 43 | escaped_binary_seqs = [] 44 | for instruction in self.getInstructions(): 45 | escaped_binary_seqs.append(instruction.getEscapedBinary(self.smda_function._escaper, escape_intraprocedural_jumps=True, lower_addr=self.smda_function.smda_report.base_addr, upper_addr=self.smda_function.smda_report.base_addr + self.smda_function.smda_report.binary_size)) 46 | return bytes([ord(c) for c in "".join(escaped_binary_seqs)]) 47 | 48 | def getOpcBlockHash(self): 49 | if self.opcblockhash is not None: 50 | return self.opcblockhash 51 | opcblockhash_sequence = self.getOpcBlockHashSequence() 52 | if opcblockhash_sequence is not None: 53 | self.opcblockhash = struct.unpack("Q", 
hashlib.sha256(opcblockhash_sequence).digest()[:8])[0] 54 | return self.opcblockhash 55 | 56 | def getOpcBlockHashSequence(self): 57 | """ if we have a SmdaFunction as parent, we can try to generate the OpcBlockHash ad-hoc """ 58 | # check all the prerequisites 59 | if self.smda_function and self.smda_function.smda_report and self.smda_function._escaper: 60 | escaped_binary_seqs = [] 61 | for instruction in self.getInstructions(): 62 | escaped_binary_seqs.append(instruction.getEscapedToOpcodeOnly(self.smda_function._escaper)) 63 | return bytes([ord(c) for c in "".join(escaped_binary_seqs)]) 64 | 65 | def getPredecessors(self): 66 | predecessors = [] 67 | if self.smda_function is not None: 68 | for frm, to in self.smda_function.blockrefs.items(): 69 | if self.offset in to: 70 | predecessors.append(frm) 71 | return predecessors 72 | 73 | def getSuccessors(self): 74 | successors = [] 75 | if self.smda_function is not None: 76 | if self.offset in self.smda_function.blockrefs: 77 | successors.extend(self.smda_function.blockrefs[self.offset]) 78 | return successors 79 | 80 | @classmethod 81 | def fromDict(cls, block_dict, smda_function=None) -> "SmdaBasicBlock": 82 | smda_block = cls(None) 83 | smda_block.smda_function = smda_function 84 | smda_block.instructions = [SmdaInstruction.fromDict(d, smda_function=smda_function) for d in block_dict] 85 | return smda_block 86 | 87 | def toDict(self) -> dict: 88 | return [smda_ins.toDict() for smda_ins in self.instructions] 89 | 90 | def __int__(self): 91 | return self.offset 92 | 93 | def __str__(self): 94 | return "0x{:08x}: ({:>4})".format(self.offset, self.length) 95 | -------------------------------------------------------------------------------- /smda/common/SmdaInstruction.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import logging 3 | 4 | from capstone.x86 import X86_OP_IMM, X86_OP_MEM 5 | 6 | from smda.intel.IntelInstructionEscaper import 
class SmdaInstruction:
    """A single disassembled instruction as stored in SMDA reports:
    (offset, hex byte string, mnemonic, operand string)."""

    smda_function = None
    offset = None
    bytes = None        # hex string of the instruction bytes (not Python bytes)
    mnemonic = None
    operands = None
    detailed = None     # lazily populated capstone instruction, see getDetailed()

    def __init__(self, ins_list=None, smda_function=None):
        self.smda_function = smda_function
        if ins_list is not None:
            self.offset = ins_list[0]
            self.bytes = ins_list[1]
            self.mnemonic = ins_list[2]
            self.operands = ins_list[3]

    def getDataRefs(self):
        """Yield addresses referenced as data (imm/mem operands that fall
        within the mapped memory image); control-flow instructions are skipped."""
        if self.getMnemonicGroup(IntelInstructionEscaper) != "C":
            detailed = self.getDetailed()
            if len(detailed.operands) > 0:
                for i in detailed.operands:
                    value = None
                    if i.type == X86_OP_IMM:
                        value = i.imm
                    if i.type == X86_OP_MEM:
                        value = i.mem.disp
                        if detailed.reg_name(i.mem.base) == "rip":
                            # RIP-relative: displacement is based on the address
                            # of the following instruction
                            value += detailed.address + detailed.size
                    if value is not None and self.smda_function.smda_report.isAddrWithinMemoryImage(value):
                        yield value

    def getDetailed(self):
        """Return the capstone instruction with details, disassembling lazily
        on first access."""
        if self.detailed is None:
            capstone = self.smda_function.smda_report.getCapstone()
            with_details = [i for i in capstone.disasm(bytes.fromhex(self.bytes), self.offset)]
            # TODO this may diverge on instructions like 9bd93c24:
            #   1 wait / 3 fnstcw word ptr [esp]
            # which is split by capstone but treated as one / prefix by IDA
            # https://fragglet.github.io/dos-help-files/alang.hlp/FLDCW.html
            # FSTCW has wait and no-wait versions; the assembler emits WAIT
            # before the wait version and NOP before the no-wait version.
            if len(with_details) > 1:
                # BUGFIX: Logger.warn() is a deprecated alias (removed in
                # Python 3.13) - use Logger.warning()
                LOGGER.warning(f"Sequence {self.bytes} disassembles to {len(with_details)} instructions but expected one - taking the last instruction only!")
                self.detailed = with_details[-1]
            else:
                assert len(with_details) == 1
                self.detailed = with_details[0]
        return self.detailed

    def getMnemonicGroup(self, escaper):
        """Return the escaped mnemonic group, or the raw hex bytes without an escaper."""
        if escaper:
            return escaper.escapeMnemonic(self.mnemonic)
        return self.bytes

    def getEscapedOperands(self, escaper):
        if escaper:
            return escaper.escapeOperands(self)
        return self.bytes

    def getMaskedOperands(self, escaper):
        if escaper:
            return escaper.escapeOperands(self, offsets_only=True)
        return self.bytes

    def getEscapedToOpcodeOnly(self, escaper):
        if escaper:
            return escaper.escapeToOpcodeOnly(self)
        return self.bytes

    def getEscapedBinary(self, escaper, escape_intraprocedural_jumps=False, lower_addr=None, upper_addr=None):
        if escaper:
            return escaper.escapeBinary(self, escape_intraprocedural_jumps=escape_intraprocedural_jumps, lower_addr=lower_addr, upper_addr=upper_addr)
        return self.bytes

    @classmethod
    def fromDict(cls, instruction_dict, smda_function=None) -> Optional["SmdaInstruction"]:
        """Deserialize from [offset, hex_bytes, mnemonic, operands]."""
        smda_instruction = cls(None)
        smda_instruction.smda_function = smda_function
        smda_instruction.offset = instruction_dict[0]
        smda_instruction.bytes = instruction_dict[1]
        smda_instruction.mnemonic = instruction_dict[2]
        smda_instruction.operands = instruction_dict[3]
        return smda_instruction

    def toDict(self) -> list:
        """Serialize to [offset, hex_bytes, mnemonic, operands].

        BUGFIX (annotation): this returns a list, not a dict as previously annotated.
        """
        return [self.offset, self.bytes, self.mnemonic, self.operands]

    def __int__(self):
        return self.offset

    def __str__(self):
        return "0x{:08x}: ({:>14s}) - {} {}".format(self.offset, self.bytes, self.mnemonic, self.operands)
class TailcallAnalyzer(object):
    """Collects jumps observed during disassembly and identifies jumps that
    originate outside a function but land inside it (tailcall candidates)."""

    def __init__(self):
        # committed jumps: source address -> set of destination addresses
        self.__jumps = defaultdict(set)
        # jumps collected for the function currently being analyzed
        self.__tmp_jumps = defaultdict(list)
        # all finalized function analysis states
        self.__functions = list()

    def initFunction(self):
        """Reset the per-function jump collection."""
        self.__tmp_jumps = defaultdict(list)

    def addJump(self, source, destination):
        """Record a jump at <source> targeting <destination> for the current function."""
        self.__tmp_jumps[source].append(destination)

    def finalizeFunction(self, function_state):
        """Commit the temporary jumps and remember the function state."""
        for source, destinations in self.__tmp_jumps.items():
            self.__jumps[source].update(destinations)
        self.__tmp_jumps.clear()
        self.__functions.append(function_state)

    def getTailcalls(self):
        """Return dicts describing jumps from outside a function into its body
        (excluding jumps to its entry point)."""
        tailcalls = list()
        # all (source, destination) edges, ordered by (destination, source)
        edge_list = sorted(((src, dst) for src in self.__jumps for dst in self.__jumps[src]), key=itemgetter(1, 0))
        edge_destinations = [dst for _, dst in edge_list]
        for function in self.__functions:
            intervals = self.__getFunctionIntervals(function)
            if not intervals:
                # function without instructions - nothing to check
                continue
            lowest = min(start for start, _ in intervals)
            highest = max(end for _, end in intervals)
            # binary search: only edges targeting [lowest, highest] are relevant
            window = edge_list[bisect.bisect_left(edge_destinations, lowest):bisect.bisect_right(edge_destinations, highest)]
            for src, dst in window:
                lands_inside = any(start <= dst <= end for start, end in intervals)
                comes_from_outside = all(src < start or src > end for start, end in intervals)
                if dst != function.start_addr and lands_inside and comes_from_outside:
                    tailcalls.append({
                        "source_addr": src,
                        "destination_addr": dst,
                        "destination_function": function.start_addr
                    })
        return tailcalls

    def __getFunctionIntervals(self, function_state):
        """Derive (first_addr, last_addr) intervals of gap-free instruction runs."""
        intervals = list()
        instructions = sorted(function_state.instructions, key=itemgetter(0))
        first_instruction = instructions[0] if instructions else None
        last_instruction = first_instruction
        for instruction in instructions:
            # a gap after the previous instruction closes the current interval
            if instruction[0] > last_instruction[0] + last_instruction[1]:
                intervals.append((first_instruction[0], last_instruction[0]))
                first_instruction = instruction
            last_instruction = instruction
        if last_instruction:
            intervals.append((first_instruction[0], last_instruction[0]))
        return intervals

    def __getFunctionByStartAddr(self, start_addr):
        """Return the finalized function state starting at <start_addr>, or None."""
        return next((function for function in self.__functions if function.start_addr == start_addr), None)

    def __printIntervals(self, intervals):
        """Debug helper: dump the given intervals to stdout."""
        if len(intervals) < 25:
            for one, two in intervals:
                print(" 0x{:x} -> 0x{:x}".format(one, two))
        else:
            print("Function has too many intervals to display")
def resolveTailcalls(self, disassembler, verbose=False):
    """Re-run function analysis for all detected tailcall destinations.

    For every tailcall, the (broken) destination function is reverted, the
    tailcall target is analyzed as a function of its own, and the former
    parent is re-analyzed if the target did not absorb it.
    Returns the sorted list of newly created function start addresses.
    """
    newly_created_functions = set([])
    for tailcall in self.getTailcalls():
        if verbose:
            print("Processing tailcall:\n{}".format(json.dumps(tailcall, indent=2, sort_keys=True)))
        # remove the information from the function-analysis state of the disassembly
        function = self.__getFunctionByStartAddr(tailcall["destination_function"])
        if not function or function.is_tailcall_function:
            disassembler.analyzeFunction(tailcall["destination_function"])
            continue
        self.__functions.remove(function)
        if function:
            if verbose:
                print("Old function:")
                self.__printIntervals(self.__getFunctionIntervals(function))
            function.revertAnalysis()
        # analyze the tailcall destination as function
        disassembler.analyzeFunction(tailcall["destination_addr"])
        newly_created_functions.add(tailcall["destination_addr"])
        function = self.__getFunctionByStartAddr(tailcall["destination_addr"])
        if function and not tailcall["destination_function"] in function.instruction_start_bytes:
            # analyze the (previously) broken function a second time
            try:
                disassembler.analyzeFunction(tailcall["destination_function"])
                function = self.__getFunctionByStartAddr(tailcall["destination_function"])
                function.is_tailcall_function = True
            except Exception:
                # BUGFIX: narrowed former bare except; second-pass analysis
                # remains best-effort
                pass
        elif verbose:
            print("**** 0x{:x} IS NOW PART OF 0x{:x}".format(tailcall["destination_function"], tailcall["destination_addr"]))
        if verbose:
            function = self.__getFunctionByStartAddr(tailcall["destination_function"])
            new_function = self.__getFunctionByStartAddr(tailcall["destination_addr"])
            print("New function:")
            if new_function:
                self.__printIntervals(self.__getFunctionIntervals(new_function))
            print("Re-disassembled old function:")
            if function:
                self.__printIntervals(self.__getFunctionIntervals(function))
    return sorted(list(newly_created_functions))
class Tarjan(object):
    """Iterative (non-recursive) Tarjan SCC computation.
    The graph is a dictionary mapping each node to its successor collection."""

    def __init__(self, graph):
        self._graph = graph
        self._stack = []                 # current DFS stack of nodes
        self._stack_set = set([])        # membership mirror of _stack
        self._index = {}                 # node -> discovery index
        self._lowlink = {}               # node -> lowest reachable index
        self._nonrecursive_stack = []    # explicit work stack replacing recursion
        self._result = []                # accumulated list of SCCs

    def _tarjan_head(self, v):
        """Start visiting node v: assign index/lowlink and schedule its successors."""
        self._index[v] = len(self._index)
        self._lowlink[v] = self._index[v]
        self._stack.append(v)
        self._stack_set.add(v)
        successor_iter = iter(self._graph.get(v, ()))
        self._nonrecursive_stack.append((successor_iter, False, v, None))

    def _tarjan_body(self, successor_iter, v):
        """Continue processing v's successors; emit an SCC when v is a root."""
        for w in successor_iter:
            if w not in self._index:
                # simulate recursion into w; resume v afterwards
                self._nonrecursive_stack.append((successor_iter, True, v, w))
                self._tarjan_head(w)
                return
            if w in self._stack_set:
                self._lowlink[v] = min(self._lowlink[v], self._index[w])
        if self._lowlink[v] == self._index[v]:
            # v is an SCC root: pop the component off the stack
            scc = []
            w = None
            while v != w:
                w = self._stack.pop()
                scc.append(w)
                self._stack_set.remove(w)
            self._result.append(scc)

    def calculateScc(self):
        """Compute and return the list of strongly connected components."""
        for node in self._graph:
            if node not in self._index:
                self._tarjan_head(node)
            while self._nonrecursive_stack:
                successor_iter, resuming, v, w = self._nonrecursive_stack.pop()
                if resuming:
                    # returning from simulated recursion into w
                    self._lowlink[v] = min(self._lowlink[w], self._lowlink[v])
                self._tarjan_body(successor_iter, v)
        return self._result

    def closure(self):
        """ Given a graph @g, returns the transitive closure of @g """
        ret = {}
        for scc in self.calculateScc():
            direct_successors = set()
            reachable = set()
            for v in scc:
                direct_successors.update(self._graph[v])
            for w in direct_successors:
                # SCCs are emitted in reverse topological order, so every
                # successor is either already resolved or inside this SCC
                assert w in ret or w in scc
                reachable.add(w)
                reachable.update(ret.get(w, ()))
            if len(scc) > 1:
                reachable.update(scc)
            frozen = tuple(reachable)
            for v in scc:
                ret[v] = frozen
        return ret

    def getResult(self):
        """Return the SCCs computed so far."""
        return self._result
class CilSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for CIL/DOTNET symbols """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._addr_to_func_symbols = {}
        # reverse map: func_name -> addr
        self._func_symbol_to_addr = {}

    def isSymbolProvider(self):
        return True

    def decodeSymbolName(self, value):
        """ ensure a proper utf-8 escaped string """
        return value.encode("utf-8").decode("utf-8")

    def update(self, binary_info):
        """Parse all MethodDef rows of the .NET metadata into symbol maps."""
        pe = dnfile.dnPE(data=binary_info.raw_data)
        for row in pe.net.mdtables.MethodDef:
            # translate the method's RVA into a file offset
            addr = pe.get_offset_from_rva(row.Rva)
            func_name = self.decodeSymbolName(row.Name.value)
            self._addr_to_func_symbols[addr] = func_name
            self._func_symbol_to_addr[func_name] = addr

    def getSymbol(self, address):
        """Return the function name at <address>, or "" if unknown."""
        return self._addr_to_func_symbols.get(address, "")

    def getAddress(self, func_symbol):
        """Return the address of <func_symbol>, or None if unknown."""
        return self._func_symbol_to_addr.get(func_symbol, None)

    def getFunctionSymbols(self):
        """Return the full addr -> name mapping."""
        return self._addr_to_func_symbols
class DelphiKbSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for Delphi knowledge base files """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}
        # file offset -> 0 for every relocation marker found in function dumps
        self._relocations = {}

    def update(self, binary_info):
        binary = binary_info.binary
        if DelphiKbFileLoader.isCompatible(binary):
            self._func_symbols = self.parseKbBuffer(binary, binary_info.base_addr)

    def isSymbolProvider(self):
        return True

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols

    def getRelocations(self):
        return self._relocations

    def parseKbBuffer(self, binary, base_addr):
        """Parse a Delphi KB buffer, returning {addr: function_name}.

        Layout (as implemented here): the last 4 bytes point to a table area
        containing a module table followed by several sub-tables and a
        function table; each function entry carries name, type info, a code
        dump and a relocation mask of equal size.
        """
        result = {}
        fh = BytesIO(binary)
        fh.seek(-4, os.SEEK_END)
        fh.seek(int.from_bytes(fh.read(4), byteorder="little"))
        # process modules
        len_mod_data_table = int.from_bytes(fh.read(4), byteorder="little")
        fh.read(4)
        modules = {}
        for i in range(len_mod_data_table):
            offset = int.from_bytes(fh.read(4), byteorder="little")
            size = int.from_bytes(fh.read(4), byteorder="little")
            modId = int.from_bytes(fh.read(4), byteorder="little")
            namID = int.from_bytes(fh.read(4), byteorder="little")
            modules[modId] = {}
            modules[modId]['offset'] = offset
            modules[modId]['size'] = size
            modules[modId]['namID'] = namID
        temp_off = fh.tell()
        for modID in modules:
            fh.seek(modules[modID]['offset'])
            if modID != int.from_bytes(fh.read(2), byteorder="little"):
                # IMPROVEMENT: report through the module logger instead of print()
                LOGGER.warning("ModID doesnt match %s", str(modules[modID]['offset']))
            len_name = int.from_bytes(fh.read(2), byteorder="little")
            modules[modID]['name'] = fh.read(len_name).decode()
            modules[modID]['functions'] = []
        fh.seek(temp_off)
        # process functions and their code; skip 4 leading sub-tables of
        # 16-byte records each
        for i in range(4):
            fh.seek(int.from_bytes(fh.read(4), byteorder="little") * 16 + fh.tell() + 4)
        len_fun_data_table = int.from_bytes(fh.read(4), byteorder="little")
        fh.read(4)
        for i in range(len_fun_data_table):
            offset = int.from_bytes(fh.read(4), byteorder="little")
            temp_off = fh.tell()
            fh.seek(offset)
            function_info = {}
            function_info['modId'] = int.from_bytes(fh.read(2), byteorder="little")
            len_name = int.from_bytes(fh.read(2), byteorder="little")
            function_info['name'] = fh.read(len_name).decode()
            fh.read(9)
            len_type = int.from_bytes(fh.read(2), byteorder="little")
            # FIX: renamed former local `type` which shadowed the builtin;
            # the field is read only to advance the stream position
            type_info = fh.read(len_type).decode()
            fh.read(5)
            function_info['dump_size'] = int.from_bytes(fh.read(4), byteorder="little")
            fh.read(4)
            function_code_start_offset = fh.tell()
            result[base_addr + function_code_start_offset] = function_info['name']
            function_info['dump'] = list(fh.read(function_info['dump_size']))
            # relocations mark both call but also data ref offsets
            function_info['reloc'] = fh.read(function_info['dump_size'])
            for match in re.finditer(b"\xFF\xFF\xFF\xFF", function_info['reloc']):
                self._relocations[function_code_start_offset + match.start()] = 0
            modules[function_info['modId']]['functions'].append(function_info)
            fh.seek(temp_off + 12)
        return result
class ElfApiResolver(AbstractLabelProvider):
    """ Minimal ELF API reference resolver, extracting APIs from ELF imports """

    def __init__(self, config):
        # backend -> {address: (lib_or_version, api_name)}
        self._api_map = {
            "lief": {}
        }

    def update(self, binary_info):
        """Collect imported function symbols from the ELF relocation table."""
        if binary_info.is_buffer:
            # cannot reconstruct from shellcode/memory dump at this time
            return
        lief_binary = lief.parse(binary_info.raw_data)
        if not isinstance(lief_binary, lief.ELF.Binary):
            return
        for relocation in lief_binary.relocations:
            if not relocation.has_symbol:
                # doesn't have a name, we won't care about it
                continue
            symbol = relocation.symbol
            if not symbol.imported:
                # only interested in APIs from external sources
                continue
            if not symbol.is_function:
                # only interested in APIs (which are functions)
                continue
            # we can't really say what library the symbol came from, but the
            # GNU version info (e.g. "GLIBC_2.2.5") serves as metadata
            lib = None
            if symbol.has_version and symbol.symbol_version.has_auxiliary_version:
                lib = symbol.symbol_version.symbol_version_auxiliary.name
            self._api_map["lief"][relocation.address] = (lib, symbol.name)

    def isApiProvider(self):
        """Returns whether the get_api(..) function of the AbstractLabelProvider is functional"""
        return True

    def getApi(self, to_addr, absolute_addr):
        """
        If the LabelProvider has any information about a used API for the given address, return (dll, api), else return (None, None).

        May return None for the `dll` if it cannot be determined.
        When it can be determined for ELF files, the `dll` field should be interpreted as the API version rather than shared library name.
        For example: "GLIBC_2.2.5".
        """
        return self._api_map["lief"].get(to_addr, (None, None))
class ElfSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for ELF symbols """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}

    def isSymbolProvider(self):
        return True

    def _parseOep(self, lief_result):
        # register the entry point as a pseudo-symbol
        if lief_result:
            self._func_symbols[lief_result.header.entrypoint] = "original_entry_point"

    def update(self, binary_info):
        """Extract function symbols from an ELF given as file path or raw buffer."""
        self._func_symbols = {}
        data = b""
        if binary_info.file_path:
            with open(binary_info.file_path, "rb") as fin:
                data = fin.read()
            # BUGFIX: a stray `return` here aborted update() right after
            # reading the file, so symbols were never parsed for file-backed
            # binaries.
        elif binary_info.raw_data:
            data = binary_info.raw_data
        else:
            return
        if data[:4] != b"\x7FELF" or lief is None:
            return
        lief_binary = lief.parse(data)
        self._parseOep(lief_binary)
        # TODO split resolution into API/dynamic part and local symbols
        self._parseExports(lief_binary)
        self._parseSymbols(lief_binary.symtab_symbols)
        self._parseSymbols(lief_binary.dynamic_symbols)
        for reloc in lief_binary.relocations:
            if reloc.has_symbol:
                self._func_symbols[reloc.address] = reloc.symbol.name

    def _parseExports(self, binary):
        for function in binary.exported_functions:
            self._func_symbols[function.address] = function.name

    def _parseSymbols(self, symbols):
        for symbol in symbols:
            if symbol.is_function:
                if symbol.value != 0:
                    func_name = ""
                    try:
                        func_name = symbol.demangled_name
                    except Exception:
                        # BUGFIX: narrowed former bare except; fall back to
                        # the mangled name
                        func_name = symbol.name
                    self._func_symbols[symbol.value] = func_name

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols
class GoSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for Go symbols (parsed from the pclntab) """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}

    def update(self, binary_info):
        """Locate the pclntab (via LIEF section data, falling back to a byte
        scan for its header) and parse function symbols from it."""
        binary = binary_info.binary
        pclntab_offset = None
        try:
            lief_binary = lief.parse(binary)
            if lief_binary.format == lief.EXE_FORMATS.ELF:
                pclntab_offset = lief_binary.get_section(".gopclntab").offset
            elif lief_binary.format == lief.EXE_FORMATS.MACHO:
                pclntab_offset = lief_binary.get_section("__gopclntab").offset
            elif lief_binary.format == lief.EXE_FORMATS.PE:
                rdata_offset = lief_binary.get_section(".rdata").offset
                pclntab_offset = rdata_offset + lief_binary.get_symbol("runtime.pclntab").value
        except Exception:
            # BUGFIX: narrowed former bare except; missing sections/symbols
            # simply trigger the scan fallback below
            pass
        if pclntab_offset is None:
            # scan for offset of structure (magic + pad + quantum + ptrsize)
            pclntab_regex = re.compile(b".\xFF\xFF\xFF\x00\x00\x01(\x04|\x08)")
            hits = [match.start() for match in re.finditer(pclntab_regex, binary)]
            if len(hits) > 1:
                # BUGFIX: use the module-level LOGGER instead of the root logger
                LOGGER.error("GoLabelProvider found too many candidates for pclntab")
            elif len(hits) == 1:
                pclntab_offset = hits[0]
        # if we found a valid offset, do the pclntab parsing
        if pclntab_offset:
            try:
                result = self._parse_pclntab(pclntab_offset, binary)
                if result:
                    self._func_symbols = result
            except Exception:
                # parsing is best-effort; keep whatever we had
                return

    def isSymbolProvider(self):
        return True

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols

    def _readUtf8(self, buffer):
        """Read a NUL-terminated UTF-8 string from <buffer>."""
        string_read = ""
        offset = 0
        while buffer[offset] != 0:
            string_read += f"{buffer[offset]:02x}"
            offset += 1
        # need to defang special char(s): Go uses the middle dot in symbol names
        decoded_string = bytearray.fromhex(string_read).decode().replace('\u00b7', ':')
        return decoded_string

    def _parse_pclntab(self, pclntab_offset, binary):
        """Parse the Go pclntab at <pclntab_offset>, returning {addr: name}.

        Supports the header layouts of Go 1.12, 1.16, 1.18 and 1.20.
        """
        pclntab_buffer = binary[pclntab_offset:]

        marker = struct.unpack("I", pclntab_buffer[0:4])[0]
        if marker == 0xfffffffb:
            version = '1.12'
        elif marker == 0xfffffffa:
            version = '1.16'
        elif marker == 0xfffffff0:
            version = '1.18'
        elif marker == 0xfffffff1:
            version = '1.20'
        else:
            # BUGFIX: format marker as hex - message previously printed the
            # decimal value behind a "0x" prefix
            raise ValueError(f"Could not recognize Golang version marker: 0x{marker:x}")

        bitness_indicator = struct.unpack("B", pclntab_buffer[7:8])[0]
        bitness = None
        if bitness_indicator == 8:
            bitness = 64
        elif bitness_indicator == 4:
            bitness = 32
        else:
            # BUGFIX: hex formatting, see above
            raise ValueError(f"Could not recognize Golang bitness marker: 0x{bitness_indicator:x}")

        field_size = 8 if bitness == 64 else 4
        field_indicator = "Q" if bitness == 64 else "I"
        if version == '1.12':
            number_of_functions = struct.unpack("I", pclntab_buffer[8:12])[0]
            function_name_offset = pclntab_offset
            weird_table_offset = pclntab_offset + 16 if bitness == 64 else pclntab_offset + 12
            start_text = 0
        elif version == '1.16':
            parsed_pclntab_fields = struct.unpack(7*field_indicator, pclntab_buffer[8:8+7*field_size])
            number_of_functions = parsed_pclntab_fields[0]
            function_name_offset = pclntab_offset + parsed_pclntab_fields[2]
            file_name_offset = pclntab_offset + parsed_pclntab_fields[3]
            weird_table_offset = pclntab_offset + parsed_pclntab_fields[6]
            start_text = 0
        elif version == '1.18' or version == '1.20':
            parsed_pclntab_fields = struct.unpack(8*field_indicator, pclntab_buffer[8:8+8*field_size])
            number_of_functions = parsed_pclntab_fields[0]
            start_text = parsed_pclntab_fields[2]
            function_name_offset = pclntab_offset + parsed_pclntab_fields[3]
            file_name_offset = pclntab_offset + parsed_pclntab_fields[5]
            weird_table_offset = pclntab_offset + parsed_pclntab_fields[7]

        # first parse function offsets
        offsets = OrderedDict()
        func_info_offsets = {}
        read_offset = 0
        table_buffer = binary[weird_table_offset:]
        for index in range(number_of_functions):
            # need to parse a second table in this case
            if version == '1.12':
                offsets[index] = struct.unpack(field_indicator, table_buffer[read_offset:read_offset+field_size])[0]
                read_offset += field_size
                func_info_offsets[index] = struct.unpack(field_indicator, table_buffer[read_offset:read_offset+field_size])[0]
                read_offset += field_size
            # advance element pointer
            if version == '1.16':
                offsets[index] = struct.unpack(field_indicator, table_buffer[read_offset:read_offset+field_size])[0]
                read_offset += 2 * field_size
            # here we have a more compact structure for both x86/x64, no need to skip
            if version == '1.18' or version == '1.20':
                offsets[index] = struct.unpack("I", table_buffer[read_offset:read_offset+4])[0]
                read_offset += 8

        functions = {}
        offsets2 = offsets.copy()
        function_name_buffer = binary[function_name_offset:]
        if version == '1.12':
            for index, info_offset in func_info_offsets.items():
                function_offset = offsets[index]
                name_offset = struct.unpack(field_indicator, pclntab_buffer[info_offset+field_size:info_offset+2*field_size])[0]
                # only take lower 32bit in case of 64bit binaries.
                name_offset &= 0xFFFFFFFF
                function_name = self._readUtf8(function_name_buffer[name_offset:])
                functions[function_offset + start_text] = function_name
        else:
            # NOTE(review): this loop re-synchronizes read_offset onto each
            # function_offset and appears to drop entries it cannot match
            # (via offsets2) - behavior kept as-is, TODO confirm intent
            delete = False
            for offset, function_offset in offsets.items():
                if delete:
                    offsets2.pop(offset)
                bytes_read = struct.unpack("I", table_buffer[read_offset:read_offset+4])[0]
                read_offset += 4
                try:
                    while bytes_read != function_offset:
                        bytes_read = struct.unpack("I", table_buffer[read_offset:read_offset+4])[0]
                        read_offset += 4
                except ValueError:
                    delete = True
                    offsets2.pop(offset)
                    continue
                if version == '1.16' and bitness == 64:
                    read_offset += 4
                name_offset = struct.unpack('I', table_buffer[read_offset:read_offset+4])[0]
                function_name = self._readUtf8(function_name_buffer[name_offset:])
                read_offset += 4
                functions[function_offset + start_text] = function_name
        return functions
class OrdinalHelper(object):
    """Static mapping of well-known DLL export ordinals to API names."""
    # TODO POC implementation, extend list. ole32.dll and mfc42.dll are candidates here
    ORDINALS = {
        "ws2_32.dll": {
            1: "accept",
            2: "bind",
            3: "closesocket",
            4: "connect",
            97: "freeaddrinfo",
            98: "getaddrinfo",
            99: "getnameinfo",
            51: "gethostbyaddr",
            52: "gethostbyname",
            53: "getprotobyname",
            54: "getprotobynumber",
            55: "getservbyname",
            56: "getservbyport",
            57: "gethostname",
            5: "getpeername",
            6: "getsockname",
            7: "getsockopt",
            8: "htonl",
            9: "htons",
            10: "ioctlsocket",
            11: "inet_addr",
            12: "inet_ntoa",
            13: "listen",
            14: "ntohl",
            15: "ntohs",
            16: "recv",
            17: "recvfrom",
            18: "select",
            19: "send",
            20: "sendto",
            21: "setsockopt",
            22: "shutdown",
            23: "socket"
        }
    }

    @staticmethod
    def resolveOrdinal(dll_name, ordinal):
        """Return the API name for (dll_name, ordinal); "" when unknown.
        The DLL name is matched case-insensitively."""
        return OrdinalHelper.ORDINALS.get(dll_name.lower(), {}).get(ordinal, "")
class DummyOmap(object):
    """Identity stand-in used when the PDB carries no OMAP remapping stream."""

    def remap(self, addr):
        return addr


class PdbSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for PDB symbols """

    def __init__(self, config):
        self._config = config
        self._base_addr = 0
        # maps absolute function address -> symbol name
        self._func_symbols = {}

    def isSymbolProvider(self):
        """This provider yields function symbols."""
        return True

    def _parseOep(self, data):
        # annotate the original entry point if a PE header exposes one
        oep_rva = PeFileLoader.getOEP(data)
        if oep_rva:
            self._func_symbols[self._base_addr + oep_rva] = "original_entry_point"

    def update(self, binary_info):
        """Extract symbols if binary_info.file_path points to a PDB file."""
        self._base_addr = binary_info.base_addr
        if not binary_info.file_path:
            return
        with open(binary_info.file_path, "rb") as fin:
            header = fin.read(16)
        self._parseOep(header)
        # PDB files start with this magic; anything else is not parseable here
        if header[:15] != b"Microsoft C/C++" or pdbparse is None:
            return
        try:
            parsed_pdb = pdbparse.parse(binary_info.file_path)
            self._parseSymbols(parsed_pdb)
        except Exception as exc:
            LOGGER.error("Failed parsing \"%s\" with exception type: %s", binary_info.file_path, type(exc))

    def _parseSymbols(self, pdb):
        """Walk the global symbol stream and record demangled function symbols."""
        try:
            sects = pdb.STREAM_SECT_HDR_ORIG.sections
            omap = pdb.STREAM_OMAP_FROM_SRC
        except AttributeError:
            # no OMAP streams present -> use plain headers with identity remapping
            sects = pdb.STREAM_SECT_HDR.sections
            omap = DummyOmap()
        for sym in pdb.STREAM_GSYM.globals:
            try:
                if len(sects) < sym.segment:
                    continue
                virt_base = sects[sym.segment - 1].VirtualAddress
                function_address = self._base_addr + omap.remap(sym.offset + virt_base)
                demangled_name = undname(sym.name)
                # symtype 2 marks function symbols
                if sym.symtype == 2:
                    self._func_symbols[function_address] = demangled_name
            except AttributeError:
                pass

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols
class PeSymbolProvider(AbstractLabelProvider):
    """ Minimal resolver for PE symbols """

    def __init__(self, config):
        self._config = config
        # addr:func_name
        self._func_symbols = {}

    def isSymbolProvider(self):
        return True

    def _parseOep(self, lief_result):
        # annotate the entry point reported by LIEF, if parsing succeeded
        if lief_result:
            self._func_symbols[lief_result.entrypoint] = "original_entry_point"

    def update(self, binary_info):
        """Extract OEP, exports and function symbols from a PE file via LIEF."""
        self._func_symbols = {}
        if not binary_info.file_path:
            return
        with open(binary_info.file_path, "rb") as fin:
            data = fin.read(16)
        if data[:2] != b"MZ" or lief is None:
            return
        lief_binary = lief.parse(binary_info.file_path)
        if lief_binary is not None:
            self._parseOep(lief_binary)
            self._parseExports(lief_binary)
            self._parseSymbols(lief_binary)

    def _parseExports(self, binary):
        """Record all printable-named exported functions at their virtual addresses."""
        for function in binary.exported_functions:
            function_name = ""
            try:
                # here may occur a LIEF exception that we want to skip ->
                # UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position 0-3: code point not in range(0x110000)
                function_name = function.name
            except:
                pass
            # FIX: the original tested `c in range(0x20, 0x7f)` on single-character
            # strings, which is always False (int range never contains str), so no
            # export was ever recorded; use ord(c) as _parseSymbols already does.
            if function_name and all(ord(c) in range(0x20, 0x7f) for c in function_name):
                self._func_symbols[binary.imagebase + function.address] = function_name

    def _parseSymbols(self, lief_binary):
        """Record FUNCTION-typed symbols, rebased onto the first code section."""
        # find VA of first code section (0x20000000 = IMAGE_SCN_MEM_EXECUTE)
        code_base_address = None
        for section in lief_binary.sections:
            if section.characteristics & 0x20000000:
                code_base_address = lief_binary.imagebase + section.virtual_address
                break
        if code_base_address is None:
            return
        for symbol in lief_binary.symbols:
            if hasattr(symbol.complex_type, "name") and symbol.complex_type.name == "FUNCTION":
                function_name = ""
                try:
                    # here may occur a LIEF exception that we want to skip ->
                    # UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position 0-3: code point not in range(0x110000)
                    function_name = symbol.name
                except:
                    pass
                if function_name and all(ord(c) in range(0x20, 0x7f) for c in function_name):
                    # for some reason, we need to add the section_offset of .text here
                    function_offset = code_base_address + symbol.value
                    if function_offset not in self._func_symbols:
                        self._func_symbols[function_offset] = function_name

    def getSymbol(self, address):
        return self._func_symbols.get(address, "")

    def getFunctionSymbols(self):
        return self._func_symbols
class WinApiResolver(AbstractLabelProvider):
    """ Minimal WinAPI reference resolver, extracted from ApiScout """

    def __init__(self, config):
        self._config = config
        self._has_64bit = False
        # two data sources: per-OS ApiScout DB maps and the LIEF-parsed import table
        self._api_map = {
            "lief": {}
        }
        self._os_name = None
        self._is_buffer = False
        for os_name, db_filepath in self._config.API_COLLECTION_FILES.items():
            self._loadDbFile(os_name, db_filepath)
            self._os_name = os_name

    def update(self, binary_info):
        """When analyzing a file (not a dump), build an IAT address map via LIEF."""
        self._is_buffer = binary_info.is_buffer
        if self._is_buffer:
            return
        lief_binary = lief.parse(binary_info.raw_data)
        if not isinstance(lief_binary, lief.PE.Binary):
            return
        iat_map = self._api_map["lief"]
        for imported_library in lief_binary.imports:
            library_name = imported_library.name.lower()
            for func in imported_library.entries:
                if func.name:
                    iat_map[func.iat_address + binary_info.base_addr] = (library_name, func.name)
                elif func.is_ordinal:
                    resolved_ordinal = OrdinalHelper.resolveOrdinal(library_name, func.ordinal)
                    ordinal_name = resolved_ordinal if resolved_ordinal else "#%s" % func.ordinal
                    iat_map[func.iat_address + binary_info.base_addr] = (library_name, ordinal_name)

    def setOsName(self, os_name):
        self._os_name = os_name

    def _loadDbFile(self, os_name, db_filepath):
        """Load an ApiScout collection file into an address -> (dll, api) map."""
        if not os.path.isfile(db_filepath):
            LOGGER.error("Can't find ApiScout collection file: \"%s\" -- continuing without ApiResolver.", db_filepath)
            return
        with open(db_filepath, "r") as f_json:
            api_db = json.loads(f_json.read())
        num_apis_loaded = 0
        api_map = {}
        for dll_entry in api_db["dlls"]:
            LOGGER.debug(" building address map for: %s", dll_entry)
            for export in api_db["dlls"][dll_entry]["exports"]:
                num_apis_loaded += 1
                api_name = "%s" % (export["name"])
                if api_name == "None":
                    api_name = "None<{}>".format(export["ordinal"])
                # DB keys look like "<os>_<arch>_<dllname>"; keep everything after the 2nd "_"
                dll_name = "_".join(dll_entry.split("_")[2:])
                bitness = api_db["dlls"][dll_entry]["bitness"]
                self._has_64bit |= bitness == 64
                base_address = api_db["dlls"][dll_entry]["base_address"]
                api_map[base_address + export["address"]] = (dll_name, api_name)
        LOGGER.debug("loaded %d exports from %d DLLs (%s).", num_apis_loaded, len(api_db["dlls"]), api_db["os_name"])
        self._api_map[os_name] = api_map

    def isApiProvider(self):
        """Returns whether the get_api(..) function of the AbstractLabelProvider is functional"""
        return True

    def getApi(self, to_addr, absolute_addr):
        """If the LabelProvider has any information about a used API for the given address, return (dll, api), else return (None, None)"""
        # if we work on a dump, use ApiScout method:
        if self._is_buffer:
            if self._os_name and self._os_name in self._api_map:
                return self._api_map[self._os_name].get(absolute_addr, (None, None))
            return (None, None)
        # otherwise take import table info from LIEF
        return self._api_map["lief"].get(to_addr, (None, None))
class BackendInterface(object):
    """Abstract interface that concrete disassembler backends must implement."""

    def __init__(self):
        pass

    def getArchitecture(self):
        raise NotImplementedError

    def getBitness(self):
        raise NotImplementedError

    def getFunctions(self):
        raise NotImplementedError

    def getBlocks(self, function_offset):
        raise NotImplementedError

    def getCodeInRefs(self, offset):
        raise NotImplementedError

    def getCodeOutRefs(self, offset):
        raise NotImplementedError

    def getInstructionBytes(self, offset):
        raise NotImplementedError

    def getFunctionSymbols(self, demangle=False):
        raise NotImplementedError

    def getBaseAddr(self):
        raise NotImplementedError

    def getBinary(self):
        raise NotImplementedError

    def getApiOffsets(self):
        raise NotImplementedError


# --- smda/ida/IdaExporter.py ---
import datetime

from capstone import Cs, CS_ARCH_X86, CS_MODE_32, CS_MODE_64

from smda.DisassemblyResult import DisassemblyResult
from .IdaInterface import IdaInterface


class IdaExporter(object):
    """Collects data from a running IDA session and converts it into a SMDA DisassemblyResult."""

    def __init__(self, config, bitness=None):
        self.config = config
        self.ida_interface = IdaInterface()
        self.bitness = bitness if bitness else self.ida_interface.getBitness()
        self.capstone = None
        self.disassembly = DisassemblyResult()
        self.disassembly.smda_version = config.VERSION
        self._initCapstone()

    def _initCapstone(self):
        # decode as 32bit unless the IDB is 64bit
        mode = CS_MODE_64 if self.bitness == 64 else CS_MODE_32
        self.capstone = Cs(CS_ARCH_X86, mode)

    def _convertIdaInsToSmda(self, offset, instruction_bytes):
        """Disassemble a single instruction via capstone; emit a placeholder on failure."""
        decoded = list(self.capstone.disasm_lite(instruction_bytes, offset))
        if decoded:
            i_address, i_size, i_mnemonic, i_op_str = decoded[0]
            return (i_address, i_size, i_mnemonic, i_op_str, instruction_bytes)
        # record error and emit placeholder instruction
        bytes_as_hex = bytearray(instruction_bytes).hex()
        print("missing capstone disassembly output at 0x%x (%s)" % (offset, bytes_as_hex))
        self.disassembly.errors[offset] = {
            "type": "capstone disassembly failure",
            "instruction_bytes": bytes_as_hex
        }
        return (offset, len(instruction_bytes), "error", "error", bytearray(instruction_bytes))

    def analyzeBuffer(self, binary_info, cb_analysis_timeout=None):
        """ instead of performing a full analysis, simply collect all data from IDA and convert it into a report """
        self.disassembly.analysis_start_ts = datetime.datetime.now(datetime.timezone.utc)
        self.disassembly.binary_info = binary_info
        self.disassembly.binary_info.architecture = self.ida_interface.getArchitecture()
        if not self.disassembly.binary_info.base_addr:
            self.disassembly.binary_info.base_addr = self.ida_interface.getBaseAddr()
        if not self.disassembly.binary_info.binary:
            self.disassembly.binary_info.binary = self.ida_interface.getBinary()
        if not self.disassembly.binary_info.bitness:
            self.disassembly.binary_info.bitness = self.bitness
        self.disassembly.function_symbols = self.ida_interface.getFunctionSymbols()
        api_map = self.ida_interface.getApiMap()
        for function_offset in self.ida_interface.getFunctions():
            if self.ida_interface.isExternalFunction(function_offset):
                continue
            converted_function = []
            for block in self.ida_interface.getBlocks(function_offset):
                converted_block = []
                for instruction_offset in block:
                    ins_bytes = self.ida_interface.getInstructionBytes(instruction_offset)
                    smda_instruction = self._convertIdaInsToSmda(instruction_offset, ins_bytes)
                    converted_block.append(smda_instruction)
                    self.disassembly.instructions[smda_instruction[0]] = (smda_instruction[2], smda_instruction[1])
                    for in_ref in self.ida_interface.getCodeInRefs(smda_instruction[0]):
                        self.disassembly.addCodeRefs(in_ref[0], in_ref[1])
                    for out_ref in self.ida_interface.getCodeOutRefs(smda_instruction[0]):
                        self.disassembly.addCodeRefs(out_ref[0], out_ref[1])
                        if out_ref[1] in api_map:
                            self.disassembly.addr_to_api[instruction_offset] = api_map[out_ref[1]]
                converted_function.append(converted_block)
            self.disassembly.functions[function_offset] = converted_function
            if self.disassembly.isRecursiveFunction(function_offset):
                self.disassembly.recursive_functions.add(function_offset)
            if self.disassembly.isLeafFunction(function_offset):
                self.disassembly.leaf_functions.add(function_offset)
        self.disassembly.analysis_end_ts = datetime.datetime.now(datetime.timezone.utc)
        return self.disassembly


# --- smda/ida/IdaInterface.py ---
import re

from .BackendInterface import BackendInterface

try:
    import idaapi
    import idautils
except:
    pass

try:
    # we only need these when we are in IDA - IDA 7.4 and above
    import ida_idaapi
    import ida_funcs
    import ida_gdl
    import ida_bytes
    import ida_nalt
    import ida_segment
    import ida_name
except:
    pass

try:
    # we only need these when we are in IDA - IDA 7.3 and below
    import idc
except:
    pass


class IdaInterface(object):
    """Version-dispatching singleton that forwards to the matching concrete interface."""
    # derived from https://python-3-patterns-idioms-test.readthedocs.io/en/latest/Singleton.html
    instance = None

    def __init__(self):
        if not IdaInterface.instance:
            if idaapi.IDA_SDK_VERSION >= 740:
                IdaInterface.instance = Ida74Interface()
            else:
                IdaInterface.instance = Ida73Interface()

    def __getattr__(self, name):
        # delegate everything to the concrete singleton
        return getattr(self.instance, name)

    def getIdbDir(self):
        return idautils.GetIdbDir()


class Ida74Interface(BackendInterface):
    """Backend implementation for IDA Pro 7.4 and above."""

    def __init__(self):
        self.version = "IDA Pro 7.4"
        self._processor_map = {
            "metapc": "intel"
        }
        self._api_map = {}
        self._import_module_name = ""
56 | 57 | def getArchitecture(self): 58 | # https://reverseengineering.stackexchange.com/a/11398 59 | info = ida_idaapi.get_inf_structure() 60 | if idaapi.IDA_SDK_VERSION >= 800: 61 | procname = info.procname 62 | else: 63 | procname = info.procName 64 | if procname in self._processor_map: 65 | return self._processor_map[procname] 66 | else: 67 | raise ValueError("Unsupported Architecture") 68 | 69 | def getBitness(self): 70 | # https://reverseengineering.stackexchange.com/a/11398 71 | bits = None 72 | info = ida_idaapi.get_inf_structure() 73 | if info.is_64bit(): 74 | bits = 64 75 | elif info.is_32bit(): 76 | bits = 32 77 | else: 78 | bits = 16 79 | return bits 80 | 81 | def getFunctions(self): 82 | return sorted([offset for offset in idautils.Functions()]) 83 | 84 | def getBlocks(self, function_offset): 85 | blocks = [] 86 | function_chart = ida_gdl.FlowChart(ida_funcs.get_func(function_offset)) 87 | for block in function_chart: 88 | extracted_block = [] 89 | for instruction in idautils.Heads(block.start_ea, block.end_ea): 90 | if ida_bytes.is_code(ida_bytes.get_flags(instruction)): 91 | extracted_block.append(instruction) 92 | if extracted_block: 93 | blocks.append(extracted_block) 94 | return sorted(blocks) 95 | 96 | def getInstructionBytes(self, offset): 97 | ins = idautils.DecodeInstruction(offset) 98 | ins_bytes = ida_bytes.get_bytes(offset, ins.size) 99 | return ins_bytes 100 | 101 | def getCodeInRefs(self, offset): 102 | return [(ref_from, offset) for ref_from in idautils.CodeRefsTo(offset, True)] 103 | 104 | def getCodeOutRefs(self, offset): 105 | return [(offset, ref_to) for ref_to in idautils.CodeRefsFrom(offset, True)] 106 | 107 | def getFunctionSymbols(self, demangle=False): 108 | function_symbols = {} 109 | function_offsets = self.getFunctions() 110 | for function_offset in function_offsets: 111 | function_name = ida_funcs.get_func_name(function_offset) 112 | # apply demangling if required 113 | if demangle and "@" in function_name: 114 | demangled = 
ida_name.demangle_name(function_name, 0) 115 | if demangled: 116 | function_name = demangled 117 | if not re.match("sub_[0-9a-fA-F]+", function_name): 118 | function_symbols[function_offset] = function_name 119 | return function_symbols 120 | 121 | def getBaseAddr(self): 122 | base_addr = 0 123 | segment_starts = [ea for ea in idautils.Segments()] 124 | if segment_starts: 125 | first_segment_start = segment_starts[0] 126 | # re-align by 0x10000 to reflect typically allocation behaviour for IDA-mapped binaries 127 | first_segment_start = (first_segment_start / 0x10000) * 0x10000 128 | base_addr = int(first_segment_start) 129 | return base_addr 130 | 131 | def getBinary(self): 132 | result = b"" 133 | segment = ida_segment.get_first_seg() 134 | while segment: 135 | result += ida_bytes.get_bytes(segment.start_ea, segment.end_ea - segment.start_ea) 136 | segment = ida_segment.get_next_seg(segment.end_ea) 137 | return result 138 | 139 | def getApiMap(self): 140 | self._api_map = {} 141 | num_imports = ida_nalt.get_import_module_qty() 142 | for i in range(0, num_imports): 143 | self._import_module_name = ida_nalt.get_import_module_name(i) 144 | ida_nalt.enum_import_names(i, self._cbEnumImports) 145 | return self._api_map 146 | 147 | def isExternalFunction(self, function_offset): 148 | function_segment = ida_segment.getseg(function_offset) 149 | function_segment_name = ida_segment.get_segm_name(function_segment) 150 | is_extern = function_segment_name in ["extern", "UNDEF"] 151 | return is_extern 152 | 153 | def makeFunction(self, instruction): 154 | return ida_funcs.add_func(instruction) 155 | 156 | def makeNameEx(self, address, name, warning_level=None): 157 | if warning_level is None: 158 | warning_level=idc.SN_NOWARN 159 | return idc.set_name(address, name, warning_level) 160 | 161 | def _cbEnumImports(self, addr, name, ordinal): 162 | # potentially use: idc.Name(addr) 163 | if self._import_module_name: 164 | self._api_map[addr] = self._import_module_name + "!" 
class Ida73Interface(BackendInterface):
    """Backend implementation for IDA Pro 7.3 and below (legacy idc/idaapi names)."""

    def __init__(self):
        self.version = "IDA Pro 7.3 and below"
        self._processor_map = {
            "metapc": "intel"
        }
        self._api_map = {}
        self._import_module_name = ""

    def getArchitecture(self):
        """Map IDA's processor name to a SMDA architecture string; raise ValueError if unsupported."""
        # https://reverseengineering.stackexchange.com/a/11398
        info = idaapi.get_inf_structure()
        procname = info.procName
        if procname in self._processor_map:
            return self._processor_map[procname]
        else:
            raise ValueError("Unsupported Architecture")

    def getBitness(self):
        """Return 64/32/16 depending on the loaded IDB."""
        # https://reverseengineering.stackexchange.com/a/11398
        bits = None
        info = idaapi.get_inf_structure()
        if info.is_64bit():
            bits = 64
        elif info.is_32bit():
            bits = 32
        else:
            bits = 16
        return bits

    def getFunctions(self):
        """All function start addresses known to IDA, sorted ascending."""
        return sorted([offset for offset in idautils.Functions()])

    def getBlocks(self, function_offset):
        """Return the function's basic blocks as sorted lists of code head addresses."""
        blocks = []
        function_chart = idaapi.FlowChart(idaapi.get_func(function_offset))
        for block in function_chart:
            extracted_block = []
            for instruction in idautils.Heads(block.startEA, block.endEA):
                if idc.isCode(idc.GetFlags(instruction)):
                    extracted_block.append(instruction)
            if extracted_block:
                blocks.append(extracted_block)
        return sorted(blocks)

    def getInstructionBytes(self, offset):
        """Raw bytes of the instruction at offset."""
        ins = idautils.DecodeInstruction(offset)
        ins_bytes = idc.get_bytes(offset, ins.size)
        return ins_bytes

    def getCodeInRefs(self, offset):
        """All (source, offset) code references leading to offset."""
        return [(ref_from, offset) for ref_from in idautils.CodeRefsTo(offset, True)]

    def getCodeOutRefs(self, offset):
        """All (offset, destination) code references originating at offset."""
        return [(offset, ref_to) for ref_to in idautils.CodeRefsFrom(offset, True)]

    def getFunctionSymbols(self, demangle=False):
        """Collect user-defined function names (skipping auto-generated sub_* names).

        FIX: idc.demangle_name() can return None, which previously made
        re.match(..., None) raise TypeError - guard like Ida74Interface does.
        """
        function_symbols = {}
        function_offsets = self.getFunctions()
        for function_offset in function_offsets:
            function_name = idc.GetFunctionName(function_offset)
            # apply demangling if required
            if demangle and "@" in function_name:
                demangled = idc.demangle_name(function_name, 0)
                if demangled:
                    function_name = demangled
            if not re.match("sub_[0-9a-fA-F]+", function_name):
                function_symbols[function_offset] = function_name
        return function_symbols

    def getBaseAddr(self):
        """Derive the image base: first segment start aligned down to 0x10000.

        FIX: use integer floor division instead of float '/' (precision loss for
        addresses above 2**53) and guard against an empty segment list, matching
        Ida74Interface behavior.
        """
        segment_starts = [ea for ea in idautils.Segments()]
        if not segment_starts:
            return 0
        # re-align by 0x10000 to reflect typically allocation behaviour for IDA-mapped binaries
        return (segment_starts[0] // 0x10000) * 0x10000

    def getBinary(self):
        """Concatenate the raw bytes of all segments.

        NOTE: the original also collected (start, offset) bookkeeping into unused
        locals; that dead code was removed.
        """
        result = b""
        for start in idautils.Segments():
            end = idc.SegEnd(start)
            result += idc.get_bytes(start, end - start)
        return result

    def getApiMap(self):
        """Build a map of import thunk address -> 'module!api' label."""
        self._api_map = {}
        num_imports = idaapi.get_import_module_qty()
        for i in range(0, num_imports):
            self._import_module_name = idaapi.get_import_module_name(i)
            idaapi.enum_import_names(i, self._cbEnumImports)
        return self._api_map

    def isExternalFunction(self, function_offset):
        # TODO look up older function names to support this for IDA 7.3- as well
        return False

    def makeFunction(self, instruction):
        return idc.add_func(instruction)

    def makeNameEx(self, address, name, warning_level=None):
        if warning_level is None:
            warning_level = idc.SN_NOWARN
        return idc.set_name(address, name, warning_level)

    def _cbEnumImports(self, addr, name, ordinal):
        # potentially use: idc.Name(addr)
        if self._import_module_name:
            self._api_map[addr] = self._import_module_name + "!" + name
        else:
            self._api_map[addr] = name
        return True


# --- smda/intel/BitnessAnalyzer.py ---
import re
import struct
import logging
from collections import Counter

from .definitions import COMMON_START_BYTES

LOGGER = logging.getLogger(__name__)


class BitnessAnalyzer(object):
    """Heuristic 32/64bit detection based on the first bytes of call destinations."""

    def determineBitnessFromFile(self, filepath):
        LOGGER.debug("Running Bitness test on %s", filepath)
        with open(filepath, "rb") as infile:
            if re.search(r"[0-9a-fA-F]{64}_dump_0x[0-9a-fA-F]+$", filepath):
                return self.determineBitness(binary=infile.read())
        return 0

    def determineBitnessFromDisassembly(self, disassembly):
        LOGGER.debug("Running Bitness test on binary data of DisassemblyResult")
        return self.determineBitness(binary=disassembly.binary_info.binary)

    def determineBitness(self, binary):
        """Score candidate call destinations' first bytes against per-bitness
        prologue statistics and return 32 or 64.

        FIX(perf): the original scanned the whole binary once per bitness although
        the scan itself is bitness-independent - scan once and reuse the counter.
        """
        candidate_first_bytes = Counter()
        # check for potential call instructions (0xE8 rel32) and collect the
        # first byte found at each plausible in-image call destination
        for call_match in re.finditer(b"\xE8", binary):
            if len(binary) - call_match.start() > 5:
                packed_call = binary[call_match.start() + 1:call_match.start() + 5]
                rel_call_offset = struct.unpack("i", packed_call)[0]
                call_destination = (rel_call_offset + call_match.start() + 5)  # & bitmask
                if call_destination > 0 and call_destination < len(binary):
                    candidate_first_bytes[binary[call_destination]] += 1
        score = {"32": 0, "64": 0}
        for bitness in ["32", "64"]:
            # NOTE: as in the original, only the presence of a first byte counts,
            # not how often it occurred
            for candidate_sequence in candidate_first_bytes:
                if isinstance(candidate_sequence, int):
                    candidate_sequence = "%02x" % candidate_sequence
                elif isinstance(candidate_sequence, str):
                    # Python 2 legacy branch; indexing bytes yields int on Python 3
                    candidate_sequence = candidate_sequence.encode("hex")
                for common_sequence, sequence_score in COMMON_START_BYTES[bitness].items():
                    if candidate_sequence == str(common_sequence):
                        score[bitness] += sequence_score * 1.0
        total_score = max(score["32"] + score["64"], 1)
        score["32"] /= total_score
        score["64"] /= total_score
        LOGGER.debug("Bitness scores: %5.2f (32bit), %5.2f (64bit)", score["32"], score["64"])
        return 64 if score["32"] < score["64"] else 32
from binascii import hexlify

# NOTE: COMMON_PROLOGUES is imported from .definitions at the top of this module.


class FunctionCandidate(object):
    """Scoring container for a potential function start address.

    Aggregates evidence (call references, prologue bytes, symbols, language
    specifics, alignment) into a score and a confidence estimate.
    """

    def __init__(self, binary_info, addr):
        self.bitness = binary_info.bitness
        self.addr = addr
        rel_start_addr = addr - binary_info.base_addr
        # first 5 bytes at the candidate address, used for prologue matching
        self.bytes = binary_info.binary[rel_start_addr:rel_start_addr + 5]
        self.lang_spec = None
        self.call_ref_sources = []
        self.finished = False
        self.is_symbol = False
        self.is_gap_candidate = False
        self.is_tailcall = False
        # FIX: check the stricter 16-byte alignment first; the original tested
        # addr % 4 first, which made the 16-alignment branch unreachable
        # (every 16-aligned address is also 4-aligned).
        self.alignment = 0
        if addr % 16 == 0:
            self.alignment = 16
        elif addr % 4 == 0:
            self.alignment = 4
        self.analysis_aborted = False
        self.abortion_reason = ""
        self._score = None
        self._tfidf_score = None
        self._confidence = None
        self.function_start_score = None
        self.is_stub = False
        self.is_initial_candidate = False
        self.is_exception_handler = False

    def setTfIdf(self, tfidf_score):
        self._tfidf_score = tfidf_score

    def getTfIdf(self):
        """Return the rounded TF-IDF score, or None if it was never set.

        FIX: previously round(None, 3) raised TypeError when no score was set.
        """
        if self._tfidf_score is None:
            return None
        return round(self._tfidf_score, 3)

    def getConfidence(self):
        """Lazily compute a confidence in [0, 1] that this candidate is a function."""
        if self._confidence is None:
            # based on evaluation over Andriesse, Bao, and Plohmann data sets
            weighted_confidence = 0.298 * (1 if self.hasCommonFunctionStart() else 0)
            if self._tfidf_score is not None:
                weighted_confidence += (
                    0.321 * (1 if self._tfidf_score < 0 else 0) +
                    0.124 * (1 if self._tfidf_score < -2 else 0) +
                    0.120 * (1 if self._tfidf_score < -4 else 0) +
                    0.101 * (1 if self._tfidf_score < -1 else 0) +
                    0.025 * (1 if self._tfidf_score < -8 else 0)
                )
            # above experiments show that multiple inbound call references are basically always indeed functions
            if len(self.call_ref_sources) > 1:
                self._confidence = 1.0
            # initially recognized candidates are also almost always functions as they follow this heuristic
            elif self.is_initial_candidate:
                self._confidence = round(0.5 + 0.5 * (weighted_confidence), 3)
            else:
                self._confidence = round(weighted_confidence, 3)
        return self._confidence

    def hasCommonFunctionStart(self):
        """True if the candidate bytes begin with a known prologue for this bitness."""
        for length in sorted([int(l) for l in COMMON_PROLOGUES], reverse=True):
            byte_sequence = self.bytes[:length]
            if byte_sequence in COMMON_PROLOGUES["%d" % length][self.bitness]:
                return True
        return False

    def getFunctionStartScore(self):
        """Lazily compute the prologue score (0 if no common prologue matches)."""
        if self.function_start_score is None:
            for length in sorted([int(l) for l in COMMON_PROLOGUES], reverse=True):
                byte_sequence = self.bytes[:length]
                if byte_sequence in COMMON_PROLOGUES["%d" % length][self.bitness]:
                    self.function_start_score = COMMON_PROLOGUES["%d" % length][self.bitness][byte_sequence]
                    break
            self.function_start_score = self.function_start_score if self.function_start_score else 0
        return self.function_start_score

    def addCallRef(self, source_addr):
        """Register an inbound call reference (deduplicated); invalidates the score."""
        if source_addr not in self.call_ref_sources:
            self.call_ref_sources.append(source_addr)
        self._score = None

    def removeCallRefs(self, source_addrs):
        """Remove the given inbound call references; invalidates the score."""
        for addr in source_addrs:
            if addr in self.call_ref_sources:
                self.call_ref_sources.remove(addr)
        self._score = None

    def setIsTailcallCandidate(self, is_tailcall):
        self.is_tailcall = is_tailcall

    def setInitialCandidate(self, initial):
        self.is_initial_candidate = initial

    def setIsGapCandidate(self, gap):
        self.is_gap_candidate = gap

    def setLanguageSpec(self, lang_spec):
        self.lang_spec = lang_spec
        self._score = None

    def setIsSymbol(self, is_symbol):
        self.is_symbol = is_symbol
        self._score = None

    def setIsExceptionHandler(self, is_exception_handler):
        self.is_exception_handler = is_exception_handler
        self._score = None

    def setIsStub(self, is_stub):
        self.is_stub = is_stub
        self._score = None

    def setAnalysisAborted(self, reason):
        self.finished = True
        self.analysis_aborted = True
        self.abortion_reason = reason

    def setAnalysisCompleted(self):
        self.finished = True

    def isFinished(self):
        return self.finished

    def calculateScore(self):
        """Combine all evidence into an integer score (higher = more likely a function)."""
        score = 0
        score += 10000 if self.is_symbol else 0
        score += 5000 if self.is_exception_handler else 0
        score += 1000 if self.is_stub else 0
        score += 100 if self.lang_spec is not None else 0
        score += self.getFunctionStartScore()
        num_call_refs = len(self.call_ref_sources)
        # cap the influence of many references, they dominate quickly otherwise
        if num_call_refs >= 10:
            call_ref_score = 10 + int(num_call_refs / 10)
        else:
            call_ref_score = num_call_refs
        score += 10 * call_ref_score
        score += 1 if self.alignment else 0
        return score

    def getScore(self):
        if self._score is None:
            self._score = self.calculateScore()
        return self._score

    def __lt__(self, other):
        # order by score; ties are broken by preferring the lower address
        own_score = self.getScore()
        other_score = other.getScore()
        if own_score == other_score:
            return self.addr > other.addr
        return own_score < other_score

    def getCharacteristics(self):
        """Compact flag string summarizing all candidate properties."""
        is_aligned = "a" if self.alignment else "-"
        is_finished = "f" if self.finished else "-"
        is_gap = "g" if self.is_gap_candidate else "-"
        is_initial = "i" if self.is_initial_candidate else "-"
        is_lang_spec = "l" if self.lang_spec is not None else "-"
        is_prologue = "p" if self.hasCommonFunctionStart() else "-"
        is_ref = "r" if self.call_ref_sources else "-"
        is_symbol = "s" if self.is_symbol else "-"
        is_tailcall = "t" if self.is_tailcall else "-"
        is_stub = "u" if self.is_stub else "-"
        is_aborted = "x" if self.analysis_aborted else "-"
        characteristics = is_initial + is_symbol + is_stub + is_aligned + is_lang_spec + is_prologue + is_ref + is_tailcall + is_gap + is_finished + is_aborted
        return characteristics

    def __str__(self):
        characteristics = self.getCharacteristics()
        prologue_score = "%d" % self.getFunctionStartScore()
        ref_summary = "{}".format(len(self.call_ref_sources)) if len(self.call_ref_sources) != 1 else "{}: 0x{:x}".format(len(self.call_ref_sources), self.call_ref_sources[0])
        return "0x{:x}: {} -> {} (total score: {}), inref: {} | {}".format(self.addr, hexlify(self.bytes), prologue_score, self.getScore(), ref_summary, characteristics)

    def toJson(self):
        """Serialize the candidate's evidence and scores into a plain dict."""
        return {
            "addr": self.addr,
            "bytes": self.bytes.hex(),
            "alignment": self.alignment,
            "reason": self.abortion_reason,
            "num_refs": len(self.call_ref_sources),
            "characteristics": self.getCharacteristics(),
            "prologue_score": self.getFunctionStartScore(),
            "score": self.calculateScore(),
            "confidence": self.getConfidence()
        }
# --- smda/intel/IndirectCallAnalyzer.py ---
import struct
import re
import logging

LOGGER = logging.getLogger(__name__)


class IndirectCallAnalyzer(object):
    """ Perform basic dataflow analysis to resolve indirect call targets """

    def __init__(self, disassembler):
        # disassembler drives the analysis and owns the shared DisassemblyResult
        self.disassembler = disassembler
        self.disassembly = self.disassembler.disassembly
        # address of the "call <register>" instruction currently being resolved
        self.current_calling_addr = 0
        # FunctionAnalysisState of the function currently being analyzed
        self.state = None

    def searchBlock(self, analysis_state, address):
        """Return the basic block (list of instruction tuples) containing `address`, or [] if none found."""
        for block in analysis_state.getBlocks():
            if address in [i[0] for i in block]:
                return block
        return []

    def getDword(self, addr):
        """Read a little-endian unsigned 32bit value at `addr` from the mapped image, or None if out of range."""
        if not self.disassembly.isAddrWithinMemoryImage(addr):
            return None
        return struct.unpack("I", self.disassembly.getBytes(addr, 4))[0]

    def processBlock(self, analysis_state, block, registers, register_name, processed, depth):
        """
        Walk `block` backwards, tracking which absolute value ends up in `register_name`.

        Instruction tuples are (address, size, mnemonic, operand_string) — grounded by
        the `rip = ins[0] + ins[1]` computation below and the mnemonic/operand usage.

        :param registers: dict mapping register name -> tracked absolute value
        :param processed: blocks already visited (cycle protection); mutated in place
        :param depth: remaining number of predecessor-block levels to recurse into
        :return: True once a call target was derived for the current call site,
                 False for empty/already-seen blocks (None if nothing was found).

        NOTE(review): all operand regexes use `[a-z]{3}`, i.e. only three-letter
        register names (eax, rbx, ...) are tracked; r8-r15 and 16-bit registers
        never match — confirm whether this is intentional.
        """
        if not block:
            return False
        if block in processed:
            LOGGER.debug("already processed block 0x%08x; skipping", block[0][0])
            return False
        processed.append(block)
        LOGGER.debug("start processing block: 0x%08x\nlooking for register %s", block[0][0], register_name)
        abs_value_found = False
        for ins in reversed(block):
            LOGGER.debug("0x%08x: %s %s", ins[0], ins[2], ins[3])
            if ins[2] == "mov":
                # mov <reg>, <reg> -- the value we track was copied from another register
                match1 = re.match(r"(?P<reg1>[a-z]{3}), (?P<reg2>[a-z]{3})$", ins[3])
                if match1:
                    if match1.group("reg1") == register_name:
                        # continue the backwards search for the source register instead
                        register_name = match1.group("reg2")
                # mov <reg>, <immediate value>
                match2 = re.match(r"(?P<reg>[a-z]{3}), (?P<val>0x[0-9a-f]{,8})$", ins[3])
                if match2:
                    registers[match2.group("reg")] = int(match2.group("val"), 16)
                    LOGGER.debug("**moved value 0x%08x to register %s", int(match2.group("val"), 16), match2.group("reg"))
                    if match2.group("reg") == register_name:
                        abs_value_found = True
                # mov <reg>, dword ptr [<absolute addr>]
                match3 = re.match(r"(?P<reg>[a-z]{3}), dword ptr \[(?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match3:
                    # HACK: test to see if the address points to a import and
                    # use that instead of the actual memory value
                    addr = int(match3.group("addr"), 16)
                    dll, api = self.disassembler.resolveApi(addr, addr)
                    if dll or api:
                        registers[match3.group("reg")] = addr
                        LOGGER.debug("**moved API ref (%s:%s) @0x%08x to register %s", dll, api, addr, match3.group("reg"))
                        if match3.group("reg") == register_name:
                            abs_value_found = True
                    else:
                        # not an import: dereference the memory location instead
                        dword = self.getDword(addr)
                        if dword:
                            registers[match3.group("reg")] = dword
                            LOGGER.debug("**moved value 0x%08x to register %s", dword, match3.group("reg"))
                            if match3.group("reg") == register_name:
                                abs_value_found = True
                # mov <reg>, qword ptr [rip + <offset>] -- x64 RIP-relative load
                match4 = re.match(r"(?P<reg>[a-z]{3}), qword ptr \[rip \+ (?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match4:
                    # rip points past the current instruction: address + size
                    rip = ins[0] + ins[1]
                    dword = self.getDword(rip + int(match4.group("addr"), 16))
                    if dword:
                        registers[match4.group("reg")] = rip + dword
                        LOGGER.debug("**moved value 0x%08x + 0x%08x == 0x%08x to register %s", rip, dword, rip + dword, match4.group("reg"))
                        if match4.group("reg") == register_name:
                            abs_value_found = True
            elif ins[2] == "lea":
                LOGGER.debug("*checking %s %s", ins[2], ins[3])
                # lea <reg>, dword ptr [<absolute addr>]
                match1 = re.match(r"(?P<reg>[a-z]{3}), dword ptr \[(?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match1:
                    dword = self.getDword(int(match1.group("addr"), 16))
                    if dword:
                        registers[match1.group("reg")] = dword
                        LOGGER.debug("**moved value 0x%08x to register %s", dword, match1.group("reg"))
                        if match1.group("reg") == register_name:
                            abs_value_found = True
                # lea <reg>, [<absolute addr>]
                match1 = re.match(r"(?P<reg>[a-z]{3}), \[(?P<addr>0x[0-9a-f]{,8})\]$", ins[3])
                if match1:
                    dword = self.getDword(int(match1.group("addr"), 16))
                    if dword:
                        registers[match1.group("reg")] = dword
                        LOGGER.debug("**moved value 0x%08x to register %s", dword, match1.group("reg"))
                        if match1.group("reg") == register_name:
                            abs_value_found = True
                # not handled: lea <reg>, dword ptr [<reg> +- <offset>]
                # requires state-keeping of multiple registers
                # there exist potentially many more way how the register being called can be calculated
                # for now we ignore them
            elif ins[2] == "other instruction":
                # placeholder branch: no real mnemonic equals this literal, so it never fires
                pass
            # if the absolute value was found for the call instruction, detect API
            if abs_value_found:
                candidate = registers[register_name] if register_name in registers else None
                self.state.setLeaf(False)
                if candidate:
                    LOGGER.debug("candidate: 0x%x - %s, register: %s", candidate, ins[3], register_name)
                    dll, api = self.disassembler.resolveApi(candidate, candidate)
                    if dll or api:
                        LOGGER.debug("successfully resolved: %s %s", dll, api)
                        api_entry = {"referencing_addr": [], "dll_name": dll, "api_name": api}
                        if candidate in self.disassembly.apis:
                            api_entry = self.disassembly.apis[candidate]
                        if self.current_calling_addr not in api_entry["referencing_addr"]:
                            api_entry["referencing_addr"].append(self.current_calling_addr)
                        self.disassembly.apis[candidate] = api_entry
                    elif self.disassembly.isAddrWithinMemoryImage(candidate):
                        # in-image target: treat it as a new function candidate
                        LOGGER.debug("successfully resolved: 0x%x", candidate)
                        self.disassembler.fc_manager.addCandidate(candidate, reference_source=self.current_calling_addr)
                    else:
                        LOGGER.debug("candidate not resolved")
                else:
                    LOGGER.debug("no candidate to resolved")

                return True
        # process previous blocks
        if depth >= 0:
            # predecessor blocks we have not visited yet (by their first-instruction address)
            refs_in = [
                fr for (fr, to) in analysis_state.code_refs
                if to == block[0][0] and
                fr not in [ins[0] for block in processed for ins in block]
            ]
            LOGGER.debug("start processing previous blocks, searching in %d in_refs with remaining depth: %d", len(refs_in), depth - 1)
            # recurse with a copy of the register state so siblings do not interfere
            if any(self.processBlock(analysis_state, b, registers.copy(), register_name, processed, depth - 1) for b in [self.searchBlock(analysis_state, i) for i in refs_in]):
                return True

    def resolveRegisterCalls(self, analysis_state, block_depth=3):
        # after block reconstruction do simple data flow analysis to resolve open cases like "call <reg>" as stored in self.call_register_ins
        if analysis_state.call_register_ins:
            LOGGER.debug("Trying to resolve %d register calls in function: 0x%x", len(analysis_state.call_register_ins), analysis_state.start_addr)
            # NOTE(review): max_calls_per_block is only used in the debug message below;
            # the effective cap is `max_calls` (config value or 50) — the two can disagree.
            max_calls_per_block = 10
            calls_per_block = {}
            for calling_addr in analysis_state.call_register_ins:
                LOGGER.debug("#" * 20)
                self.current_calling_addr = calling_addr
                self.state = analysis_state
                # truncate the block to instructions at or before the call site
                start_block = [ins for ins in self.searchBlock(analysis_state, calling_addr) if ins[0] <= calling_addr]
                if not start_block:
                    # NOTE(review): this `return` abandons ALL remaining register calls of the
                    # function when one lookup fails — `continue` may have been intended; confirm.
                    return
                # we only process at most 10 register-calls per block to avoid extreme cases
                # found one Go sample with 130k register calls.
                if start_block[0] not in calls_per_block:
                    calls_per_block[start_block[0]] = 0
                calls_per_block[start_block[0]] += 1
                # if we have an old config, default to 50
                max_calls = self.disassembler.config.MAX_INDIRECT_CALLS_PER_BASIC_BLOCK if hasattr(self.disassembler.config, 'MAX_INDIRECT_CALLS_PER_BASIC_BLOCK') else 50
                if calls_per_block[start_block[0]] > max_calls:
                    break
                LOGGER.debug("For this block, we can still analyze %d indirect calls.", max_calls_per_block - calls_per_block[start_block[0]])
                if start_block:
                    # start_block[-1][3] is the operand string of the call itself, i.e. the register name
                    self.processBlock(analysis_state, start_block, dict(), start_block[-1][3], list(), block_depth)


# --- smda/intel/__init__.py ---
# -*- coding: utf-8 -*-


# --- smda/intel/definitions.py ---
# some mnemonics as specific to capstone
CJMP_INS = ["je", "jne", "js", "jns", "jp", "jnp", "jo", "jno", "jl", "jle", "jg", "jge", "jb", "jbe", "ja", "jae", "jcxz", "jecxz", "jrcxz"]
LOOP_INS = ["loop", "loopne", "loope"]
JMP_INS = ["jmp", "ljmp"]
CALL_INS = ["call", "lcall"]
RET_INS = ["ret", "retn", "retf", "iret"]
END_INS = ["ret", "retn", "retf", "iret", "int3", "hlt"]
REGS_32BIT = ["eax", "ebx", "ecx", "edx", "esi", "edi", "ebp", "esp"]
REGS_64BIT = ["rax", "rbx", "rcx", "rdx", "rsp", "rbp", "rsi", "rdi", "rip", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"]
DOUBLE_ZERO = bytearray(b"\x00\x00")

DEFAULT_PROLOGUES = [
    b"\x8B\xFF\x55\x8B\xEC",
    b"\x89\xFF\x55\x8B\xEC",
    b"\x55\x8B\xEC",
    b"\x55\x89\xE5"
]

# these cover 99% of confirmed function starts in the reference data set
# (dict literal continues on the following lines of the file)
COMMON_PROLOGUES = {
"5": { 23 | 32: { 24 | b"\x8B\xFF\x55\x8B\xEC": 50, # mov edi, edi, push ebp, mov ebp, esp 25 | b"\x89\xFF\x55\x8B\xEC": 50, # mov edi, edi, push ebp, mov ebp, esp 26 | }, 27 | 64: {} 28 | }, 29 | "3": { 30 | 32: { 31 | b"\x55\x8B\xEC": 50, # push ebp, mov ebp, esp 32 | }, 33 | 64: {} 34 | }, 35 | "1": { 36 | 32: { 37 | b"\x55": 51, # 311150 (51.09%) -- cumulative: 51.09% 38 | b"\x8b": 10, # 62878 (10.32%) -- cumulative: 61.41% 39 | b"\x83": 7, # 46477 (7.63%) -- cumulative: 69.05% 40 | b"\x53": 6, # 38773 (6.37%) -- cumulative: 75.41% 41 | b"\x57": 5, # 36048 (5.92%) -- cumulative: 81.33% 42 | b"\x56": 5, # 31955 (5.25%) -- cumulative: 86.58% 43 | b"\xff": 4, # 24444 (4.01%) -- cumulative: 90.59% 44 | b"\xe9": 2, # 16420 (2.70%) -- cumulative: 93.29% 45 | b"\xb8": 1, # 6577 (1.08%) -- cumulative: 94.37% 46 | b"\xc3": 1, # 5638 (0.93%) -- cumulative: 95.29% 47 | b"\xa1": 1, # 4168 (0.68%) -- cumulative: 95.98% 48 | b"\x6a": 1, # 3815 (0.63%) -- cumulative: 96.60% 49 | b"\x51": 1, # 2753 (0.45%) -- cumulative: 97.06% 50 | b"\x31": 1, # 2514 (0.41%) -- cumulative: 97.47% 51 | b"\xf3": 1, # 2109 (0.35%) -- cumulative: 97.82% 52 | b"\x33": 1, # 1279 (0.21%) -- cumulative: 98.03% 53 | b"\x81": 1, # 1261 (0.21%) -- cumulative: 98.23% 54 | b"\x85": 1, # 1045 (0.17%) -- cumulative: 98.40% 55 | b"\xe8": 1, # 1005 (0.17%) -- cumulative: 98.57% 56 | b"\x8d": 1, # 896 (0.15%) -- cumulative: 98.72% 57 | b"\x68": 1, # 749 (0.12%) -- cumulative: 98.84% 58 | b"\x80": 1, # 703 (0.12%) -- cumulative: 98.95% 59 | }, 60 | 64: { 61 | b"\x55": 33, # 196922 (33.40%) -- cumulative: 33.40% 62 | b"\x48": 21, # 124360 (21.09%) -- cumulative: 54.49% 63 | b"\x41": 15, # 91785 (15.57%) -- cumulative: 70.06% 64 | b"\x53": 6, # 37559 (6.37%) -- cumulative: 76.43% 65 | b"\xff": 3, # 22877 (3.88%) -- cumulative: 80.31% 66 | b"\x40": 3, # 18018 (3.06%) -- cumulative: 83.36% 67 | b"\xe9": 2, # 15434 (2.62%) -- cumulative: 85.98% 68 | b"\x50": 1, # 11713 (1.99%) -- cumulative: 87.97% 69 | b"\x8b": 1, 
# 9130 (1.55%) -- cumulative: 89.52% 70 | b"\x4c": 1, # 6737 (1.14%) -- cumulative: 90.66% 71 | b"\xc3": 1, # 5978 (1.01%) -- cumulative: 91.67% 72 | b"\x89": 1, # 5852 (0.99%) -- cumulative: 92.66% 73 | b"\xb8": 1, # 5073 (0.86%) -- cumulative: 93.52% 74 | b"\x31": 1, # 4902 (0.83%) -- cumulative: 94.36% 75 | b"\x44": 1, # 4504 (0.76%) -- cumulative: 95.12% 76 | b"\x0f": 1, # 3196 (0.54%) -- cumulative: 95.66% 77 | b"\x83": 1, # 3120 (0.53%) -- cumulative: 96.19% 78 | b"\xf3": 1, # 2363 (0.40%) -- cumulative: 96.59% 79 | b"\xf2": 1, # 2349 (0.40%) -- cumulative: 96.99% 80 | b"\x85": 1, # 1806 (0.31%) -- cumulative: 97.30% 81 | b"\x33": 1, # 1605 (0.27%) -- cumulative: 97.57% 82 | b"\x66": 1, # 1370 (0.23%) -- cumulative: 97.80% 83 | b"\xba": 1, # 1235 (0.21%) -- cumulative: 98.01% 84 | b"\x45": 1, # 1227 (0.21%) -- cumulative: 98.22% 85 | b"\x80": 1, # 1197 (0.20%) -- cumulative: 98.42% 86 | b"\xc7": 1, # 1034 (0.18%) -- cumulative: 98.60% 87 | b"\xb0": 1, # 911 (0.15%) -- cumulative: 98.75% 88 | b"\xbf": 1, # 894 (0.15%) -- cumulative: 98.90% 89 | } 90 | } 91 | } 92 | 93 | #TODO: 2018-06-27 expand the coverage in this list 94 | # https://stackoverflow.com/questions/25545470/long-multi-byte-nops-commonly-understood-macros-or-other-notation 95 | GAP_SEQUENCES = { 96 | 1: [ 97 | b"\x90", # NOP1_OVERRIDE_NOP - AMD / nop - INTEL 98 | b"\xCC", # int3 99 | b"\x00", # pass over sequences of null bytes 100 | ], 101 | 2: [ 102 | b"\x66\x90", # NOP2_OVERRIDE_NOP - AMD / nop - INTEL 103 | b"\x8b\xc0", 104 | b"\x8b\xff", # mov edi, edi 105 | b"\x8d\x00", # lea eax, dword ptr [eax] 106 | b"\x86\xc0", # xchg al, al 107 | b"\x66\x2e", # NOP2_OVERRIDE_NOP - AMD / nop - INTEL 108 | ], 109 | 3: [ 110 | b"\x0f\x1f\x00", # NOP3_OVERRIDE_NOP - AMD / nop - INTEL 111 | b"\x8d\x40\x00", # lea eax, dword ptr [eax] 112 | b"\x8d\x00\x00", # lea eax, dword ptr [eax] 113 | b"\x8d\x49\x00", # lea ecx, dword ptr [ecx] 114 | b"\x8d\x64\x24", # lea esp, dword ptr [esp] 115 | b"\x8d\x76\x00", 116 
| b"\x66\x66\x90" 117 | ], 118 | 4: [ 119 | b"\x0f\x1f\x40\x00", # NOP4_OVERRIDE_NOP - AMD / nop - INTEL 120 | b"\x8d\x74\x26\x00", 121 | b"\x66\x66\x66\x90" 122 | ], 123 | 5: [ 124 | b"\x0f\x1f\x44\x00\x00", # NOP5_OVERRIDE_NOP - AMD / nop - INTEL 125 | b"\x90\x8d\x74\x26\x00" 126 | ], 127 | 6: [ 128 | b"\x66\x0f\x1f\x44\x00\x00", # NOP6_OVERRIDE_NOP - AMD / nop - INTEL 129 | b"\x8d\xb6\x00\x00\x00\x00" 130 | ], 131 | 7: [ 132 | b"\x0f\x1f\x80\x00\x00\x00\x00", # NOP7_OVERRIDE_NOP - AMD / nop - INTEL, 133 | b"\x8d\xb4\x26\x00\x00\x00\x00", 134 | b"\x8D\xBC\x27\x00\x00\x00\x00" 135 | ], 136 | 8: [ 137 | b"\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP8_OVERRIDE_NOP - AMD / nop - INTEL 138 | b"\x90\x8d\xb4\x26\x00\x00\x00\x00" 139 | ], 140 | 9: [ 141 | b"\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP9_OVERRIDE_NOP - AMD / nop - INTEL 142 | b"\x89\xf6\x8d\xbc\x27\x00\x00\x00\x00" 143 | ], 144 | 10: [ 145 | b"\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP10_OVERRIDE_NOP - AMD 146 | b"\x8d\x76\x00\x8d\xbc\x27\x00\x00\x00\x00", 147 | b"\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 148 | ], 149 | 11: [ 150 | b"\x66\x66\x66\x0f\x1f\x84\x00\x00\x00\x00\x00", # NOP11_OVERRIDE_NOP - AMD 151 | b"\x8d\x74\x26\x00\x8d\xbc\x27\x00\x00\x00\x00", 152 | b"\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 153 | ], 154 | 12: [ 155 | b"\x8d\xb6\x00\x00\x00\x00\x8d\xbf\x00\x00\x00\x00", 156 | b"\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 157 | ], 158 | 13: [ 159 | b"\x8d\xb6\x00\x00\x00\x00\x8d\xbc\x27\x00\x00\x00\x00", 160 | b"\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 161 | ], 162 | 14: [ 163 | b"\x8d\xb4\x26\x00\x00\x00\x00\x8d\xbc\x27\x00\x00\x00\x00", 164 | b"\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 165 | ], 166 | 15: [ 167 | b"\x66\x66\x66\x66\x66\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00" 168 | ] 169 | } 170 | 171 | 172 | COMMON_START_BYTES = { 173 | "32": { 174 | "55": 8334, 175 | "6a": 758, 176 | "56": 756, 177 | "51": 312, 178 | "8d": 566, 179 | "83": 558, 
180 | "53": 548 181 | }, 182 | "64": { 183 | "48": 1341, 184 | "40": 349, 185 | "4c": 59, 186 | "33": 56, 187 | "44": 18, 188 | "45": 17, 189 | "e9": 16, 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /smda/utility/BracketQueue.py: -------------------------------------------------------------------------------- 1 | class BracketQueue(object): 2 | """ 3 | This queue is tailored based on our research rsults regarding function entry point identification 4 | """ 5 | def __init__(self, candidates=None, initial_brackets=None): 6 | self.update_count = 0 7 | self.update_shift_count = 0 8 | self.brackets = { 9 | 0: {}, 10 | 1: {}, 11 | 2: {} 12 | } 13 | if candidates is not None: 14 | for candidate in candidates: 15 | self.add(candidate) 16 | self.ensure_order() 17 | elif initial_brackets is not None: 18 | self.brackets = initial_brackets 19 | self.ensure_order() 20 | 21 | def __iter__(self): 22 | return self 23 | 24 | def __next__(self): 25 | return self.next() 26 | 27 | def next(self): 28 | if all(len(self.brackets[i]) == 0 for i in range(3)): 29 | raise StopIteration 30 | for bracket_index in range(2, -1, -1): 31 | if self.brackets[bracket_index]: 32 | offset, candidate = self.brackets[bracket_index].popitem() 33 | return candidate 34 | 35 | def add(self, candidate): 36 | bracket_index = min(2, len(candidate.call_ref_sources)) 37 | self.brackets[bracket_index][candidate.addr] = candidate 38 | 39 | def update(self, target_candidate=None): 40 | if target_candidate: 41 | updated_bracket_index = min(2, len(target_candidate.call_ref_sources)) 42 | # check if the element is still in the same bracket, otherwise shift to next bracket 43 | self.update_count += 1 44 | for bracket_index in range(2, -1, -1): 45 | if target_candidate.addr in self.brackets[bracket_index] and bracket_index != updated_bracket_index: 46 | self.update_shift_count += 1 47 | self.brackets[bracket_index].pop(target_candidate.addr) 48 | 
                    self.brackets[updated_bracket_index][target_candidate.addr] = target_candidate
                    break

    def ensure_order(self):
        """Sort each bracket ascending by candidate score, so popitem() yields the best candidate first."""
        for bracket_index in range(2, -1, -1):
            if self.brackets[bracket_index]:
                self.brackets[bracket_index] = {offset: candidate for offset, candidate in sorted(self.brackets[bracket_index].items(), key=lambda x: x[1].getScore())}

    def __str__(self):
        return f"BracketQueue | 2: {len(self.brackets[2])} candidates, 1: {len(self.brackets[1])} candidates, 0: {len(self.brackets[0])} candidates,"


# --- smda/utility/DelphiKbFileLoader.py ---
import logging

LOGGER = logging.getLogger(__name__)


class DelphiKbFileLoader(object):
    """Loader for IDR (Interactive Delphi Reconstructor) knowledge base files."""

    @staticmethod
    def isCompatible(data):
        # identified by the fixed magic string at file start
        return data[:23] == b"IDR Knowledge Base File"

    @staticmethod
    def getBaseAddress(binary):
        # return fixed base address that will allow instruction escaping
        return 0x400000

    @staticmethod
    def mapBinary(binary):
        # KB files are used as-is, no section mapping required
        return binary

    @staticmethod
    def getBitness(binary):
        # we only support 32bit for now
        return 32

    @staticmethod
    def getCodeAreas(binary):
        # no dedicated code areas are derived from KB files
        return []


# --- smda/utility/FileLoader.py ---
import os
from smda.utility.PeFileLoader import PeFileLoader
from smda.utility.ElfFileLoader import ElfFileLoader
from smda.utility.MachoFileLoader import MachoFileLoader
from smda.utility.DelphiKbFileLoader import DelphiKbFileLoader

class FileLoader(object):
    """Dispatches a file to the first compatible format-specific loader and caches the results."""

    # NOTE(review): these are class-level attributes; the mutable default
    # _code_areas = [] is shared between instances until reassigned in _loadFile.
    _file_path = None
    _map_file = False
    _data = b""
    _raw_data = b""
    _base_addr = 0
    _bitness = 0
    _architecture = ""
    _code_areas = []
    # probed in order; first loader whose isCompatible() accepts the data wins
    file_loaders = [PeFileLoader, ElfFileLoader, MachoFileLoader, DelphiKbFileLoader]

    def __init__(self, file_path, load_file=True, map_file=False):
        self._file_path = file_path
        self._map_file = map_file
        if load_file:
            self._loadFile()

    def _loadRawFileContent(self):
        """Read the file from disk as bytes.

        NOTE(review): returns the str "" (not b"") when the path is missing,
        while the happy path returns bytes — confirm callers tolerate this.
        """
        binary = ""
        if os.path.isfile(self._file_path):
            with open(self._file_path, "rb") as inf:
                binary = inf.read()
        return binary

    def _loadFile(self, buffer=None):
        """Populate raw/mapped data and metadata, optionally from an in-memory buffer."""
        self._raw_data = buffer if buffer is not None else self._loadRawFileContent()
        if self._map_file:
            for loader in self.file_loaders:
                if loader.isCompatible(self._raw_data):
                    self._data = loader.mapBinary(self._raw_data)
                    self._base_addr = loader.getBaseAddress(self._raw_data)
                    self._bitness = loader.getBitness(self._raw_data)
                    self._code_areas = loader.getCodeAreas(self._raw_data)
                    self._architecture = loader.getArchitecture(self._raw_data)
                    break
        else:
            # no mapping requested: expose the raw bytes directly
            self._data = self._raw_data

    def getData(self):
        return self._data

    def getRawData(self):
        return self._raw_data

    def getBaseAddress(self):
        return self._base_addr

    def getArchitecture(self):
        return self._architecture

    def getBitness(self):
        return self._bitness

    def getCodeAreas(self):
        return self._code_areas


# --- smda/utility/MachoFileLoader.py ---
import logging

LOGGER = logging.getLogger(__name__)

# LIEF is an optional dependency; without it MachO parsing is disabled entirely
LIEF_AVAILABLE = False
try:
    import lief
    lief.logging.disable()
    LIEF_AVAILABLE = True
except:
    LOGGER.warning("LIEF not available, will not be able to parse data from MachO files.")


def align(v, alignment):
    """Round v up to the next multiple of alignment."""
    remainder = v % alignment
    if remainder == 0:
        return v
def align(v, alignment):
    """Round v up to the next multiple of alignment.

    Restated in full here so the unit is self-contained.
    """
    remainder = v % alignment
    if remainder == 0:
        return v
    return v + (alignment - remainder)


class MachoFileLoader(object):
    """Parses MachO binaries via LIEF: base address, flat memory mapping, and basic metadata."""

    @staticmethod
    def isCompatible(data):
        """Return True if data starts with a (32 or 64 bit, little-endian) MachO magic and LIEF is usable."""
        if not LIEF_AVAILABLE:
            return False
        # check for MachO magic
        return data[:4] == b"\xCE\xFA\xED\xFE" or data[:4] == b"\xCF\xFA\xED\xFE"

    @staticmethod
    def getBaseAddress(binary):
        """Determine the base address as the minimum of imagebase and all section (va - offset) deltas."""
        macho_file = lief.parse(binary)
        base_addr = 0
        candidates = [0xFFFFFFFFFFFFFFFF, macho_file.imagebase]
        for section in macho_file.sections:
            if section.virtual_address:
                candidates.append(section.virtual_address - section.offset)
        if len(candidates) > 1:
            base_addr = min(candidates)
        return base_addr

    @staticmethod
    def mapBinary(binary):
        """
        map the MachO file sections and segments into a contiguous bytearray
        as if into virtual memory with the given base address.
        """
        macho_file = lief.parse(binary)
        base_addr = MachoFileLoader.getBaseAddress(binary)

        LOGGER.debug("MachO: base address: 0x%x", base_addr)

        # a segment may contain 0 or more sections.
        # ref: https://stackoverflow.com/a/14382477/87207
        #
        # i'm not sure if a section may be found outside of a segment.
        # therefore, lets load segments first, and then load sections over them.
        # we expect the section data to overwrite the segment data; however,
        # it should be exactly the same data.

        # find min and max virtual addresses.
        max_virtual_address = 0
        min_virtual_address = 0xFFFFFFFFFFFFFFFF
        min_raw_offset = 0xFFFFFFFFFFFFFFFF

        # find begin of the first section/segment and end of the last section/segment.
        for section in sorted(macho_file.sections, key=lambda section: section.size, reverse=True):
            if not section.virtual_address:
                continue
            max_virtual_address = max(max_virtual_address, section.size + section.virtual_address)
            min_virtual_address = min(min_virtual_address, section.virtual_address)
            min_raw_offset = min(min_raw_offset, section.offset)

        for segment in macho_file.segments:
            if not segment.virtual_address:
                continue
            max_virtual_address = max(max_virtual_address, segment.virtual_size + segment.virtual_address)
            min_virtual_address = min(min_virtual_address, segment.virtual_address)
            min_raw_offset = min(min_raw_offset, segment.file_offset)

        if not max_virtual_address:
            LOGGER.debug("MachO: no section or segment data")
            return bytes()

        # create mapped region.
        # offset 0x0 corresponds to the MachO base address
        virtual_size = max_virtual_address - base_addr
        LOGGER.debug("MachO: max virtual section offset: 0x%x", max_virtual_address)
        LOGGER.debug("MachO: mapped size: 0x%x", virtual_size)
        LOGGER.debug("MachO: min raw offset: 0x%x", min_raw_offset)
        mapped_binary = bytearray(align(virtual_size, 0x1000))

        # map segments first (they may contain sections), largest first
        # because some segments may overlap.
        # technically, we should only have to load PT_LOAD segments,
        # but we do all of them here.
        for segment in sorted(macho_file.segments, key=lambda segment: segment.file_size, reverse=True):
            if not segment.virtual_address:
                continue
            rva = segment.virtual_address - base_addr
            LOGGER.debug("MachO: mapping segment of 0x%04x bytes at 0x%08x-0x%08x (0x%08x)", segment.file_size, rva, rva + segment.file_size, segment.virtual_address)
            assert len(segment.content) == segment.file_size
            mapped_binary[rva:rva + segment.file_size] = segment.content

        # map sections.
        # may overwrite some segment data, but we expect the content to be identical.
        for section in sorted(macho_file.sections, key=lambda section: section.size, reverse=True):
            if not section.virtual_address:
                continue
            rva = section.virtual_address - base_addr
            LOGGER.debug("MachO: mapping section of 0x%04x bytes at 0x%08x-0x%08x (0x%08x)", section.size, rva, rva + section.size, section.virtual_address)
            # section may be empty or smaller, so we may not always copy data here
            if len(section.content) == section.size:
                mapped_binary[rva:rva + section.size] = section.content

        # map header.
        # we consider the headers to be any data found before the first section/segment
        if min_raw_offset != 0:
            LOGGER.debug("MachO: mapping 0x%x bytes of header at 0x0 (0x%x)", min_raw_offset, base_addr)
            mapped_binary[0:min_raw_offset] = binary[0:min_raw_offset]

        LOGGER.debug("MachO: final mapped size: 0x%x", len(mapped_binary))
        return bytes(mapped_binary)

    @staticmethod
    def getArchitecture(binary):
        """Return "intel" or "arm" based on the MachO CPU type; raises NotImplementedError otherwise."""
        # TODO add machine types whenever we add more architectures
        macho_file = lief.parse(binary)
        machine_type = macho_file.header.cpu_type
        if machine_type in [lief.MachO.Header.CPU_TYPE.X86_64, lief.MachO.Header.CPU_TYPE.X86]:
            return "intel"
        # BUGFIX: this branch used `machine_type == [ARM64, ARM]`, comparing an enum
        # value against a list — always False, so ARM binaries unconditionally fell
        # through to NotImplementedError. `in` mirrors the intel branch above.
        elif machine_type in [lief.MachO.Header.CPU_TYPE.ARM64, lief.MachO.Header.CPU_TYPE.ARM]:
            return "arm"
        raise NotImplementedError("SMDA does not support this architecture yet.")

    @staticmethod
    def getBitness(binary):
        """Return 64/32 for x86 CPU types; raises for ARM64; 0 for anything else."""
        # TODO add machine types whenever we add more architectures
        macho_file = lief.parse(binary)
        machine_type = macho_file.header.cpu_type
        if machine_type == lief.MachO.Header.CPU_TYPE.X86_64:
            return 64
        elif machine_type == lief.MachO.Header.CPU_TYPE.X86:
            return 32
        elif machine_type == lief.MachO.Header.CPU_TYPE.ARM64:
            raise NotImplementedError("SMDA does not support ARM yet.")
        return 0

    @staticmethod
    def mergeCodeAreas(code_areas):
        """Merge adjacent [start, end] areas (where one area's end equals the next one's start)."""
        merged_code_areas = sorted(code_areas)
        index = 0
        while index < len(merged_code_areas) - 1:
            this_area = merged_code_areas[index]
            next_area = merged_code_areas[index + 1]
            if this_area[1] != next_area[0]:
                # disjoint: advance (the dead `result` accumulator of the
                # original was removed; the merged list itself is returned)
                index += 1
            else:
                # adjacent: fuse the pair and re-examine from the same index
                merged_code_areas = merged_code_areas[:index] + [[this_area[0], next_area[1]]] + merged_code_areas[index + 2:]
        return merged_code_areas

    @staticmethod
    def getCodeAreas(binary):
        """Collect [start, end) ranges of executable sections, padded to section alignment and merged."""
        # TODO add machine types whenever we add more architectures
        macho_file = lief.parse(binary)
        ins_flags = (
            lief.MachO.Section.FLAGS.PURE_INSTRUCTIONS.value +
            lief.MachO.Section.FLAGS.SELF_MODIFYING_CODE.value +
            lief.MachO.Section.FLAGS.SOME_INSTRUCTIONS.value
        )
        code_areas = []
        for section in macho_file.sections:
            if section.flags.value & ins_flags:
                section_start = section.virtual_address
                section_size = section.size
                # pad to the section alignment so areas line up with the mapped image
                if section.alignment and section_size % section.alignment != 0:
                    section_size += section.alignment - (section_size % section.alignment)
                section_end = section_start + section_size
                code_areas.append([section_start, section_end])
        return MachoFileLoader.mergeCodeAreas(code_areas)
# --- smda/utility/MemoryFileLoader.py ---
from smda.utility.FileLoader import FileLoader


class MemoryFileLoader(FileLoader):
    """FileLoader variant that is fed from an in-memory buffer instead of a file path."""

    def __init__(self, buffer, load_file=True, map_file=False):
        # disable path-based loading in the parent, then load from the buffer directly
        super().__init__("", load_file=False, map_file=map_file)
        self._loadFile(buffer=buffer)


# --- smda/utility/PeFileLoader.py ---
import struct
import logging

import lief
lief.logging.disable()

LOG = logging.getLogger(__name__)


class PeFileLoader(object):
    """Parses PE headers with plain struct reads (lief only for architecture/code areas)."""

    # COFF machine id -> bitness
    BITNESS_MAP = {0x14c: 32, 0x8664: 64}

    @staticmethod
    def isCompatible(data):
        """Cheap check for the DOS 'MZ' magic."""
        return data[:2] == b"MZ"

    @staticmethod
    def mapBinary(binary):
        """Map the PE sections into a flat bytes object laid out by virtual offsets.

        This is a pretty rough implementation but does the job for now.
        Returns b"" when the file has no parseable sections or is larger than the cap.
        """
        mapped_binary = bytearray([])
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset:
            num_sections = 0
            bitness = 0
            section_infos = []
            # 0xF8 = PE signature + COFF header + PE32 optional header
            optional_header_size = 0xF8
            if pe_offset and len(binary) >= pe_offset + 0x8:
                num_sections = struct.unpack("H", binary[pe_offset + 0x6:pe_offset + 0x8])[0]
                bitness = PeFileLoader.getBitness(binary)
                if bitness == 64:
                    # PE32+ optional header is 0x10 bytes larger
                    optional_header_size = 0x108
            if pe_offset and num_sections and len(binary) >= pe_offset + optional_header_size + num_sections * 0x28:
                for section_index in range(num_sections):
                    section_offset = section_index * 0x28
                    # VirtualSize/VirtualAddress/SizeOfRawData/PointerToRawData of the section header
                    slice_start = pe_offset + optional_header_size + section_offset + 0x8
                    slice_end = pe_offset + optional_header_size + section_offset + 0x8 + 0x10
                    virt_size, virt_offset, raw_size, raw_offset = struct.unpack("IIII", binary[slice_start:slice_end])
                    section_info = {
                        "section_index": section_index,
                        "virt_size": virt_size,
                        "virt_offset": virt_offset,
                        "raw_size": raw_size,
                        "raw_offset": raw_offset,
                    }
                    section_infos.append(section_info)
            max_virt_section_offset = 0
            min_raw_section_offset = 0xFFFFFFFF
            if section_infos:
                for section_info in section_infos:
                    max_virt_section_offset = max(max_virt_section_offset, section_info["virt_size"] + section_info["virt_offset"])
                    max_virt_section_offset = max(max_virt_section_offset, section_info["raw_size"] + section_info["virt_offset"])
                    if section_info["raw_offset"] > 0x200:
                        min_raw_section_offset = min(min_raw_section_offset, section_info["raw_offset"])
                # NOTE(review): if no section has raw_offset > 0x200, min_raw_section_offset
                # stays 0xFFFFFFFF and the header copy below copies the whole file — confirm.
            # support up to 100MB for now.
            if max_virt_section_offset and max_virt_section_offset < 100 * 1024 * 1024:
                mapped_binary = bytearray([0] * max_virt_section_offset)
                # everything before the first "real" section is treated as header data
                mapped_binary[0:min_raw_section_offset] = binary[0:min_raw_section_offset]
                for section_info in section_infos:
                    mapped_from = section_info["virt_offset"]
                    mapped_to = section_info["virt_offset"] + section_info["raw_size"]
                    mapped_binary[mapped_from:mapped_to] = binary[section_info["raw_offset"]:section_info["raw_offset"] + section_info["raw_size"]]
                    LOG.debug("Mapping %d: raw 0x%x (0x%x bytes) -> virtual 0x%x (0x%x bytes)",
                              section_info["section_index"],
                              section_info["raw_offset"],
                              section_info["raw_size"],
                              section_info["virt_offset"],
                              section_info["virt_size"])
        LOG.debug("Mapped binary of size %d bytes (%d sections) to memory view of size %d bytes", len(binary), num_sections, len(mapped_binary))
        return bytes(mapped_binary)

    @staticmethod
    def getBitness(binary):
        """Return 32/64 from the COFF machine field, or 0 when unknown/unreadable."""
        bitness_id = 0
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset:
            if pe_offset and len(binary) >= pe_offset + 0x6:
                bitness_id = struct.unpack("H", binary[pe_offset + 0x4:pe_offset + 0x6])[0]
        return PeFileLoader.BITNESS_MAP.get(bitness_id, 0)

    @staticmethod
    def getBaseAddress(binary):
        """Return ImageBase from the optional header (PE32: +0x34 DWORD, PE32+: +0x30 QWORD), or 0."""
        base_addr = 0
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset and len(binary) >= pe_offset + 0x38:
            if PeFileLoader.getBitness(binary) == 32:
                base_addr = struct.unpack("I", binary[pe_offset + 0x34:pe_offset + 0x38])[0]
            elif PeFileLoader.getBitness(binary) == 64:
                base_addr = struct.unpack("Q", binary[pe_offset + 0x30:pe_offset + 0x38])[0]
        if base_addr:
            LOG.debug("Changing base address from 0 to: 0x%x for inference of reference counts (based on PE header)", base_addr)
        return base_addr

    @staticmethod
    def getPeOffset(binary):
        """Return e_lfanew (offset of the PE header) from the DOS header, or 0 if too short."""
        if len(binary) >= 0x40:
            # BUGFIX: e_lfanew at 0x3C is a 4-byte field per the PE/COFF spec; reading
            # it as a 2-byte "H" truncated offsets > 0xFFFF. The existing length guard
            # (>= 0x40) already covers all four bytes.
            pe_offset = struct.unpack("I", binary[0x3c:0x3c + 4])[0]
            return pe_offset
        return 0

    @staticmethod
    def getOEP(binary):
        """Return AddressOfEntryPoint (RVA) from the optional header, or 0."""
        oep_rva = 0
        if PeFileLoader.checkPe(binary):
            pe_offset = PeFileLoader.getPeOffset(binary)
            if pe_offset and len(binary) >= pe_offset + 0x2c:
                oep_rva = struct.unpack("I", binary[pe_offset + 0x28:pe_offset + 0x2C])[0]
        return oep_rva

    @staticmethod
    def getArchitecture(binary):
        """Return "cil" when a non-empty CLR runtime header directory exists, else "intel"."""
        architecture = "intel"
        pefile = lief.parse(binary)
        if pefile:
            for d in pefile.data_directories:
                if d.type == lief.PE.DataDirectory.TYPES.CLR_RUNTIME_HEADER:
                    if d.size > 0:
                        architecture = "cil"
        return architecture

    @staticmethod
    def checkPe(binary):
        """Return True if the file has a PE header with a known COFF machine id."""
        pe_offset = PeFileLoader.getPeOffset(binary)
        if pe_offset and len(binary) >= pe_offset + 6:
            bitness = struct.unpack("H", binary[pe_offset + 4:pe_offset + 4 + 2])[0]
            return bitness in PeFileLoader.BITNESS_MAP
        return False

    @staticmethod
    def getCodeAreas(binary):
        """Collect absolute [start, end) ranges of MEM_EXECUTE sections, padded to 0x1000 and merged."""
        pefile = lief.parse(binary)
        code_areas = []
        base_address = PeFileLoader.getBaseAddress(binary)
        if pefile and pefile.sections:
            for section in pefile.sections:
                # MEM_EXECUTE
                if section.characteristics & 0x20000000:
                    section_start = base_address + section.virtual_address
                    section_size = section.virtual_size
                    if section_size % 0x1000 != 0:
                        section_size += 0x1000 - (section_size % 0x1000)
                    section_end = section_start + section_size
                    code_areas.append([section_start, section_end])
        return PeFileLoader.mergeCodeAreas(code_areas)

    @staticmethod
    def mergeCodeAreas(code_areas):
        """Merge adjacent [start, end] areas (end of one equals start of the next)."""
        merged_code_areas = sorted(code_areas)
        index = 0
        while index < len(merged_code_areas) - 1:
            this_area = merged_code_areas[index]
            next_area = merged_code_areas[index + 1]
            if this_area[1] != next_area[0]:
                # disjoint: advance (dead `result` accumulator of the original removed)
                index += 1
            else:
                merged_code_areas = merged_code_areas[:index] + [[this_area[0], next_area[1]]] + merged_code_areas[index + 2:]
        return merged_code_areas


# --- smda/utility/PriorityQueue.py ---
import heapq

class PriorityQueue(object):
    """Max-priority queue built on CPython's private max-heap helpers.

    NOTE(review): heapq._heapify_max / _siftup_max / _siftdown_max are private
    CPython functions without a stability guarantee — confirm availability on
    target Python versions.
    """
    def __init__(self, content=None):
        if content is None:
            content = []
        self.heap = content
        if self.heap:
            self.update()

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Pop and return the maximum element; raises StopIteration when empty."""
        if not self.heap:
            raise StopIteration
        if len(self.heap) == 1:
            return self.heap.pop()
        # classic heap pop: move the last leaf to the root, then sift it down
        last_item = self.heap.pop()
        result = self.heap[0]
        self.heap[0] = last_item
        heapq._siftup_max(self.heap, 0)
        return result

    def add(self, element):
        """Insert an element, restoring the max-heap invariant."""
        self.heap.append(element)
        heapq._siftdown_max(self.heap, 0, len(self.heap)-1)

    def update(self, target_candidate=None):
        """Re-heapify the whole queue; the target_candidate argument is currently unused here."""
        if target_candidate is None:
            heapq._heapify_max(self.heap)

    def __str__(self):
        return str(self.heap)
14 | """ 15 | 16 | rva = va - smda_report.base_addr 17 | if smda_report.buffer is None: 18 | raise ValueError("buffer is empty") 19 | buffer_end = len(smda_report.buffer) 20 | max_bytes = num_bytes if num_bytes is not None else 0x100 21 | if rva + max_bytes > buffer_end: 22 | return smda_report.buffer[rva:] 23 | else: 24 | return smda_report.buffer[rva : rva + max_bytes] 25 | 26 | 27 | def derefs(smda_report, p): 28 | """ 29 | recursively follow the given pointer, yielding the valid memory addresses along the way. 30 | useful when you may have a pointer to string, or pointer to pointer to string, etc. 31 | 32 | this is a "do what i mean" type of helper function. 33 | 34 | based on the implementation in viv/insn.py 35 | """ 36 | depth = 0 37 | while True: 38 | if not smda_report.isAddrWithinMemoryImage(p): 39 | return 40 | yield p 41 | 42 | bytes_ = read_bytes(smda_report, p, num_bytes=4) 43 | val = struct.unpack("I", bytes_)[0] 44 | 45 | # sanity: pointer points to self 46 | if val == p: 47 | return 48 | 49 | # sanity: avoid chains of pointers that are unreasonably deep 50 | depth += 1 51 | if depth > 10: 52 | return 53 | 54 | p = val 55 | 56 | 57 | def detect_ascii_len(smda_report, offset): 58 | if smda_report.buffer is None: 59 | return 0 60 | ascii_len = 0 61 | rva = offset - smda_report.base_addr 62 | char = smda_report.buffer[rva] 63 | while char < 127 and chr(char) in string.printable: 64 | ascii_len += 1 65 | rva += 1 66 | char = smda_report.buffer[rva] 67 | if char == 0: 68 | return ascii_len 69 | return 0 70 | 71 | 72 | def detect_unicode_len(smda_report, offset): 73 | if smda_report.buffer is None: 74 | return 0 75 | unicode_len = 0 76 | rva = offset - smda_report.base_addr 77 | char = smda_report.buffer[rva] 78 | second_char = smda_report.buffer[rva + 1] 79 | while char < 127 and chr(char) in string.printable and second_char == 0: 80 | unicode_len += 2 81 | rva += 2 82 | char = smda_report.buffer[rva] 83 | second_char = smda_report.buffer[rva + 1] 84 | 
if char == 0 and second_char == 0: 85 | return unicode_len 86 | return 0 87 | 88 | 89 | def read_string(smda_report, offset): 90 | alen = detect_ascii_len(smda_report, offset) 91 | if alen > 1: 92 | return read_bytes(smda_report, offset, alen).decode("utf-8") 93 | ulen = detect_unicode_len(smda_report, offset) 94 | if ulen > 2: 95 | return read_bytes(smda_report, offset, ulen).decode("utf-16") 96 | 97 | 98 | def extract_strings(f: SmdaFunction) -> Iterator[Tuple[str, int]]: 99 | """parse string features from the given instruction.""" 100 | for insn in f.getInstructions(): 101 | for data_ref in insn.getDataRefs(): 102 | for v in derefs(f.smda_report, data_ref): 103 | string_read = read_string(f.smda_report, v) 104 | if string_read: 105 | yield string_read.rstrip("\x00"), insn.offset 106 | -------------------------------------------------------------------------------- /smda/utility/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/__init__.py -------------------------------------------------------------------------------- /tests/asprox_0x008D0000_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/asprox_0x008D0000_xored -------------------------------------------------------------------------------- /tests/bashlite_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/bashlite_xored 
-------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | 7 | import smda 8 | from smda.SmdaConfig import SmdaConfig 9 | config = SmdaConfig() 10 | config.API_COLLECTION_FILES = {"winxp": config.PROJECT_ROOT + os.sep + "data" + os.sep + "apiscout_winxp_prof_sp3.json"} 11 | -------------------------------------------------------------------------------- /tests/cutwail_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/cutwail_xored -------------------------------------------------------------------------------- /tests/komplex_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/komplex_xored -------------------------------------------------------------------------------- /tests/njrat_xored: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielplohmann/smda/16f1a82dec86db354711c292e70e0aa21b30957a/tests/njrat_xored -------------------------------------------------------------------------------- /tests/testEscaper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import unittest 6 | 7 | from smda.common.SmdaReport import SmdaReport 8 | from smda.common.SmdaFunction import SmdaFunction 9 | from smda.common.SmdaInstruction import SmdaInstruction 10 | from smda.intel.IntelInstructionEscaper import IntelInstructionEscaper 11 | from smda.cil.CilInstructionEscaper import 
CilInstructionEscaper 12 | 13 | from .context import config 14 | 15 | LOG = logging.getLogger(__name__) 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 17 | logging.disable(logging.CRITICAL) 18 | 19 | 20 | class DisassemblyTestSuite(unittest.TestCase): 21 | """Run a full example on a memory dump""" 22 | 23 | def testInstructionEscaping(self): 24 | test_data = [ 25 | {"ins": (0, "55", "push", "ebp"), "mnemonic_group": "S", "escaped_operands": "REG"}, 26 | {"ins": (1, "8365fc00", "and", "dword ptr [ebp - 4], 0"), "mnemonic_group": "A", "escaped_operands": "PTR, CONST"}, 27 | {"ins": (2, "f30f1efa", "endbr64", ""), "mnemonic_group": "C", "escaped_operands": ""}, 28 | {"ins": (3, "c58e5ad3", "vcvtss2sd", "xmm2, xmm14, xmm3"), "mnemonic_group": "X", "escaped_operands": "XREG, XREG, XREG"}, 29 | ] 30 | for data in test_data: 31 | smda_ins = SmdaInstruction(data["ins"]) 32 | self.assertEqual(smda_ins.getMnemonicGroup(IntelInstructionEscaper), data["mnemonic_group"]) 33 | self.assertEqual(smda_ins.getEscapedOperands(IntelInstructionEscaper), data["escaped_operands"]) 34 | 35 | def testIntelInstructionWildcarding(self): 36 | test_data = [ 37 | # simple mov with IMM outside of address space 38 | {"ins": (0, "b803400080", "mov", "eax, 0x80004003"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "b803400080", "bitness": 32, "expected_opc": "b8????????"}, 39 | # simple mov with IMM within address space 40 | {"ins": (0, "ba2c893863", "mov", "edx, 0x6338892c"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "ba????????", "bitness": 32, "expected_opc": "ba????????"}, 41 | # mov with with address calc within address space 42 | {"ins": (0, "0fb681808f3b63", "mov", "eax, byte ptr [ecx + 0x633b8f80]"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "0fb681????????", "bitness": 32, "expected_opc": "0fb6??????????"}, 43 | # jump table calculation 44 | {"ins": (0, "ff2485788f3b63", "jmp", "dword ptr [eax*4 + 
0x633b8f78]"), "lower": 0x63300000, "upper": 0x63400000, "expected_bin": "ff2485????????", "bitness": 32, "expected_opc": "ff????????????"}, 45 | # should only wildcard last part as escaper doesn't know address space 46 | {"ins": (0, "c705ac974a00ac974a00", "mov", "dword ptr [0x4a97ac], 0x4a97ac"), "lower": None, "upper": None, "expected_bin": "c705ac974a00????????", "bitness": 32, "expected_opc": "c7??????????????????"}, 47 | # should escape both operands 48 | {"ins": (0, "c705ac974a00ac974a00", "mov", "dword ptr [0x4a97ac], 0x4a97ac"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "c705????????????????", "bitness": 32, "expected_opc": "c7??????????????????"}, 49 | # should escape from the right side and only blank out one, despite finding two matches for the pattern 50 | {"ins": (0, "010505050505", "add", "dword ptr [0x5050505], eax"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "0105????????", "bitness": 32, "expected_opc": "01??????????"}, 51 | # should escape from the right side and only blank out one, despite finding two matches for the pattern 52 | {"ins": (0, "0f101515151515", "movups", "xmm2, xmmword ptr [0x15151515]"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "0f1015????????", "bitness": 32, "expected_opc": "0f10??????????"}, 53 | # should ignore prefixes while wildcarding 54 | {"ins": (0, "666666660f008000224000", "sldt", "word ptr [rax + 0x402200]"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "666666660f0080????????", "bitness": 32, "expected_opc": "666666660f00??????????"}, 55 | # should ignore prefixes and REX while wildcarding 56 | {"ins": (0, "66666666480f008000224000", "sldt", "word ptr [rax + 0x402200]"), "lower": 0x400000, "upper": 0x4f0000, "expected_bin": "66666666480f0080????????", "bitness": 64, "expected_opc": "66666666480f00??????????"}, 57 | ] 58 | for data in test_data: 59 | smda_report = SmdaReport() 60 | smda_report.bitness = data["bitness"] 61 | smda_function = 
SmdaFunction(smda_report=smda_report) 62 | smda_ins = SmdaInstruction(data["ins"], smda_function=smda_function) 63 | self.assertEqual(smda_ins.getEscapedBinary(IntelInstructionEscaper, lower_addr=data["lower"], upper_addr=data["upper"]), data["expected_bin"]) 64 | self.assertEqual(smda_ins.getEscapedToOpcodeOnly(IntelInstructionEscaper), data["expected_opc"]) 65 | 66 | def testCilInstructionWildcarding(self): 67 | test_data = [ 68 | # call MemberRef 69 | {"ins": (0, "280a000006", "call", "SomeFunc"), "expected_bin": "28??????06", "expected_bin_intraprocedural": "28??????06", "expected_opc": "28????????", "bitness": 32}, 70 | {"ins": (0, "6fbb00000a", "callvirt", "SomeFunc"), "expected_bin": "6f??????0a", "expected_bin_intraprocedural": "6f??????0a", "expected_opc": "6f????????", "bitness": 32}, 71 | {"ins": (0, "2d3a", "brtrue.s", "0x5994"), "expected_bin": "2d3a", "expected_bin_intraprocedural": "2d??", "expected_opc": "2d??", "bitness": 32}, 72 | {"ins": (0, "450300000002000000060000000a000000", "switch", "[(0D50), (0D54), (0D58)]"), "expected_bin": "450300000002000000060000000a000000", "expected_bin_intraprocedural": "45????????????????????????????????", "expected_opc": "45????????????????????????????????", "bitness": 32}, 73 | {"ins": (0, "20c48efb0e", "ldc.i4", "0xefb8ec4"), "expected_bin": "20c48efb0e", "expected_bin_intraprocedural": "20c48efb0e", "expected_opc": "20????????", "bitness": 32}, 74 | ] 75 | for data in test_data: 76 | smda_report = SmdaReport() 77 | smda_report.bitness = data["bitness"] 78 | smda_function = SmdaFunction(smda_report=smda_report) 79 | smda_ins = SmdaInstruction(data["ins"], smda_function=smda_function) 80 | self.assertEqual(CilInstructionEscaper.escapeToOpcodeOnly(smda_ins), data["expected_opc"]) 81 | self.assertEqual(CilInstructionEscaper.escapeBinary(smda_ins), data["expected_bin"]) 82 | self.assertEqual(CilInstructionEscaper.escapeBinary(smda_ins, escape_intraprocedural_jumps=True), data["expected_bin_intraprocedural"]) 83 | 
84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/testFileFormatParsers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import lief 6 | import unittest 7 | 8 | from smda.utility.FileLoader import FileLoader 9 | from smda.common.BinaryInfo import BinaryInfo 10 | from smda.Disassembler import Disassembler 11 | from smda.common.SmdaReport import SmdaReport 12 | from smda.common.SmdaFunction import SmdaFunction 13 | from .context import config 14 | 15 | LOG = logging.getLogger(__name__) 16 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 17 | logging.disable(logging.CRITICAL) 18 | 19 | 20 | class SmdaIntegrationTestSuite(unittest.TestCase): 21 | """Run a full example on a memory dump""" 22 | 23 | @classmethod 24 | def setUpClass(cls): 25 | super(SmdaIntegrationTestSuite, cls).setUpClass() 26 | 27 | def testPeParsingWithCutwail(self): 28 | disasm = Disassembler(config, backend="intel") 29 | # load encrypted malicious win.cutwail 30 | with open(os.path.join(config.PROJECT_ROOT, "tests", "cutwail_xored"), "rb") as f_binary: 31 | binary = f_binary.read() 32 | decrypted_cutwail = bytearray() 33 | for index, byte in enumerate(binary): 34 | if isinstance(byte, str): 35 | byte = ord(byte) 36 | decrypted_cutwail.append(byte ^ (index % 256)) 37 | cutwail_binary = bytes(decrypted_cutwail) 38 | # run FileLoader and disassemble as file 39 | loader = FileLoader("/", map_file=True) 40 | loader._loadFile(cutwail_binary) 41 | file_content = loader.getData() 42 | binary_info = BinaryInfo(file_content) 43 | binary_info.raw_data = loader.getRawData() 44 | binary_info.file_path = "" 45 | binary_info.base_addr = loader.getBaseAddress() 46 | binary_info.bitness = loader.getBitness() 47 | binary_info.code_areas = loader.getCodeAreas() 48 | binary_info.oep = 
binary_info.getOep() 49 | cutwail_binary_info = binary_info 50 | # parse bytes of 0x400 truncated PE header 51 | pe_header = lief.parse(binary_info.getHeaderBytes()) 52 | assert pe_header.dos_header.magic == 0x5A4D 53 | assert pe_header.header.machine == 0x14C 54 | cutwail_disassembly = disasm._disassemble(binary_info) 55 | cutwail_unmapped_disassembly = disasm.disassembleUnmappedBuffer(cutwail_binary) 56 | assert cutwail_unmapped_disassembly.num_functions == 33 57 | # TODO test label extraction for PE, add another binary for testing 58 | 59 | def testElfParsingWithBashlite(self): 60 | disasm = Disassembler(config, backend="intel") 61 | # load encrypted benign /bin/cat 62 | with open(os.path.join(config.PROJECT_ROOT, "tests", "bashlite_xored"), "rb") as f_binary: 63 | binary = f_binary.read() 64 | decrypted_bashlite = bytearray() 65 | for index, byte in enumerate(binary): 66 | if isinstance(byte, str): 67 | byte = ord(byte) 68 | decrypted_bashlite.append(byte ^ (index % 256)) 69 | bashlite_binary = bytes(decrypted_bashlite) 70 | # run FileLoader and disassemble as file 71 | loader = FileLoader("/", map_file=True) 72 | loader._loadFile(bashlite_binary) 73 | file_content = loader.getData() 74 | binary_info = BinaryInfo(file_content) 75 | binary_info.raw_data = loader.getRawData() 76 | binary_info.file_path = "" 77 | binary_info.base_addr = loader.getBaseAddress() 78 | binary_info.bitness = loader.getBitness() 79 | binary_info.code_areas = loader.getCodeAreas() 80 | binary_info.oep = binary_info.getOep() 81 | bashlite_binary_info = binary_info 82 | bashlite_disassembly = disasm._disassemble(binary_info) 83 | bashlite_unmapped_disassembly = disasm.disassembleUnmappedBuffer(bashlite_binary) 84 | assert bashlite_unmapped_disassembly.num_functions == 177 85 | assert len([f.function_name for f in bashlite_unmapped_disassembly.getFunctions() if f.function_name]) == 174 86 | 87 | def testDotnetParsingWithNjRAT(self): 88 | disasm = Disassembler(config, backend="cil") 89 | # 
load encrypted malicious win.cutwail 90 | with open(os.path.join(config.PROJECT_ROOT, "tests", "njrat_xored"), "rb") as f_binary: 91 | binary = f_binary.read() 92 | decrypted_njrat = bytearray() 93 | for index, byte in enumerate(binary): 94 | if isinstance(byte, str): 95 | byte = ord(byte) 96 | decrypted_njrat.append(byte ^ (index % 256)) 97 | njrat_binary = bytes(decrypted_njrat) 98 | # run FileLoader and disassemble as file 99 | njrat_unmapped_disassembly = disasm.disassembleUnmappedBuffer(njrat_binary) 100 | assert njrat_unmapped_disassembly.num_functions == 64 101 | assert len([f.function_name for f in njrat_unmapped_disassembly.getFunctions() if f.function_name]) == 64 102 | 103 | def testMacOsParsingWithKomplex(self): 104 | disasm = Disassembler(config, backend="intel") 105 | # load encrypted malicious osx.komplex 106 | with open(os.path.join(config.PROJECT_ROOT, "tests", "komplex_xored"), "rb") as f_binary: 107 | binary = f_binary.read() 108 | decrypted_komplex = bytearray() 109 | for index, byte in enumerate(binary): 110 | if isinstance(byte, str): 111 | byte = ord(byte) 112 | decrypted_komplex.append(byte ^ (index % 256)) 113 | komplex_binary = bytes(decrypted_komplex) 114 | # run FileLoader and disassemble as file 115 | loader = FileLoader("/", map_file=True) 116 | loader._loadFile(komplex_binary) 117 | file_content = loader.getData() 118 | binary_info = BinaryInfo(file_content) 119 | binary_info.raw_data = loader.getRawData() 120 | binary_info.file_path = "" 121 | binary_info.base_addr = loader.getBaseAddress() 122 | binary_info.bitness = loader.getBitness() 123 | binary_info.code_areas = loader.getCodeAreas() 124 | binary_info.oep = binary_info.getOep() 125 | komplex_binary_info = binary_info 126 | komplex_disassembly = disasm._disassemble(binary_info) 127 | komplex_unmapped_disassembly = disasm.disassembleUnmappedBuffer(komplex_binary) 128 | komplex_unmapped_disassembly.num_functions == 208 129 | 130 | 131 | if __name__ == '__main__': 132 | 
unittest.main() 133 | -------------------------------------------------------------------------------- /tests/testIntegration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import unittest 6 | 7 | from smda.utility.FileLoader import FileLoader 8 | from smda.common.BinaryInfo import BinaryInfo 9 | from smda.Disassembler import Disassembler 10 | from smda.common.SmdaReport import SmdaReport 11 | from smda.common.SmdaFunction import SmdaFunction 12 | from .context import config 13 | 14 | LOG = logging.getLogger(__name__) 15 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 16 | logging.disable(logging.CRITICAL) 17 | 18 | 19 | class SmdaIntegrationTestSuite(unittest.TestCase): 20 | """Run a full example on a memory dump""" 21 | 22 | @classmethod 23 | def setUpClass(cls): 24 | super(SmdaIntegrationTestSuite, cls).setUpClass() 25 | config.WITH_STRINGS = True 26 | disasm = Disassembler(config) 27 | # load encrypted Asprox 28 | with open(os.path.join(config.PROJECT_ROOT, "tests", "asprox_0x008D0000_xored"), "rb") as f_binary: 29 | binary = f_binary.read() 30 | decrypted_asprox = bytearray() 31 | for index, byte in enumerate(binary): 32 | if isinstance(byte, str): 33 | byte = ord(byte) 34 | decrypted_asprox.append(byte ^ (index % 256)) 35 | cls.asprox_binary = decrypted_asprox 36 | cls.asprox_disassembly = disasm.disassembleBuffer(bytes(decrypted_asprox), 0x8D0000) 37 | # load encrypted Cutwail 38 | with open(os.path.join(config.PROJECT_ROOT, "tests", "cutwail_xored"), "rb") as f_binary: 39 | binary = f_binary.read() 40 | decrypted_cutwail = bytearray() 41 | for index, byte in enumerate(binary): 42 | if isinstance(byte, str): 43 | byte = ord(byte) 44 | decrypted_cutwail.append(byte ^ (index % 256)) 45 | cls.cutwail_binary = bytes(decrypted_cutwail) 46 | # run FileLoader and disassemble as file 47 | loader = FileLoader("/", map_file=True) 48 | 
loader._loadFile(cls.cutwail_binary) 49 | file_content = loader.getData() 50 | binary_info = BinaryInfo(file_content) 51 | binary_info.raw_data = loader.getRawData() 52 | binary_info.file_path = "" 53 | binary_info.base_addr = loader.getBaseAddress() 54 | binary_info.bitness = loader.getBitness() 55 | binary_info.code_areas = loader.getCodeAreas() 56 | binary_info.oep = binary_info.getOep() 57 | cls.cutwail_binary_info = binary_info 58 | cls.cutwail_disassembly = disasm._disassemble(binary_info) 59 | cls.cutwail_unmapped_disassembly = disasm.disassembleUnmappedBuffer(cls.cutwail_binary) 60 | 61 | def testAsproxDisassemblyCoverage(self): 62 | assert len([fn for fn in self.asprox_disassembly.getFunctions()]) == 105 63 | 64 | def testOep(self): 65 | # PE header from buffers are not parsed, so we don't get header infos 66 | assert self.asprox_disassembly.oep == None 67 | # PE headers are parsed for regularly processed files (PE+ELF) 68 | assert self.cutwail_unmapped_disassembly.oep == 0x1730 69 | 70 | def testCodeXrefCreation(self): 71 | example_function = self.asprox_disassembly.getFunction(0x008d8292) 72 | # should be initialized on demand only 73 | assert example_function.code_inrefs == None 74 | # example function has inrefs and outrefs 75 | inrefs = [code_inref for code_inref in example_function.getCodeInrefs()] 76 | assert len(inrefs) == 1 77 | for xref in example_function.getCodeInrefs(): 78 | print(xref.from_function, xref.from_instruction, xref.to_function, xref.to_instruction) 79 | outrefs = [code_outref for code_outref in example_function.getCodeOutrefs()] 80 | assert len(outrefs) == 10 81 | 82 | def testAsproxStringRefs(self): 83 | function_with_strings = self.asprox_disassembly.getFunction(0x008d2850) 84 | assert function_with_strings.stringrefs[9251000] == "Software" 85 | marshalled = function_with_strings.toDict() 86 | unmarshalled = SmdaFunction.fromDict(marshalled) 87 | assert unmarshalled.stringrefs[9251000] == "Software" 88 | 89 | def 
testAsproxApiCoverage(self): 90 | num_api_ref_srcs = 0 91 | api_ref_dsts = set() 92 | for fn in self.asprox_disassembly.getFunctions(): 93 | num_api_ref_srcs += len(fn.apirefs) 94 | api_ref_dsts.update(fn.apirefs.values()) 95 | assert num_api_ref_srcs == 546 96 | assert len(api_ref_dsts) == 95 97 | 98 | def testAsproxMarshalling(self): 99 | report_as_dict = self.asprox_disassembly.toDict() 100 | assert report_as_dict["status"] == "ok" 101 | assert report_as_dict["base_addr"] == 0x8D0000 102 | assert report_as_dict["statistics"]["num_instructions"] == 15706 103 | assert report_as_dict["sha256"] == "db8a133fed1b706608a4492079b702ded6b70369a980d2b5ae355a6adc78ef00" 104 | reimported_report = SmdaReport.fromDict(report_as_dict) 105 | 106 | def testCutwailMarshalling(self): 107 | report_as_dict = self.cutwail_disassembly.toDict() 108 | assert report_as_dict["status"] == "ok" 109 | assert report_as_dict["base_addr"] == 0x4000000 110 | assert report_as_dict["statistics"]["num_instructions"] == 1611 111 | assert report_as_dict["sha256"] == "a348a0ddfab135d152b684d561a3215ab6c472570facd3d75aa2c7ee845a8e2b" 112 | # compare our manual file loading with unmapped buffer 113 | assert self.cutwail_disassembly.num_instructions == self.cutwail_unmapped_disassembly.num_instructions 114 | reimported_report = SmdaReport.fromDict(report_as_dict) 115 | 116 | def testBlockLocator(self): 117 | # test with a function start 118 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x008d8292) 119 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x008d8292) 120 | assert found_function.offset == 0x008d8292 121 | assert found_block.offset == 0x008d8292 122 | # test with an instruction in a block a bit deeper in the function 123 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x008d82a6) 124 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x008d82a6) 125 | assert found_function.offset == 0x008d8292 126 | assert 
found_block.offset == 0x008d82a4 127 | # test with an offset that is not start of an instruction 128 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x008d82a7) 129 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x008d82a7) 130 | assert found_function.offset == 0x008d8292 131 | assert found_block.offset == 0x008d82a4 132 | # test with offsets beyond image base and binary size 133 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0x100) 134 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0x100) 135 | assert found_function is None 136 | assert found_block is None 137 | found_function = self.asprox_disassembly.findFunctionByContainedAddress(0xFFFFFF00) 138 | found_block = self.asprox_disassembly.findBlockByContainedAddress(0xFFFFFF00) 139 | assert found_function is None 140 | assert found_block is None 141 | 142 | 143 | if __name__ == '__main__': 144 | unittest.main() 145 | -------------------------------------------------------------------------------- /tests/testTarjan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import logging 4 | import os 5 | import unittest 6 | 7 | from smda.common.Tarjan import Tarjan 8 | 9 | from .context import config 10 | 11 | LOG = logging.getLogger(__name__) 12 | logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") 13 | logging.disable(logging.CRITICAL) 14 | 15 | 16 | class TarjanTestSuite(unittest.TestCase): 17 | """Provoke recursion""" 18 | 19 | def testInstructionEscaping(self): 20 | test_data = {i: [] for i in range(1000)} 21 | for i in range(1, 1000): 22 | for j in range(i + 1, 1000, 1): 23 | test_data[i].append(j) 24 | test_data[1000] = [] 25 | 26 | tarjan = Tarjan(test_data) 27 | tarjan.calculateScc() 28 | sccs = tarjan.getResult() 29 | self.assertEqual(1001, len(sccs)) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | 
-------------------------------------------------------------------------------- /version_history.md: -------------------------------------------------------------------------------- 1 | # Full Version History 2 | 3 | * 2022-11-18: v1.9.16- Fixed a bug where handling of inrefs in SmdaReport could lead to crashes (THX to @1337-42!). 4 | * 2022-09-27: v1.9.15- Fixed a bug where recognition of code areas would not incorporate virtual addressing (infinite loops while Delphi VMT parsing). 5 | * 2022-09-20: v1.9.13- Fixed a bug for listing unreachable basic block refs pointing outside of function boundaries (exception handling). 6 | * 2022-09-19: v1.9.12- Fixed a logic binding bug in IntelInstructionEscaper (THX to @1337-42!). 7 | * 2022-09-08: v1.9.11- Exposed masking of intraprocedural jmps/calls in SmdaInstruction. 8 | * 2022-08-31: v1.9.9 - Better handling of colliding code due to tailjumps. 9 | * 2022-08-30: v1.9.8 - Improved accuracy for references around tailcalls. 10 | * 2022-08-25: v1.9.6 - Fixed bug in delphi knowledge base handling and improved performance. 11 | * 2022-08-23: v1.9.4 - Fixed bug in section padding for ELF files. 12 | * 2022-08-22: v1.9.3 - Added parsing for Delphi knowledge base files (THX to @danielenders1!). 13 | * 2022-08-22: v1.9.2 - Improved structural parsing of Delphi binaries (THX to @danielenders1!). 14 | * 2022-08-22: v1.9.3 - Added parsing for Delphi knowledge base files (THX to @danielenders1!). 15 | * 2022-08-22: v1.9.2 - Improved structural parsing of Delphi binaries (THX to @danielenders1!). 16 | * 2022-08-12: v1.9.1 - Added support for parsing intel MachO files, including Go parsing. 17 | * 2022-08-10: v1.8.5 - Fixed Go 64bit label parsing for v1.12 binaries. 18 | * 2022-08-04: v1.8.4 - Dot export now uses hex formatted addresses in node names. 19 | * 2022-08-03: v1.8.3 - Added support for producing a Dot export for SmdaFunction. 20 | * 2022-08-01: v1.8.1 - Added support for parsing 32bit Go binaries as well.
21 | * 2022-08-01: v1.8.0 - Added support for parsing Go function information (THX to @danielenders1!). 22 | * 2022-07-22: v1.7.4 - Bugfix for marshalling of reports. 23 | * 2022-07-08: v1.7.2 - Excluded overly aggressive tailcall recognition heuristics when processing Golang binaries. 24 | * 2022-01-27: v1.7.0 - SmdaReports now contains a field `oep`; SmdaFunctions now indicate `is_exported` and can provide CodeXrefs via `getCodeInrefs()` and `getCodeOutrefs()`. (THX for the ideas: @mr-tz) 25 | * 2021-08-20: v1.6.1 - Bugfix for alignment calculation of binary mappings. (THX: @williballenthin) 26 | * 2021-08-19: v1.6.0 - Bugfix for truncation during ELF segment/section loading. API usage in ELF files is now resolved as well! (THX: @williballenthin) 27 | * 2021-07-22: v1.5.19 - Now also parsing plt.sec structures to identify functions. 28 | * 2021-06-07: v1.5.18 - Bugfix for struct.pack 8byte conversion using L instead of Q (works on Linux, not on Windows). 29 | * 2021-05-21: v1.5.17 - Bugfix for MemoryError when having LIEF try to process section data. 30 | * 2021-05-20: v1.5.16 - Bugfix for formatting exceptions in report output (THX: @BonusPlay) 31 | * 2021-05-18: v1.5.15 - Changed SHA256 in SmdaReports for unmapped files (was hash of memory-mapped image, now it's the input file's hash). 32 | * 2021-04-07: v1.5.14 - Bugfix when processing Exception handler addresses as function entry point candidates (THX: capa team). 33 | * 2021-01-20: v1.5.13 - Now using LIEF 0.11 and moved some print output to logging. 34 | * 2021-01-15: v1.5.11 - Disassembler now offers `disassembleUnmappedBuffer(buffer)` to load and process unmapped files directly from memory. 35 | * 2020-12-11: v1.5.10 - Pinned LIEF to 0.10.1. 36 | * 2020-12-01: v1.5.9 - Bugfix for section names. again. :) 37 | * 2020-11-25: v1.5.6 - Now considering segments for content when ELF file has no sections (THX: @jcrussell). 38 | * 2020-11-10: v1.5.5 - Unmarshalling setting default value for older reports.
39 | * 2020-11-06: v1.5.4 - Minor fix on PE header parsing. 40 | * 2020-11-05: v1.5.3 - Adjusted API thunk identification. 41 | * 2020-10-30: v1.5.2 - One bugfix, also removed one print and reduced logging priority for the message in case the PDB parser module is missing. 42 | * 2020-10-30: v1.5.1 - PE section table now contained in SmdaReport and added `SmdaReport.getSection(offset)`. 43 | * 2020-10-30: v1.4.12 - Bugfix in IndirectCallHandler (THX: @jcrussell). 44 | * 2020-10-29: v1.4.11 - Populate exception handlers specified in PE64 `.pdata` section as FEPs. 45 | * 2020-10-29: v1.4.10 - Resolves 64bit API calls of style `call qword ptr [rip + offset]` and more register-based API calls in general (THX: @jcrussell). 46 | * 2020-10-29: v1.4.8 - Bugfixes. Verbose mode added (THX: @jcrussell). 47 | * 2020-10-28: v1.4.6 - WinApiResolver now tries to resolve imports by ordinal to their names if they are known - can be extended in the database of OrdinalHelper. 48 | * 2020-10-28: v1.4.5 - Store the (mapped) buffer that was used to do disassembly along inside a SmdaReport - goal: enable to read strings/bytes at offsets at a later time. 49 | * 2020-10-27: v1.4.4 - SmdaInstructions can now provide potential data references via `SmdaInstruction.getDataRefs()`. 50 | * 2020-10-27: v1.4.3 - SmdaInstructions can now on demand provide the detailed capstone instruction representation via `SmdaInstruction.getDetailed()`. 51 | * 2020-10-27: v1.4.1 - 10-20% gain in processing speed by switching to `capstone.disasm_lite()`. 52 | * 2020-10-26: v1.4.0 - Adding SmdaBasicBlock. Some convenience code to ease integration with capa. (GeekWeek edition!) 53 | * 2020-09-07: v1.3.11 - Summarizable DisassemblyStatistics. 54 | * 2020-09-02: v1.3.10 - Fixed a bug where IDA Pro would crash when failing to demangle a function name while exporting a SMDA report. 55 | * 2020-08-31: v1.3.9 - Adjusted Logging to avoid interference with other loggers configured outside of SMDA (THX: @BonusPlay). 
56 | * 2020-08-25: v1.3.6 - PicHash no longer stored as list. 57 | * 2020-08-17: v1.3.5 - Bugfix for import parsing (ELF files). 58 | * 2020-08-17: v1.3.4 - Recalculate PIC hash and nesting depth for older (v1.2.x) reports on import for compatibility. 59 | * 2020-08-17: v1.3.3 - Added binary variation of `push ebp;mov ebp, esp` to list of default prologues and added exception handling for DominatorTrees (THX: @fxb). 60 | * 2020-07-13: v1.3.2 - Use LIEF to parse Import Table for WinAPI usage data when processing unmapped files. 61 | * 2020-07-13: v1.3.1 - Fixed `setup.py` to properly specify dependencies (THX: @BonusPlay). 62 | * 2020-06-22: v1.3.0 - Added DominatorTree (Implementation by Armin Rigo) to calculate function nesting depth, shortened PIC hash to 8 bytes, added some missing instructions for the InstructionEscaper, IdaInterface now demangles names. 63 | * 2020-05-28: v1.2.15 - Bugfixes in IntelInstructionEscaper (handling of negative RIP-relative offsets), SmdaReport (datetime handling), PeFileParser (handling of empty pefile.sections); SCC calculation changed to iterative algorithm (using @bwesterb's implementation) and activated by default again. 64 | * 2020-05-14: v1.2.10 - Bug in IdaInterface fixed. 65 | * 2020-05-13: v1.2.9 - Bugfix in code gap identification in FunctionCandidateManager, SCC calculation is now optional. 66 | * 2020-05-12: v1.2.7 - Added additional default metadata field "component" to SmdaReport. 67 | * 2020-05-11: v1.2.6 - Export from IDA to SMDA data format is now supported (IDA 7.4). 68 | * 2020-05-09: v1.2.5 - Fixed off-by-one that affected wildcarding of instructions (THX to Viviane Zwanger). 69 | * 2020-05-04: v1.2.4 - Various minor fixes. 70 | * 2020-04-29: v1.2.0 - Restructured config.py into smda/SmdaConfig.py to simplify usage and now available via PyPI! 
The smda/Disassembler.py now emits a report object (smda.common.SmdaReport) that allows direct (pythonic) interaction with the results - a JSON can still be easily generated by using toDict() on the report. 71 | * 2020-04-28: v1.1.0 - Several improvements, including: x64 jump table handling, better data flow handling for calls using registers and tailcalls, extended list of common prologues based on much more groundtruth data, extended padding instruction list for gap function discovery, adjusted weights in candidate priority score, filtering code areas based on section tables, using exported symbols as candidates, new function output metadata: confidence score based on instruction mnemonic histogram, PIC hash based on escaped binary instruction sequence 72 | * 2020-03-10: Various minor fixes and QoL improvements. 73 | * 2019-08-20: IdaExporter is now handling failed instruction conversion via capstone properly. 74 | * 2019-08-19: Minor fix for crashes caused by PDB parser. 75 | * 2019-08-05: v1.0.3 - SMDA can now export reports from IDA Pro (requires capstone to be available for idapython). 76 | * 2019-06-13: PDB symbols for functions are now resolved if given a PDB file using parameter "-d" (THX to @VPaulV). 77 | * 2019-05-15: Fixed a bug in PE mapper where buffer would be shortened because of misinterpretation of section sizes. 78 | * 2019-02-14: v1.0.2 - ELF symbols for functions are now resolved, if present in the file. Also "-m" parameter changed to "-p" to imply parsing instead of just mapping (THX: @VPaulV). 79 | * 2018-12-12: all gcc jump table styles are now parsed correctly. 80 | * 2018-11-26: Better handling of multibyte NOPs, ELF loader now provides base addr. 81 | * 2018-09-28: We now have functional PE/ELF loaders. 82 | * 2018-07-09: v1.0.1 - Performance improvements. 83 | * 2018-07-01: v1.0.0 - Initial Release. --------------------------------------------------------------------------------