├── .clang-format ├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── CODEOWNERS ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── docs ├── adding_a_worker.md └── output_cohorts_format.md ├── src ├── analysis │ ├── analysis │ ├── pass │ │ ├── dedupe │ │ │ ├── dedupe │ │ │ └── spec.yml │ │ ├── filter-all-failure │ │ │ ├── filter-all-failure │ │ │ └── spec.yml │ │ ├── filter-all-success │ │ │ ├── filter-all-success │ │ │ └── spec.yml │ │ ├── filter-any-failure │ │ │ ├── filter-any-failure │ │ │ └── spec.yml │ │ ├── filter-bddisasm-salc │ │ │ ├── filter-bddisasm-salc │ │ │ └── spec.yml │ │ ├── filter-destroy-bddisasm │ │ │ ├── filter-destroy-bddisasm │ │ │ └── spec.yml │ │ ├── filter-destroy-capstone │ │ │ ├── filter-destroy-capstone │ │ │ └── spec.yml │ │ ├── filter-destroy-ghidra │ │ │ ├── filter-destroy-ghidra │ │ │ └── spec.yml │ │ ├── filter-ghidra-lock │ │ │ ├── filter-ghidra-lock │ │ │ └── spec.yml │ │ ├── filter-incomparable │ │ │ ├── filter-incomparable │ │ │ └── spec.yml │ │ ├── filter-ndecoded-different │ │ │ ├── filter-ndecoded-different │ │ │ └── spec.yml │ │ ├── filter-ndecoded-same │ │ │ ├── filter-ndecoded-same │ │ │ └── spec.yml │ │ ├── filter-xed-find-overaccept │ │ │ ├── filter-xed-find-overaccept │ │ │ └── spec.yml │ │ ├── filter-xed-find-underaccept │ │ │ ├── filter-xed-find-underaccept │ │ │ └── spec.yml │ │ ├── find-size-discrepancies │ │ │ ├── find-size-discrepancies │ │ │ └── spec.yml │ │ ├── minimize-input │ │ │ ├── minimize-input │ │ │ └── spec.yml │ │ └── normalize │ │ │ ├── normalize │ │ │ └── spec.yml │ └── passes.yml ├── include │ └── mish_common.h ├── mish2jsonl │ ├── Makefile │ └── mish2jsonl.c ├── mishegos │ ├── Makefile │ ├── mishegos.c │ ├── mutator.c │ └── mutator.h ├── mishmat │ └── mishmat └── worker │ ├── Makefile │ ├── bddisasm │ ├── Makefile │ └── bddisasm.c │ ├── bfd │ ├── Makefile │ └── bfd.c │ ├── capstone │ ├── Makefile │ └── capstone.c │ ├── dynamorio │ ├── .gitignore │ ├── Makefile │ └── dynamorio.c │ ├── fadec │ ├── Makefile │ └── fadec.c │ ├── ghidra │ ├── .gitignore │ ├── CMakeLists.txt │ ├── Makefile │ ├── ghidra.cc │ ├── sleighMishegos.cc │ └── sleighMishegos.hh │ ├── iced │ ├── Cargo.toml │ ├── Makefile │ ├── build.rs │ ├── src │ │ ├── lib.rs │ │ └── mishegos.rs │ └── wrapper.h │ ├── llvm │ ├── Makefile │ └── llvm.c │ ├── worker.h │ ├── xed │ ├── Makefile │ └── xed.c │ ├── yaxpeax-x86 │ ├── .gitignore │ ├── Cargo.toml │ ├── Makefile │ ├── build.rs │ └── src │ │ └── lib.rs │ └── zydis │ ├── Makefile │ └── zydis.c └── workers.spec /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Right 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: None 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: false 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 
26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 100 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | DerivePointerAlignment: false 55 | DisableFormat: false 56 | ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 65 | Priority: 2 66 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 67 | Priority: 3 68 | - Regex: '.*' 69 | Priority: 1 70 | IncludeIsMainRegex: '(Test)?$' 71 | IndentCaseLabels: false 72 | IndentPPDirectives: None 73 | IndentWidth: 2 74 | IndentWrappedFunctionNames: false 75 | JavaScriptQuotes: Leave 76 | JavaScriptWrapImports: true 77 | KeepEmptyLinesAtTheStartOfBlocks: true 78 | MacroBlockBegin: '' 79 | MacroBlockEnd: '' 80 | MaxEmptyLinesToKeep: 1 81 | NamespaceIndentation: None 82 | ObjCBlockIndentWidth: 2 83 | ObjCSpaceAfterProperty: false 84 | ObjCSpaceBeforeProtocolList: true 85 | PenaltyBreakAssignment: 2 86 | PenaltyBreakBeforeFirstCallParameter: 19 87 | PenaltyBreakComment: 300 88 | PenaltyBreakFirstLessLess: 120 89 | PenaltyBreakString: 1000 90 | PenaltyExcessCharacter: 1000000 91 | PenaltyReturnTypeOnItsOwnLine: 60 92 | PointerAlignment: Right 93 | ReflowComments: true 94 | SortIncludes: false 95 | SpaceAfterCStyleCast: false 96 | SpaceBeforeAssignmentOperators: true 97 | SpaceBeforeParens: ControlStatements 98 | SpaceInEmptyParentheses: false 99 | SpacesBeforeTrailingComments: 1 100 | SpacesInAngles: false 101 | SpacesInContainerLiterals: true 102 | SpacesInCStyleCastParentheses: false 103 | SpacesInParentheses: false 104 | SpacesInSquareBrackets: false 105 | Standard: Cpp11 106 | TabWidth: 2 107 | UseTab: Never 108 | ... 
109 | 110 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | trim_trailing_whitespace = true 7 | 8 | [*.{c,h}] 9 | indent_style = space 10 | indent_size = 2 11 | 12 | [Makefile] 13 | indent_style = tab 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: gitsubmodule 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | open-pull-requests-limit: 10 8 | - package-ecosystem: cargo 9 | directory: "/src/worker/yaxpeax-x86" 10 | schedule: 11 | interval: weekly 12 | open-pull-requests-limit: 10 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | schedule: 9 | # run CI every day even if no PRs/merges occur 10 | - cron: '0 12 * * *' 11 | 12 | jobs: 13 | lint: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Install dependencies 18 | run: sudo apt install -y cppcheck clang-format-12 19 | - name: Lint 20 | run: | 21 | make fmt && git diff --exit-code 22 | make lint 23 | 24 | docker-build: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Checkout submodules 30 | run: | 31 | auth_header="$(git config --local --get http.https://github.com/.extraheader)" 32 | git submodule sync --recursive 33 | git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive 34 | 35 | - name: Docker build 36 | run: docker build -t mishegos . 
37 | 38 | - name: Docker Smoketest 39 | run: | 40 | docker run --rm mishegos bash -eo pipefail -c './src/mishegos/mishegos -m manual ./workers.spec <<< "90" | ./src/mish2jsonl/mish2jsonl' 41 | 42 | - name: Docker Test Fuzz 43 | run: | 44 | docker run --rm mishegos bash -eo pipefail -c \ 45 | '(timeout --preserve-status 5s ./src/mishegos/mishegos -s 0 ./workers.spec || true) | ./src/mish2jsonl/mish2jsonl | tail' 46 | 47 | build: 48 | runs-on: ubuntu-latest 49 | steps: 50 | - uses: actions/checkout@v4 51 | 52 | - name: Checkout submodules 53 | run: | 54 | auth_header="$(git config --local --get http.https://github.com/.extraheader)" 55 | git submodule sync --recursive 56 | git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive 57 | 58 | - name: Install dependencies 59 | run: | 60 | sudo apt-get update 61 | 62 | sudo apt-get install -y \ 63 | build-essential \ 64 | binutils-dev \ 65 | python2 \ 66 | python3 \ 67 | cmake \ 68 | meson \ 69 | ruby \ 70 | autotools-dev \ 71 | autoconf \ 72 | llvm-dev \ 73 | libtool 74 | 75 | sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 1 76 | 77 | - name: Build 78 | run: make -j$(nproc) 79 | 80 | - name: Smoketest 81 | # Disassemble NOP 82 | run: | 83 | set -eo pipefail 84 | ./src/mishegos/mishegos -m manual ./workers.spec <<< "90" | ./src/mish2jsonl/mish2jsonl 85 | 86 | - name: Test Fuzz 87 | run: | 88 | set -eo pipefail 89 | (timeout --preserve-status 5s ./src/mishegos/mishegos -s 0 ./workers.spec || true) | ./src/mish2jsonl/mish2jsonl | tail 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | src/mishegos/mishegos 4 | src/worker/worker 5 | src/mish2jsonl/mish2jsonl 6 | Cargo.lock 7 | target/ 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/worker/capstone/capstone"] 2 | path = src/worker/capstone/capstone 3 | url = https://github.com/aquynh/capstone.git 4 | [submodule "src/worker/xed/xed"] 5 | path = src/worker/xed/xed 6 | url = https://github.com/intelxed/xed.git 7 | [submodule "src/worker/xed/mbuild"] 8 | path = src/worker/xed/mbuild 9 | url = https://github.com/intelxed/mbuild.git 10 | [submodule "src/worker/zydis/zydis"] 11 | path = src/worker/zydis/zydis 12 | url = https://github.com/zyantific/zydis.git 13 | [submodule "src/worker/dynamorio/dynamorio"] 14 | path = src/worker/dynamorio/dynamorio 15 | url = https://github.com/DynamoRIO/dynamorio.git 16 | [submodule "src/worker/fadec/fadec"] 17 | path = src/worker/fadec/fadec 18 | url = https://github.com/aengelke/fadec.git 19 | [submodule "src/worker/bddisasm/bddisasm"] 20 | path = src/worker/bddisasm/bddisasm 21 | url = https://github.com/bitdefender/bddisasm.git 22 | branch = master 23 | [submodule "src/worker/iced/iced"] 24 | path = src/worker/iced/iced 25 | url = https://github.com/0xd4d/iced.git 26 | [submodule "src/worker/ghidra/ghidra"] 27 | path = src/worker/ghidra/ghidra 28 | url = https://github.com/NationalSecurityAgency/ghidra.git 29 | [submodule "src/worker/ghidra/sleigh-cmake"] 30 | path = src/worker/ghidra/sleigh-cmake 31 | url = https://github.com/lifting-bits/sleigh.git 32 | -------------------------------------------------------------------------------- /CODEOWNERS: 
-------------------------------------------------------------------------------- 1 | * @woodruffw @ekilmer 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN export DEBIAN_FRONTEND="noninteractive" && \ 4 | apt-get update && \ 5 | apt-get install -y \ 6 | gpg wget && \ 7 | wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ 8 | echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ 9 | apt-get update && \ 10 | apt-get install -y \ 11 | build-essential \ 12 | binutils-dev \ 13 | python \ 14 | python3 \ 15 | cmake \ 16 | meson \ 17 | ruby \ 18 | autotools-dev \ 19 | autoconf \ 20 | libtool \ 21 | git \ 22 | curl \ 23 | llvm-dev \ 24 | libclang-dev \ 25 | clang 26 | 27 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 28 | ENV PATH="/root/.cargo/bin:${PATH}" 29 | 30 | WORKDIR /app/mishegos 31 | COPY ./ . 32 | 33 | ARG TARGET=all 34 | RUN make "${TARGET}" -j $(nproc) 35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | UNAME := $(shell uname) 2 | 3 | CFLAGS := \ 4 | -std=gnu11 -Wall -pthread -O2 \ 5 | -I$(shell pwd)/src/include 6 | LDLIBS := -ldl -lpthread 7 | CPPFLAGS := 8 | CXXFLAGS := \ 9 | -std=c++11 -Wall -pthread -O2 \ 10 | -I$(shell pwd)/src/include 11 | # TODO(ww): https://github.com/rust-lang/rust-bindgen/issues/1651 12 | # RUSTFLAGS := -D warnings 13 | RUST_BINDGEN_CLANG_ARGS := \ 14 | -I$(shell pwd)/src/include 15 | 16 | ifeq ($(UNAME), Darwin) 17 | SO_SUFFIX := dylib 18 | else 19 | SO_SUFFIX := so 20 | # Linux needs -lrt for the POSIX shm(3) family calls. 21 | LDLIBS := $(LDLIBS) -lrt 22 | endif 23 | 24 | export UNAME 25 | export CFLAGS 26 | export LDLIBS 27 | export CPPFLAGS 28 | export CXXFLAGS 29 | export RUST_BINDGEN_CLANG_ARGS 30 | export SO_SUFFIX 31 | 32 | 33 | ALL_SRCS := $(shell \ 34 | find . -type f \ 35 | \( \ 36 | -path '*/capstone/capstone/*' -o \ 37 | -path '*/vendor/*' -o \ 38 | -path '*/dynamorio/dynamorio/*' -o \ 39 | -path '*/dynamorio/obj/*' -o \ 40 | -path '*/fadec/fadec/*' -o \ 41 | -path '*/udis86/udis86/*' -o \ 42 | -path '*/xed/xed/*' -o \ 43 | -path '*/xed/mbuild/*' -o \ 44 | -path '*/zydis/zydis/*' -o \ 45 | -path '*/bddisasm/bddisasm/*' -o \ 46 | -path '*/ghidra/sleighMishegos*' -o \ 47 | -path '*/ghidra/ghidra/*' -o \ 48 | -path '*/ghidra/build/*' -o \ 49 | -path '*/ghidra/sleigh-cmake/*' \ 50 | \) \ 51 | -prune \ 52 | -o \( \ 53 | -name 'sleighMishegos*' -o \ 54 | -name '*.c' -o \ 55 | -name '*.cc' -o \ 56 | -name '*.h' -o \ 57 | -name '*.hh' \ 58 | \) \ 59 | -print \ 60 | ) 61 | 62 | .PHONY: all 63 | all: mishegos worker mish2jsonl 64 | 65 | .PHONY: debug 66 | debug: CPPFLAGS += -DDEBUG 67 | debug: CFLAGS += -g 68 | debug: all 69 | 70 | .PHONY: mishegos 71 | mishegos: 72 | $(MAKE) -C src/mishegos 73 | 74 | .PHONY: worker 75 | worker: 76 | $(MAKE) -C src/worker $(WORKERS) 77 | 78 | .PHONY: mish2jsonl 79 | mish2jsonl: 80 | $(MAKE) -C src/mish2jsonl 81 | 82 | .PHONY: fmt 83 | fmt: 84 | clang-format -i -style=file $(ALL_SRCS) 85 | 86 | .PHONY: lint 87 | lint: 88 | cppcheck --error-exitcode=1 $(ALL_SRCS) 89 | 90 | .PHONY: edit 91 | edit: 92 | $(EDITOR) $(ALL_SRCS) 93 | 94 | .PHONY: clean 95 | clean: 96 | $(MAKE) -C src/worker clean 97 | $(MAKE) -C src/mishegos clean 98 | $(MAKE) -C src/mish2jsonl clean 99 | 100 | .PHONY: update-submodules 101 | update-submodules: 102 | git submodule foreach git pull origin master 103 | 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mishegos 2 | ======== 3 | 4 | [![CI](https://github.com/trailofbits/mishegos/actions/workflows/ci.yml/badge.svg)](https://github.com/trailofbits/mishegos/actions/workflows/ci.yml) 5 | 6 | A differential fuzzer for x86 decoders. 
7 | 
8 | ![mishegos](https://user-images.githubusercontent.com/3059210/59005797-da89b400-87ec-11e9-8274-321edfa6df45.png)
9 | 
10 | Read more about `mishegos` in its accompanying [blog post](https://blog.trailofbits.com/2019/10/31/destroying-x86_64-instruction-decoders-with-differential-fuzzing/)
11 | and academic publication ([paper](https://github.com/gangtan/LangSec-papers-and-slides/raw/main/langsec21/papers/Woodruff_LangSec21.pdf)
12 | | [recording](https://www.youtube.com/watch?v=a2q86KTZt0g)
13 | | [slides](https://github.com/trailofbits/publications/blob/master/presentations/Differential%20analysis%20of%20x86-64%20decoders/langsec-2021-slides.pdf)).
14 | 
15 | ```bibtex
16 | @InProceedings{woodruff21differential,
17 | author = "William Woodruff and Niki Carroll and Sebastiaan Peters",
18 | title = "Differential analysis of x86-64 instruction decoders",
19 | booktitle = "Proceedings of the Seventh Language-Theoretic Security Workshop~({LangSec}) at the {IEEE} Symposium on Security and Privacy",
20 | year = "2021",
21 | month = "May"
22 | }
23 | ```
24 | 
25 | ## Usage
26 | 
27 | Start with a clone, including submodules:
28 | 
29 | ```bash
30 | git clone --recurse-submodules https://github.com/trailofbits/mishegos
31 | ```
32 | 
33 | ### Building
34 | 
35 | `mishegos` is most easily built within Docker:
36 | 
37 | ```bash
38 | docker build -t mishegos .
39 | ```
40 | 
41 | Alternatively, you can try building it directly.
42 | 
43 | Make sure you have `binutils-dev` (or however your system provides `libopcodes`) installed:
44 | 
45 | ```bash
46 | make
47 | # or
48 | make debug
49 | ```
50 | 
51 | Build specific workers by passing a space-delimited list as the `WORKERS` variable:
52 | 
53 | ```bash
54 | WORKERS="bfd capstone" make worker
55 | ```
56 | 
57 | ### Running
58 | 
59 | Run the fuzzer for a bit:
60 | 
61 | ```bash
62 | ./src/mishegos/mishegos ./workers.spec > /tmp/mishegos
63 | ```
64 | 
65 | `mishegos` checks for the following environment variables:
66 | 
67 | * `V=1` enables verbose output on `stderr`
68 | * `D=1` enables the "dummy" mutation mode for debugging purposes
69 | * `M=1` enables the "manual" mutation mode (i.e., read from `stdin`)
70 | * `MODE=mode` can be used to configure the mutation mode in the absence of `D` and `M`
71 | * Valid mutation modes are `sliding` (default), `havoc`, and `structured`
72 | 
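For example, a sketch of a short, verbose fuzzing run using the `havoc` mutation mode (the output path matches the other examples in this README; `timeout` is optional):

```bash
V=1 MODE=havoc timeout 60s ./src/mishegos/mishegos ./workers.spec > /tmp/mishegos
```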
73 | Convert mishegos's raw output into JSONL suitable for analysis:
74 | 
75 | ```bash
76 | ./src/mish2jsonl/mish2jsonl /tmp/mishegos > /tmp/mishegos.jsonl
77 | ```
78 | 
79 | `mish2jsonl` checks for `V=1` to enable verbose output on `stderr`.
80 | 
81 | Run an analysis/filter pass group on the results:
82 | 
83 | ```bash
84 | ./src/analysis/analysis -p same-size-different-decodings < /tmp/mishegos.jsonl > /tmp/mishegos.interesting
85 | ```
86 | 
87 | Generate an ~ugly~ pretty visualization of the filtered results:
88 | 
89 | ```bash
90 | ./src/mishmat/mishmat < /tmp/mishegos.interesting > /tmp/mishegos.html
91 | open /tmp/mishegos.html
92 | ```
93 | 
94 | Tip: The HTML file that `mishmat` generates can be hundreds of megabytes in size, which will likely result in a bad browser viewing experience. Using the [`split`](https://man7.org/linux/man-pages/man1/split.1.html) tool, you can create multiple smaller HTML files with a specified number of entries per file (10,000 in the following example) and load each of them separately:
95 | 
96 | ```bash
97 | mkdir /tmp/mishegos-html
98 | split -d --lines=10000 - /tmp/mishegos-html/mishegos_ \
99 | --additional-suffix='.html' --filter='./src/mishmat/mishmat > $FILE' \
100 | < /tmp/mishegos.interesting
101 | ```
102 | 
103 | ### Contributing
104 | 
105 | We welcome contributors to mishegos!
106 | 
107 | A guide for adding new disassembler workers can be found [here](./docs/adding_a_worker.md).
108 | 
109 | ### Performance notes
110 | 
111 | All numbers below correspond to the following run:
112 | 
113 | ```bash
114 | V=1 timeout 60s ./src/mishegos/mishegos ./workers.spec > /tmp/mishegos
115 | ```
116 | 
117 | Outside Docker:
118 | 
119 | * On a Linux desktop (Ubuntu 20.04, Ryzen 5 3600, 32GB DDR4):
120 | * Commit [`d80063a`](https://github.com/trailofbits/mishegos/commit/d80063a575c4b10d5f787ac88f45d44c8e7f9937)
121 | * 8 workers (no `udis86`) + 1 `mishegos` fuzzer process
122 | * 8.7M outputs/minute
123 | * 9 cores pinned
124 | 
125 | ## TODO
126 | 
127 | * Performance improvements
128 | * Break cohort collection out into a separate process (requires re-addition of semaphores)
129 | * Maybe use a better data structure for input/output/cohort slots
130 | * Add a scaling factor for workers, e.g. spawn `N` of each worker
131 | * Pre-analysis normalization (whitespace, immediate representation, prefixes)
132 | * Analysis strategies:
133 | * Filter by length, decode status discrepancies
134 | * Easy: lexical comparison
135 | * Easy: reassembly + effects modeling (maybe with microx?)
136 | * Scoring ideas:
137 | * Low value: Flag/prefix discrepancies
138 | * Medium value: Decode success/failure/crash discrepancies
139 | * High value: Decode discrepancies with differing control flow, operands, maybe some immediates
140 | * Visualization ideas:
141 | * Basic but not really basic: some kind of mouse-over differential visualization
142 | 
143 | ## License
144 | 
145 | `mishegos` is licensed and distributed under the [Apache v2.0](LICENSE) license. [Contact us](mailto:opensource@trailofbits.com) if you’re looking for an exception to the terms.
-------------------------------------------------------------------------------- /docs/adding_a_worker.md:
--------------------------------------------------------------------------------
1 | Adding a mishegos worker
2 | ========================
3 | 
4 | Adding a new worker to mishegos is (relatively) straightforward.
5 | 
6 | This page attempts to document the process, but makes no guarantees about
7 | correctness or currency. When in doubt, refer to
8 | a simple worker already in the tree, like
9 | [capstone](https://github.com/trailofbits/mishegos/tree/master/src/worker/capstone).
10 | 
11 | ## Adding the worker
12 | 
13 | A good worker is self-contained within its `./src/worker/WORKERNAME/` directory.
14 | 
15 | That directory should look something like this:
16 | 
17 | ```
18 | ./src/worker/WORKERNAME/:
19 | SOME_SUBMODULE/
20 | Makefile
21 | WORKERNAME.c
22 | ```
23 | 
24 | Each member is discussed below.
25 | 
26 | ### `SOME_SUBMODULE/`
27 | 
28 | If your worker requires a disassembly library that **either** (1) is actively maintained **or**
29 | (2) is unavailable in popular package managers, then it should be submoduled within the worker
30 | directory.
Multiple submodules (or recursive submodules, if necessary) are fine; see the XED worker
31 | for an example.
32 | 
33 | ### `Makefile`
34 | 
35 | Your worker directory should include a single `Makefile` that builds both the target disassembler
36 | and the mishegos worker.
37 | 
38 | Two `make` targets are required:
39 | 
40 | * `all`: Build all dependencies and the worker's shared object
41 | * `clean`: Clean the worker's shared object and, *optionally*, the builds of all dependencies
42 | 
43 | Your `all` target should produce some reasonably named shared object (`WORKERNAME.so` is
44 | currently common in the codebase) in the worker directory. You'll need this shared object's path
45 | later.
46 | 
47 | ### `WORKERNAME.c`
48 | 
49 | `WORKERNAME.c` should implement the mishegos worker ABI, which is the following:
50 | 
51 | ```c
52 | char *worker_name;
53 | void worker_ctor();
54 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length);
55 | void worker_dtor();
56 | ```
57 | 
58 | See the existing workers and header files for type and usage examples.
59 | 
60 | `worker_name` is a static string that *uniquely identifies the worker*. Duplicating `worker_name`
61 | across different kinds of workers will cause very bad things to happen.
62 | 
63 | `worker_ctor` and `worker_dtor` are **optional** and run on worker process startup and termination,
64 | respectively.
65 | 
66 | ## Integrating into the build
67 | 
68 | Once you have a worker in place, you'll have to modify a few files to get mishegos to build
69 | and fuzz with it.
70 | 
71 | ### `./src/worker/Makefile`
72 | 
73 | This `Makefile` contains a `WORKERS` variable. Add `WORKERNAME` (or whatever you named
74 | your worker directory) to it.
75 | 
76 | ### `./Makefile`
77 | 
78 | The top-level `Makefile` contains an `ALL_SRCS` variable. This variable has a `find` expression
79 | in it that excludes submodule sources from automated linting tasks. Add glob(s) matching your
80 | worker's submodule(s) to it.
81 | 
82 | ### `./workers.spec`
83 | 
84 | This is a newline-delimited list of shared objects that `mishegos` (the main fuzzer binary)
85 | takes via an argument. Add the path to your worker shared object to it.
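Putting the pieces above together, here is a minimal sketch of a hypothetical worker. The `decode_result` field and enum names (`status`, `ndecoded`, `len`, `result`, `S_SUCCESS`, `S_FAILURE`) are assumptions based on the output format described in `docs/output_cohorts_format.md`; consult `src/include/mish_common.h` and an in-tree worker for the real definitions, and replace `my_disassemble` (a placeholder) with your library's actual API:

```c
#include <stdint.h>
#include <stdio.h>

#include "mish_common.h" /* decode_result, decode_status */

/* Hypothetical disassembler entrypoint: returns the number of bytes
 * consumed, or <= 0 on failure. Replace with your library's API. */
extern int my_disassemble(const uint8_t *buf, uint8_t len, char *out, size_t out_len);

/* Must uniquely identify this worker. */
char *worker_name = "WORKERNAME";

void worker_ctor(void) {
  /* Optional: initialize any disassembler state here. */
}

void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) {
  char text[128];
  int consumed = my_disassemble(raw_insn, length, text, sizeof(text));

  /* NOTE: the field and enum names below are illustrative assumptions. */
  if (consumed <= 0) {
    result->status = S_FAILURE;
    return;
  }

  result->status = S_SUCCESS;
  result->ndecoded = (uint16_t)consumed;
  result->len = (uint16_t)snprintf(result->result, sizeof(result->result), "%s", text);
}

void worker_dtor(void) {
  /* Optional: tear down any disassembler state here. */
}
```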
-------------------------------------------------------------------------------- /docs/output_cohorts_format.md:
--------------------------------------------------------------------------------
1 | Cohort Output Format
2 | =====================
3 | 
4 | This file briefly describes the format of `mishegos`'s binary output.
5 | 
6 | The binary format is an implementation detail and should only be of interest
7 | when working on mishegos itself; users looking to analyze mishegos's results should run
8 | `mish2jsonl` and operate on the JSONL-formatted results.
9 | 
10 | ## Motivation
11 | 
12 | Earlier versions of mishegos dumped their results directly to JSONL. This required
13 | us to do JSON serialization and internal allocations in the fuzzing lifecycle, incurring
14 | a performance hit.
15 | 
16 | ## Format
17 | 
18 | Mishegos's binary output is a sequence of "cohorts", each of which contains `N` outputs
19 | where `N` is the number of workers.
20 | 
21 | Each cohort begins with a header:
22 | 
23 | * `nworkers` (`u32`): The number of workers present in this output cohort
24 | * `input` (`u64` + `str`): A length-prefixed, pretty-printed hex string of the input handled by
25 | this cohort
26 | 
27 | After the header, each cohort contains `nworkers` output records. Each output contains:
28 | 
29 | * `status` (`u32`): A status code corresponding to the `decode_status` enum
30 | * `ndecoded` (`u16`): The number of bytes of `input` decoded
31 | * `workerno` (`u32`): The worker's identifying index
32 | * `worker_so` (`u64` + `str`): A length-prefixed string containing the path to the worker's dynamic
33 | shared object
34 | * `len` (`u16`): The string length of the decoded instruction, or `0` if none is present
35 | * `result` (`str`): A string of `len` bytes containing the decoded instruction
36 | 
37 | Visualized:
38 | 
39 | ```
40 | |-------------------------|
41 | | cohort 1: nworkers: 3 |
42 | | output 1 |
43 | | output 2 |
44 | | output 3 |
45 | |-------------------------|
46 | | cohort 2: nworkers: 3 |
47 | | output 1 |
48 | | output 2 |
49 | | output 3 |
50 | |-------------------------|
51 | | cohort ... |
52 | | .... |
53 | |_________________________|
54 | ```
55 | 
56 | ## Implementation
57 | 
58 | Mishegos's binary output is transformed into JSONL via a parser specified in
59 | [Kaitai Struct](https://kaitai.io/)'s DSL.
-------------------------------------------------------------------------------- /src/analysis/analysis:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # analysis: collect analysis passes, order them, and pipeline
5 | # mishegos results through them
6 | 
7 | require "yaml"
8 | require "ostruct"
9 | require "pathname"
10 | require "set"
11 | require "open3"
12 | require "optparse"
13 | 
14 | def hai(msg)
15 | warn "[analysis] #{msg}" if VERBOSE
16 | end
17 | 
18 | def load_pass!(dir)
19 | hai "loading pass from #{dir}"
20 | 
21 | spec = dir / "spec.yml"
22 | raise "Pass missing spec: #{spec}" unless spec.file?
23 | 
24 | pass = OpenStruct.new YAML.load_file(spec)
25 | pass.spec = spec
26 | pass.dir = dir
27 | pass.not_before ||= []
28 | pass.cmd = pass.dir / pass.run
29 | 
30 | pass
31 | end
32 | 
33 | # A mix-in for operations on all passes.
34 | module PassOperations
35 | def build_graph!
36 | graph = OpenStruct.new(nodes: [], edges: [])
37 | 
38 | each do |pass|
39 | graph.nodes << pass
40 | 
41 | pass.not_before.each do |nb|
42 | pred = find { |p| p.name == nb }
43 | raise "#{pass.name} depends on missing pass: #{nb}" unless pred
44 | 
45 | graph.edges << [pred, pass]
46 | end
47 | end
48 | 
49 | graph
50 | end
51 | 
52 | def verify!
53 | hai "verifying #{size} passes"
54 | 
55 | raise "one or more duplicate pass names" if uniq(&:name).size != size
56 | raise "one or more nonexecutable passes" unless all? { |p| p.cmd.executable? }
57 | 
58 | self
59 | end
60 | 
61 | # This is just a topological sort of our pass DAG.
62 | # Why? Nescio; sed fieri sentio et excrucior.
63 | # NOTE: Currently unused; we assume that the analysis's order is valid.
64 | def order!
65 | hai "realizing pass DAG into a concrete order"
66 | 
67 | graph = build_graph!
68 | ordered = []
69 | node_set = []
70 | 
71 | # Our initial node set consists of only nodes that don't have a predecessor.
72 | graph.nodes.each do |node|
73 | next if graph.edges.any? { |e| e[1] == node }
74 | 
75 | node_set << node
76 | end
77 | 
78 | until node_set.empty?
79 | node = node_set.shift
80 | ordered << node
81 | 
82 | succ_nodes = graph.nodes.select { |s| graph.edges.include?([node, s]) }
83 | succ_nodes.each do |succ|
84 | graph.edges.delete [node, succ]
85 | next if graph.edges.any?
{ |e| e[1] == succ } 86 | 87 | node_set << succ 88 | end 89 | end 90 | 91 | raise "pass DAG contains a cycle" unless graph.edges.empty? 92 | 93 | replace ordered 94 | self 95 | end 96 | 97 | def run! 98 | hai "running passes: #{map(&:name)}" 99 | 100 | cmds = map(&:cmd).map(&:to_s) 101 | Open3.pipeline(*cmds, in: $stdin, out: $stdout) 102 | 103 | self 104 | end 105 | end 106 | 107 | VERBOSE = ENV["VERBOSE"] || ENV["V"] 108 | PASS_DIR = Pathname.new File.expand_path("pass", __dir__) 109 | PASS_FILE = Pathname.new File.expand_path("passes.yml", __dir__) 110 | 111 | opts = { 112 | profile: "default", 113 | describe: false, 114 | } 115 | 116 | OptionParser.new do |o| 117 | o.banner = "Usage: analysis [options]" 118 | 119 | o.on "-p", "--profile PROFILE", String, "Use the given analysis profile" do |profile| 120 | opts[:profile] = profile 121 | end 122 | 123 | o.on "-d", "--describe", "Describe each step of the given profile instead of running" do 124 | opts[:describe] = true 125 | end 126 | end.parse! 127 | 128 | $stderr.sync = true 129 | 130 | hai "pass directory: #{PASS_DIR}" 131 | hai "pass spec file: #{PASS_FILE}" 132 | 133 | profile = YAML.load_file(PASS_FILE)[opts[:profile]] 134 | raise "no such profile: #{opts[:profile]}" unless profile 135 | 136 | hai "#{opts[:profile]} passes: #{profile}" 137 | 138 | passes = PASS_DIR.children.select(&:directory?).map do |pass_dir| 139 | load_pass! pass_dir 140 | end 141 | 142 | # Select only the passes defined by the profile, and sort them by their order 143 | # in the profile. 144 | passes 145 | .select! { |p| profile.include? p.name } 146 | .sort_by! { |p| profile.index p.name } 147 | passes.extend PassOperations 148 | passes.verify! 149 | 150 | if opts[:describe] 151 | puts opts[:profile] 152 | passes.each { |pass| puts "\t#{pass.name}: #{pass.desc}" } 153 | else 154 | passes.run! 155 | end 156 | -------------------------------------------------------------------------------- /src/analysis/pass/dedupe/dedupe: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # dedupe: filter out any cohorts whose inputs have already appeared at least once. 5 | 6 | require "json" 7 | require "set" 8 | 9 | warn "[+] pass: dedupe" 10 | 11 | count = 0 12 | seen = Set.new 13 | $stdin.each_line do |line| 14 | result = JSON.parse line, symbolize_names: true 15 | 16 | # add? returns nil if the element is already present, saving us 17 | # two separate operations (check + add). 18 | if seen.add?(result[:input]).nil? 
19 | count += 1
20 | next
21 | end
22 | 
23 | $stdout.puts result.to_json
24 | end
25 | 
26 | warn "[+] pass: dedupe done: #{count} filtered"
27 | 
-------------------------------------------------------------------------------- /src/analysis/pass/dedupe/spec.yml:
--------------------------------------------------------------------------------
1 | name: dedupe
2 | desc: Filter out any duplicate cohorts (by input)
3 | run: dedupe
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-failure/filter-all-failure:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-all-failure: remove all cohorts in which each worker failed
5 | 
6 | require "json"
7 | 
8 | warn "[+] pass: filter-all-failure"
9 | 
10 | count = 0
11 | $stdin.each_line do |line|
12 | result = JSON.parse line, symbolize_names: true
13 | 
14 | if result[:outputs].all? { |o| o[:status][:name] == "failure" }
15 | count += 1
16 | next
17 | end
18 | 
19 | $stdout.puts result.to_json
20 | end
21 | 
22 | warn "[+] pass: filter-all-failure done: #{count} filtered"
23 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-failure/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-all-failure
2 | desc: Remove all cohorts in which each worker failed
3 | run: filter-all-failure
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-success/filter-all-success:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-all-success: remove all cohorts in which each worker succeeded
5 | 
6 | require "json"
7 | 
8 | warn "[+] pass: filter-all-success"
9 | 
10 | count = 0
11 | $stdin.each_line do |line|
12 | result = JSON.parse line, symbolize_names: true
13 | 
14 | if result[:outputs].all? { |o| o[:status][:name] == "success" }
15 | count += 1
16 | next
17 | end
18 | 
19 | $stdout.puts result.to_json
20 | end
21 | 
22 | warn "[+] pass: filter-all-success done: #{count} filtered"
23 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-success/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-all-success
2 | desc: Remove all cohorts in which each worker succeeded
3 | run: filter-all-success
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-any-failure/filter-any-failure:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-any-failure: remove any cohorts in which any worker failed
5 | 
6 | require "json"
7 | 
8 | warn "[+] pass: filter-any-failure"
9 | 
10 | count = 0
11 | $stdin.each_line do |line|
12 | result = JSON.parse line, symbolize_names: true
13 | 
14 | if result[:outputs].any?
{ |o| o[:status][:name] == "failure" }
15 | count += 1
16 | next
17 | end
18 | 
19 | $stdout.puts result.to_json
20 | end
21 | 
22 | warn "[+] pass: filter-any-failure done: #{count} filtered"
23 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-any-failure/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-any-failure
2 | desc: Remove any cohorts in which any worker failed
3 | run: filter-any-failure
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-bddisasm-salc/filter-bddisasm-salc:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-bddisasm-salc: filter out results that bddisasm decodes to SALC,
5 | # an undocumented instruction that other decoders may reject
6 | 
7 | require "json"
8 | 
9 | # TODO(ww): Remove this.
10 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so"
11 | 
12 | warn "[+] pass: filter-bddisasm-salc"
13 | 
14 | count = 0
15 | $stdin.each_line do |line|
16 | result = JSON.parse line, symbolize_names: true
17 | 
18 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO }
19 | 
20 | if bddisasm[:result] == "SALC"
21 | count += 1
22 | next
23 | end
24 | 
25 | $stdout.puts result.to_json
26 | end
27 | 
28 | warn "[+] pass: filter-bddisasm-salc done: #{count} filtered"
29 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-bddisasm-salc/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-bddisasm-salc
2 | desc: Filter out bddisasm results that decode to SALC
3 | run: filter-bddisasm-salc
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-bddisasm/filter-destroy-bddisasm:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-destroy-bddisasm: find results that only bddisasm gets right (or wrong)
5 | 
6 | require "json"
7 | 
8 | # TODO(ww): Remove this.
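# NOTE: outputs are matched to their workers by the shared-object path recorded
# in each output's worker_so field; these constants mirror entries in workers.spec.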
9 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so" 10 | XED_SO = "./src/worker/xed/xed.so" 11 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 12 | 13 | warn "[+] pass: filter-destroy-bddisasm" 14 | 15 | count = 0 16 | $stdin.each_line do |line| 17 | result = JSON.parse line, symbolize_names: true 18 | 19 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO } 20 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 21 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 22 | 23 | if bddisasm[:status][:value] == xed[:status][:value] && bddisasm[:status][:value] == zydis[:status][:value] 24 | count += 1 25 | next 26 | end 27 | 28 | $stdout.puts result.to_json 29 | end 30 | 31 | warn "[+] pass: filter-destroy-bddisasm done: #{count} filtered" 32 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-bddisasm/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-destroy-bddisasm 2 | desc: Find results that only bddisasm gets right (or wrong) 3 | run: filter-destroy-bddisasm 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-capstone/filter-destroy-capstone: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-destroy-capstone: find results that only capstone gets right (or wrong) 5 | 6 | require "json" 7 | 8 | # TODO(ww): Remove this. 9 | CAPSTONE_SO = "./src/worker/capstone/capstone.so" 10 | XED_SO = "./src/worker/xed/xed.so" 11 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 12 | 13 | warn "[+] pass: filter-destroy-capstone" 14 | 15 | count = 0 16 | $stdin.each_line do |line| 17 | result = JSON.parse line, symbolize_names: true 18 | 19 | capstone = result[:outputs].find { |o| o[:worker_so] == CAPSTONE_SO } 20 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 21 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 22 | 23 | if capstone[:status][:value] == xed[:status][:value] && capstone[:status][:value] == zydis[:status][:value] 24 | count += 1 25 | next 26 | end 27 | 28 | $stdout.puts result.to_json 29 | end 30 | 31 | warn "[+] pass: filter-destroy-capstone done: #{count} filtered" 32 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-capstone/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-destroy-capstone 2 | desc: Find results that only capstone gets right (or wrong) 3 | run: filter-destroy-capstone 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-ghidra/filter-destroy-ghidra: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-destroy-ghidra: find results that only ghidra gets right (or wrong) 5 | 6 | require "json" 7 | 8 | XED_SO = "./src/worker/xed/xed.so" 9 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 10 | ICED_SO = "./src/worker/iced/iced.so" 11 | GHIDRA_SO = "./src/worker/ghidra/ghidra.so" 12 | 13 | warn "[+] pass: filter-destroy-ghidra" 14 | 15 | count = 0 16 | $stdin.each_line do |line| 17 | result = JSON.parse line, symbolize_names: true 18 | 19 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 20 | zydis = result[:outputs].find { |o| 
o[:worker_so] == ZYDIS_SO }
21 | iced = result[:outputs].find { |o| o[:worker_so] == ICED_SO }
22 | ghidra = result[:outputs].find { |o| o[:worker_so] == GHIDRA_SO }
23 | 
24 | if ghidra[:status][:value] == xed[:status][:value] &&
25 | ghidra[:status][:value] == zydis[:status][:value] &&
26 | ghidra[:status][:value] == iced[:status][:value]
27 | count += 1
28 | next
29 | end
30 | 
31 | $stdout.puts result.to_json
32 | end
33 | 
34 | warn "[+] pass: filter-destroy-ghidra done: #{count} filtered"
35 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-ghidra/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-destroy-ghidra
2 | desc: Find results that only ghidra gets right (or wrong)
3 | run: filter-destroy-ghidra
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ghidra-lock/filter-ghidra-lock:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-ghidra-lock: filter out Ghidra results that decode to the "LOCK"
5 | # instruction. "LOCK" is a prefix, not a real instruction:
6 | # https://github.com/NationalSecurityAgency/ghidra/issues/2033#issue-645334803
7 | 
8 | require "json"
9 | 
10 | GHIDRA_SO = "./src/worker/ghidra/ghidra.so"
11 | 
12 | warn "[+] pass: filter-ghidra-lock"
13 | 
14 | count = 0
15 | $stdin.each_line do |line|
16 | result = JSON.parse line, symbolize_names: true
17 | 
18 | ghidra = result[:outputs].find { |o| o[:worker_so] == GHIDRA_SO }
19 | 
20 | if ghidra[:result] == "LOCK"
21 | count += 1
22 | next
23 | end
24 | 
25 | $stdout.puts result.to_json
26 | end
27 | 
28 | warn "[+] pass: filter-ghidra-lock done: #{count} filtered"
29 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ghidra-lock/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-ghidra-lock
2 | desc: Filter out ghidra results that decode to LOCK
3 | run: filter-ghidra-lock
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-incomparable/filter-incomparable:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-incomparable: remove any cohorts whose results can't be compared,
5 | # i.e.
any cohorts that have fewer than two successful results
6 | 
7 | require "json"
8 | 
9 | warn "[+] pass: filter-incomparable"
10 | 
11 | def success?(decoder)
12 | decoder[:status][:value] == 1
13 | end
14 | 
15 | count = 0
16 | $stdin.each_line do |line|
17 | result = JSON.parse line, symbolize_names: true
18 | 
19 | outputs = result[:outputs]
20 | successes = outputs.count { |o| success?(o) }
21 | if successes < 2
22 | count += 1
23 | next
24 | end
25 | 
26 | $stdout.puts result.to_json
27 | end
28 | 
29 | warn "[+] pass: filter-incomparable done: #{count} filtered"
30 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-incomparable/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-incomparable
2 | desc: Filter out cohorts whose results can't be compared (i.e., that have fewer than two successes)
3 | run: filter-incomparable
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-different/filter-ndecoded-different:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-ndecoded-different: remove any cohorts where one or more outputs
5 | # consumed different amounts of the input
6 | #
7 | # NOTE: Observe that "decoded the same number of bytes" is *not* the same
8 | # as "decoded to the same instruction". As such, this pass will probably produce
9 | # false negatives if your goal is to find instructions of the same size that decode
10 | # to different things.
11 | 
12 | require "json"
13 | 
14 | warn "[+] pass: filter-ndecoded-different"
15 | 
16 | count = 0
17 | $stdin.each_line do |line|
18 | result = JSON.parse line, symbolize_names: true
19 | 
20 | outputs_ndecoded = result[:outputs].map { |o| o[:ndecoded] }
21 | if outputs_ndecoded.uniq.size > 1
22 | count += 1
23 | next
24 | end
25 | 
26 | $stdout.puts result.to_json
27 | end
28 | 
29 | warn "[+] pass: filter-ndecoded-different done: #{count} filtered"
30 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-different/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-ndecoded-different
2 | desc: Filter out any cohorts where one or more outputs consumed different amounts of the input
3 | run: filter-ndecoded-different
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-same/filter-ndecoded-same:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-ndecoded-same: remove any cohorts where all outputs consumed
5 | # exactly the same number of input bytes.
6 | #
7 | # NOTE: Observe that "decoded the same number of bytes" is *not* the same
8 | # as "decoded to the same instruction". As such, this pass will probably produce
9 | # false negatives if your goal is to find instructions of the same size that decode
10 | # to different things.
11 | 
12 | require "json"
13 | 
14 | warn "[+] pass: filter-ndecoded-same"
15 | 
16 | count = 0
17 | $stdin.each_line do |line|
18 | result = JSON.parse line, symbolize_names: true
19 | 
20 | outputs = result[:outputs]
21 | if outputs.all?
{ |o| o[:ndecoded] == outputs.first[:ndecoded] } 22 | count += 1 23 | next 24 | end 25 | 26 | $stdout.puts result.to_json 27 | end 28 | 29 | warn "[+] pass: filter-ndecoded-same done: #{count} filtered" 30 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-same/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-ndecoded-same 2 | desc: Filter out any cohorts where all outputs consumed exactly the same number of input bytes 3 | run: filter-ndecoded-same 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-overaccept/filter-xed-find-overaccept: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-xed-find-overaccept: find inputs that XED potentially overaccepts 5 | # (i.e., inputs the other high-quality decoders think are invalid) 6 | 7 | require "json" 8 | 9 | # TODO(ww): Remove this. 10 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so" 11 | XED_SO = "./src/worker/xed/xed.so" 12 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 13 | ICED_SO = "./src/worker/iced/iced.so" 14 | 15 | def success?(decoder) 16 | decoder[:status][:value] == 1 17 | end 18 | 19 | def failure?(decoder) 20 | !success?(decoder) 21 | end 22 | 23 | def failure_by_consensus?(*decoders) 24 | nfailures = decoders.select { |d| failure?(d) }.size 25 | 26 | (nfailures / decoders.size.to_f) > 0.50 27 | end 28 | 29 | warn "[+] pass: filter-xed-find-overaccept" 30 | 31 | count = 0 32 | $stdin.each_line do |line| 33 | result = JSON.parse line, symbolize_names: true 34 | 35 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO } 36 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 37 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 38 | iced = result[:outputs].find { |o| o[:worker_so] == ICED_SO } 39 | 40 | # If XED reports success when other high-quality decoders don't, keep it; 41 | # otherwise, count it as filtered. 42 | if success?(xed) && failure_by_consensus?(bddisasm, zydis, iced) 43 | $stdout.puts result.to_json 44 | else 45 | count += 1 46 | end 47 | end 48 | 49 | warn "[+] pass: filter-xed-find-overaccept done: #{count} filtered" 50 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-overaccept/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-xed-find-overaccept 2 | desc: Find results that XED potentially overaccepts 3 | run: filter-xed-find-overaccept 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-underaccept/filter-xed-find-underaccept: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-xed-find-underaccept: find inputs that XED potentially underaccepts 5 | # (i.e., inputs the other high-quality decoders think are valid) 6 | 7 | require "json" 8 | 9 | # TODO(ww): Remove this.
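#
# For illustration only (hypothetical hashes, not part of the pass): with
# three reference decoders, success_by_consensus? below demands a strict
# majority, so two of the three must succeed:
#
#   ok  = { status: { value: 1 } }
#   bad = { status: { value: 0 } }
#   success_by_consensus?(ok, ok, bad)  # => true  (2/3 > 0.50)
#   success_by_consensus?(ok, bad, bad) # => false (1/3 is not > 0.50)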
10 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so" 11 | XED_SO = "./src/worker/xed/xed.so" 12 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 13 | ICED_SO = "./src/worker/iced/iced.so" 14 | 15 | def success?(decoder) 16 | decoder[:status][:value] == 1 17 | end 18 | 19 | def failure?(decoder) 20 | !success?(decoder) 21 | end 22 | 23 | def success_by_consensus?(*decoders) 24 | nsuccesses = decoders.select { |d| success?(d) }.size 25 | 26 | (nsuccesses / decoders.size.to_f) > 0.50 27 | end 28 | 29 | warn "[+] pass: filter-xed-find-underaccept" 30 | 31 | count = 0 32 | $stdin.each_line do |line| 33 | result = JSON.parse line, symbolize_names: true 34 | 35 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO } 36 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 37 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 38 | iced = result[:outputs].find { |o| o[:worker_so] == ICED_SO } 39 | 40 | # If XED reports failure when other high-quality decoders don't, keep it; 41 | # otherwise, count it as filtered. 42 | if failure?(xed) && success_by_consensus?(bddisasm, zydis, iced) 43 | $stdout.puts result.to_json 44 | else 45 | count += 1 46 | end 47 | end 48 | 49 | warn "[+] pass: filter-xed-find-underaccept done: #{count} filtered" 50 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-underaccept/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-xed-find-underaccept 2 | desc: Find results that XED potentially underaccepts 3 | run: filter-xed-find-underaccept 4 | -------------------------------------------------------------------------------- /src/analysis/pass/find-size-discrepancies/find-size-discrepancies: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # find-size-discrepancies: find cohorts whose successful results differ in decoded size. 5 | 6 | require "json" 7 | 8 | warn "[+] pass: find-size-discrepancies" 9 | 10 | def success?(decoder) 11 | decoder[:status][:value] == 1 12 | end 13 | 14 | count = 0 15 | $stdin.each_line do |line| 16 | result = JSON.parse line, symbolize_names: true 17 | 18 | outputs = result[:outputs] 19 | successes = outputs.select { |o| success?
o } 20 | 21 | if successes.map { |o| o[:ndecoded] }.uniq.size == 1 22 | count += 1 23 | next 24 | end 25 | 26 | $stdout.puts result.to_json 27 | end 28 | 29 | warn "[+] pass: find-size-discrepancies done: #{count} filtered" 30 | -------------------------------------------------------------------------------- /src/analysis/pass/find-size-discrepancies/spec.yml: -------------------------------------------------------------------------------- 1 | name: find-size-discrepancies 2 | desc: Find results that have at least two successful results that disagree on size 3 | run: find-size-discrepancies 4 | -------------------------------------------------------------------------------- /src/analysis/pass/minimize-input/minimize-input: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # minimize-input: trim each cohort's input to no greater than the longest output's decode length 5 | 6 | require "json" 7 | 8 | warn "[+] pass: minimize-input" 9 | 10 | count = 0 11 | $stdin.each_line do |line| 12 | result = JSON.parse line, symbolize_names: true 13 | 14 | max_ndecoded = result[:outputs].map { |o| o[:ndecoded] }.max 15 | 16 | # If the maximum ndecoded is 0, then all are 0 and we should skip 17 | # this cohort entirely. 18 | # In effect, this is probably identical to filter-all-failure. 19 | if max_ndecoded.zero? 20 | count += 1 21 | next 22 | end 23 | 24 | # input is hex formatted, so the trimmed length is max_ndecoded * 2 25 | result[:input] = result[:input][0, max_ndecoded * 2] 26 | 27 | $stdout.puts result.to_json 28 | end 29 | 30 | warn "[+] pass: minimize-input done: #{count} filtered" 31 | -------------------------------------------------------------------------------- /src/analysis/pass/minimize-input/spec.yml: -------------------------------------------------------------------------------- 1 | name: minimize-input 2 | desc: Trim each cohort's input to no greater than the longest output's decode length 3 | run: minimize-input 4 | -------------------------------------------------------------------------------- /src/analysis/pass/normalize/normalize: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # normalize: perform some basic normalization of each worker's output 5 | 6 | require "json" 7 | 8 | warn "[+] pass: normalize" 9 | 10 | $stdin.each_line do |line| 11 | result = JSON.parse line, symbolize_names: true 12 | 13 | result[:outputs].map! do |output| 14 | next output if output[:result].nil? 
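# For illustration (hypothetical worker output): a result such as
# "lock \t add  byte ptr [rax], al" collapses below to
# "lock add byte ptr [rax], al", so outputs that differ only in
# whitespace compare equal in later passes.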
15 | 16 | normalized = output[:result].each_line.map do |dec_line| 17 | dec_line.split.join(" ") 18 | end.join("\n") 19 | 20 | output[:result] = normalized 21 | output[:len] = normalized.size 22 | output 23 | end 24 | 25 | $stdout.puts result.to_json 26 | end 27 | 28 | warn "[+] pass: normalize done" 29 | -------------------------------------------------------------------------------- /src/analysis/pass/normalize/spec.yml: -------------------------------------------------------------------------------- 1 | name: normalize 2 | desc: Normalize analysis results 3 | run: normalize 4 | -------------------------------------------------------------------------------- /src/analysis/passes.yml: -------------------------------------------------------------------------------- 1 | default: 2 | - filter-all-failure 3 | - filter-ndecoded-same 4 | - dedupe 5 | - minimize-input 6 | - normalize 7 | 8 | # Run just the dedupe pass, as a convenient filter. 9 | dedupe: 10 | - dedupe 11 | 12 | # Find inputs that all workers agree are one size, but one or more 13 | # decodes differently. 14 | same-size-different-decodings: 15 | - filter-any-failure 16 | - filter-ndecoded-different 17 | - filter-same-effects 18 | - minimize-input 19 | - normalize 20 | 21 | # Finds disagreements in size between workers. 22 | size-discrepancies: 23 | - filter-all-failure 24 | - filter-ndecoded-same 25 | - filter-incomparable 26 | - dedupe 27 | - find-size-discrepancies 28 | - minimize-input 29 | - normalize 30 | 31 | # Find inputs that not all workers either succeed or fail on. 32 | status-discrepancies: 33 | - filter-all-failure 34 | - filter-all-success 35 | - dedupe 36 | - minimize-input 37 | - normalize 38 | 39 | destroy-capstone: 40 | - filter-all-success 41 | - filter-ndecoded-same 42 | - dedupe 43 | - filter-destroy-capstone 44 | - minimize-input 45 | - normalize 46 | 47 | destroy-bddisasm: 48 | - filter-all-success 49 | - filter-ndecoded-same 50 | - dedupe 51 | - filter-destroy-bddisasm 52 | - minimize-input 53 | - normalize 54 | 55 | destroy-ghidra: 56 | - filter-all-success 57 | - filter-ndecoded-same 58 | - dedupe 59 | - normalize 60 | - filter-ghidra-lock 61 | - filter-destroy-ghidra 62 | - minimize-input 63 | 64 | xed-overaccept: 65 | - filter-all-success 66 | - filter-ndecoded-same 67 | - dedupe 68 | - filter-xed-find-overaccept 69 | - minimize-input 70 | - normalize 71 | 72 | xed-underaccept: 73 | - filter-all-success 74 | - filter-ndecoded-same 75 | - dedupe 76 | - filter-bddisasm-salc 77 | - filter-xed-find-underaccept 78 | - minimize-input 79 | - normalize 80 | -------------------------------------------------------------------------------- /src/include/mish_common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef DEBUG 4 | #define DLOG(fmt, ...) \ 5 | fprintf(stderr, "%s:%d %s: " fmt "\n", __FILE__, __LINE__, __func__, ##__VA_ARGS__); 6 | #undef NDEBUG 7 | #define _unused(x) 8 | #else 9 | #define DLOG(...) 10 | #define NDEBUG 11 | #define _unused(x) ((void)(x)) 12 | #endif 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define MISHEGOS_INSN_MAXLEN 15 19 | #define MISHEGOS_DEC_MAXLEN 248 20 | // This limit is rather arbitrary at the moment. 
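// (For context, a sketch of the size math rather than a constraint: with
// MISHEGOS_DEC_MAXLEN at 248 and the usual 4-byte enum, the packed
// output_slot below comes to 4 + 2 + 2 + 248 = 256 bytes, which the
// static_assert at the bottom of this header pins down.)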
21 | #define MISHEGOS_MAX_NWORKERS 31 22 | 23 | typedef enum { 24 | S_NONE = 0, 25 | S_SUCCESS, 26 | S_FAILURE, 27 | S_CRASH, 28 | S_PARTIAL, 29 | S_UNKNOWN, 30 | } decode_status; 31 | 32 | typedef enum { 33 | W_IGNORE_CRASHES, 34 | } worker_config_mask; 35 | 36 | typedef struct { 37 | uint8_t len; 38 | uint8_t raw_insn[MISHEGOS_INSN_MAXLEN]; 39 | } input_slot; 40 | 41 | typedef struct __attribute__((packed)) { 42 | decode_status status; 43 | uint16_t ndecoded; 44 | uint16_t len; 45 | char result[MISHEGOS_DEC_MAXLEN]; 46 | } output_slot; 47 | static_assert(sizeof(output_slot) == 256, "output_slot should be 256 bytes"); 48 | -------------------------------------------------------------------------------- /src/mish2jsonl/Makefile: -------------------------------------------------------------------------------- 1 | SRCS = $(wildcard *.c) 2 | OBJS = $(SRCS:.c=.o) 3 | 4 | PROG = mish2jsonl 5 | 6 | .PHONY: all 7 | all: $(PROG) 8 | 9 | $(PROG): $(OBJS) 10 | 11 | .PHONY: clean 12 | clean: 13 | rm -rf $(PROG) $(OBJS) 14 | 15 | -------------------------------------------------------------------------------- /src/mish2jsonl/mish2jsonl.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "mish_common.h" 9 | 10 | typedef struct m_string { 11 | uint64_t len; 12 | char *string; 13 | } m_string; 14 | 15 | typedef struct worker_output { 16 | uint32_t status; // 4 17 | uint16_t ndecoded; // 2 18 | uint32_t workerno; // 4 19 | m_string workerso; // 16 20 | m_string result; // 16 21 | } worker_output; 22 | 23 | typedef struct cohort_results { 24 | uint32_t nworkers; 25 | input_slot input; 26 | worker_output *outputs; 27 | } cohort_results; 28 | 29 | static cohort_results results; 30 | static int m_finished_parsing; 31 | 32 | static const char *status2str(decode_status status) { 33 | switch (status) { 34 | case S_NONE: 35 | return "none"; 36 | case S_SUCCESS: 37 | return "success"; 38 | case S_FAILURE: 39 | return "failure"; 40 | case S_CRASH: 41 | return "crash"; 42 | case S_PARTIAL: 43 | return "partial"; 44 | case S_UNKNOWN: 45 | default: 46 | return "unknown"; 47 | } 48 | } 49 | 50 | static void m_cohort_print_json(FILE *f, cohort_results *r) { 51 | char hexbuf[MISHEGOS_INSN_MAXLEN * 2 + 1]; 52 | for (size_t i = 0; i < r->input.len; i++) { 53 | hexbuf[i * 2] = "0123456789abcdef"[r->input.raw_insn[i] / 0x10]; 54 | hexbuf[i * 2 + 1] = "0123456789abcdef"[r->input.raw_insn[i] % 0x10]; 55 | } 56 | hexbuf[r->input.len * 2] = '\0'; 57 | fprintf(f, "{ \"nworkers\": %u, \"input\": \"%s\", \"outputs\": [", r->nworkers, hexbuf); 58 | for (int i = 0; i < r->nworkers; i++) { 59 | if (i != 0) { 60 | fprintf(f, ","); 61 | } 62 | fprintf(f, 63 | "{ \"status\": { \"value\": %u, \"name\": \"%s\" }, \"ndecoded\": %u, \"workerno\": " 64 | "%u, \"worker_so\": \"%s\",\"len\": %ld, \"result\": \"%s\" }", 65 | r->outputs[i].status, status2str(r->outputs[i].status), r->outputs[i].ndecoded, 66 | r->outputs[i].workerno, r->outputs[i].workerso.string, r->outputs[i].result.len, 67 | r->outputs[i].result.string); 68 | } 69 | 70 | fprintf(f, "]}"); 71 | return; 72 | } 73 | 74 | static void m_fread(void *ref, size_t size, size_t times, FILE *file) { 75 | size_t rd = fread(ref, size, times, file); 76 | 77 | // reading a 0 length string can be valid if disassembeling failed 78 | if (rd == 0 && size * times != 0) { 79 | m_finished_parsing = 1; 80 | } 81 | } 82 | 83 | /* 84 | There are a few subtleties that 
this function catches: 85 | 1) Not all strings we read are actually null-terminated, even 86 | when we read the entire string; the trailing junk would end 87 | up in, for example, an output file and crash the analysis 88 | tooling downstream. 89 | 90 | 2) Some disassemblers like to insert \n in their output, which 91 | keeps values from being valid single-line strings. 92 | 93 | 3) At the moment we allocate the string's memory here. Using 94 | malloc would leave old data visible behind short strings, 95 | hence the use of calloc. If this is only ever used for 96 | printing one-by-one, the allocation can be optimized out. 97 | 98 | 99 | We implicitly calloc 1 extra byte to guarantee that the string 100 | ends with a null byte. 101 | */ 102 | 103 | static void read_string(FILE *file, m_string *s, int len_size) { 104 | uint64_t string_length = 0; 105 | m_fread(&string_length, len_size, 1, file); 106 | 107 | // calloc instead of malloc because we want to zero out the memory. 108 | char *input = calloc(1, sizeof(char) * string_length + 1); 109 | // we tend to reuse the same memory a lot (this could be optimized out) 110 | m_fread(input, sizeof(char), string_length, file); 111 | 112 | int newsize = strcspn(input, "\n"); 113 | input[newsize] = '\0'; 114 | s->len = newsize; // should this be new or old size? 115 | s->string = input; 116 | } 117 | 118 | static int read_next(FILE *file) { 119 | fread(&results.nworkers, sizeof(uint32_t), 1, file); 120 | results.outputs = malloc(sizeof(worker_output) * results.nworkers); 121 | m_fread(&results.input, sizeof(results.input), 1, file); 122 | 123 | for (int i = 0; i < results.nworkers; i++) { 124 | read_string(file, &results.outputs[i].workerso, 8); 125 | m_fread(&results.outputs[i].status, sizeof(uint32_t), 1, file); 126 | m_fread(&results.outputs[i].ndecoded, sizeof(uint16_t), 1, file); 127 | 128 | read_string(file, &results.outputs[i].result, 2); 129 | } 130 | 131 | return m_finished_parsing == 0; 132 | } 133 | 134 | static void free_cohort_results(cohort_results *result) { 135 | for (int i = 0; i < result->nworkers; i++) { 136 | free(result->outputs[i].workerso.string); 137 | free(result->outputs[i].result.string); 138 | } 139 | free(result->outputs); 140 | } 141 | 142 | void m_print_results_json(FILE *input_file, FILE *output_file) { 143 | m_finished_parsing = 0; 144 | int is_first = 1; 145 | 146 | fprintf(output_file, "["); 147 | while (read_next(input_file)) { 148 | if (!m_finished_parsing) { 149 | if (!is_first) { 150 | fprintf(output_file, ","); 151 | } 152 | m_cohort_print_json(output_file, &results); 153 | free_cohort_results(&results); 154 | fprintf(output_file, "\n"); 155 | is_first = 0; 156 | } 157 | } 158 | fprintf(output_file, "]"); 159 | } 160 | 161 | void m_print_results_jsonl(FILE *input_file, FILE *output_file) { 162 | m_finished_parsing = 0; 163 | 164 | while (read_next(input_file)) { 165 | m_cohort_print_json(output_file, &results); 166 | free_cohort_results(&results); 167 | fprintf(output_file, "\n"); 168 | } 169 | } 170 | 171 | int main(int argc, char **argv) { 172 | enum { JSON, JSONL } mode = JSONL; 173 | int opt; 174 | while ((opt = getopt(argc, argv, "hn")) != -1) { 175 | switch (opt) { 176 | case 'h': 177 | fprintf(stdout, 178 | "Convert mishegos output to JSON or JSONL\n" 179 | "OPTIONS: -n switches output from JSONL (default) to JSON\n" 180 | "Usage: %s [-n] [input]\n", 181 | argv[0]); 182 | return 0; 183 | case 'n': 184 | mode = JSON; 185 | break; 186 | default: 187 | fprintf(stderr, "Usage: %s [-n] [input]\n", argv[0]);
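/* A hypothetical end-to-end invocation, for context (paths are
 * illustrative, relative to the repository root):
 *
 *   ./src/mishegos/mishegos ./workers.spec > mishegos.out
 *   ./src/mish2jsonl/mish2jsonl mishegos.out > results.jsonl
 */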
188 | return 1; 189 | } 190 | } 191 | 192 | // Default to stdin. 193 | FILE *input; 194 | if (argc - optind != 1) { 195 | input = stdin; 196 | } else { 197 | input = fopen(argv[optind], "r"); 198 | if (input == NULL) { 199 | err(errno, "fopen"); 200 | } 201 | } 202 | 203 | if (mode == JSONL) { 204 | m_print_results_jsonl(input, stdout); 205 | } else { 206 | m_print_results_json(input, stdout); 207 | } 208 | 209 | fclose(input); 210 | return 0; 211 | } 212 | -------------------------------------------------------------------------------- /src/mishegos/Makefile: -------------------------------------------------------------------------------- 1 | SRCS = $(wildcard *.c) 2 | OBJS = $(SRCS:.c=.o) 3 | 4 | PROG = mishegos 5 | 6 | .PHONY: all 7 | all: $(PROG) 8 | 9 | $(PROG): $(OBJS) 10 | 11 | .PHONY: clean 12 | clean: 13 | rm -rf $(PROG) $(OBJS) 14 | -------------------------------------------------------------------------------- /src/mishegos/mishegos.c: -------------------------------------------------------------------------------- 1 | 2 | #include "mish_common.h" 3 | #include "mutator.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #define WITH_FUTEX 27 | 28 | typedef struct { 29 | _Atomic uint32_t val; 30 | #ifdef WITH_FUTEX 31 | _Atomic uint32_t waiters; 32 | #endif 33 | } mish_atomic_uint; 34 | 35 | static void mish_atomic_wait_for(mish_atomic_uint *var, uint32_t target) { 36 | uint32_t old; 37 | size_t cnt = 0; 38 | while ((old = atomic_load(&var->val)) != target) { 39 | #ifdef __x86_64__ 40 | __asm__ volatile("pause"); 41 | #endif 42 | (void)cnt; 43 | #ifdef WITH_FUTEX 44 | if (++cnt > 10000) { 45 | atomic_fetch_add_explicit(&var->waiters, 1, memory_order_relaxed); 46 | syscall(SYS_futex, &var->val, FUTEX_WAIT, old, NULL); 47 | atomic_fetch_sub_explicit(&var->waiters, 1, memory_order_relaxed); 48 | } 49 | #endif 50 | } 51 | } 52 | 53 | static uint32_t mish_atomic_fetch_add(mish_atomic_uint *var, uint32_t val) { 54 | return atomic_fetch_add(&var->val, val); 55 | } 56 | 57 | static uint32_t mish_atomic_load(mish_atomic_uint *var) { 58 | return atomic_load(&var->val); 59 | } 60 | 61 | static void mish_atomic_store(mish_atomic_uint *var, uint32_t val) { 62 | atomic_store(&var->val, val); 63 | } 64 | 65 | static void mish_atomic_notify(mish_atomic_uint *var) { 66 | #ifdef WITH_FUTEX 67 | if (atomic_load_explicit(&var->waiters, memory_order_relaxed)) 68 | syscall(SYS_futex, &var->val, FUTEX_WAKE, INT_MAX); 69 | #endif 70 | } 71 | 72 | #define MISHEGOS_NUM_SLOTS_PER_CHUNK 4096 73 | #define MISHEGOS_NUM_CHUNKS 16 74 | 75 | typedef struct { 76 | mish_atomic_uint generation; 77 | mish_atomic_uint remaining_workers; 78 | uint32_t input_count; 79 | input_slot inputs[MISHEGOS_NUM_SLOTS_PER_CHUNK]; 80 | } input_chunk; 81 | 82 | typedef struct { 83 | mish_atomic_uint remaining; 84 | output_slot outputs[MISHEGOS_NUM_SLOTS_PER_CHUNK]; 85 | } output_chunk; 86 | 87 | struct worker_config { 88 | size_t soname_len; 89 | const char *soname; 90 | int workerno; 91 | input_chunk *input_chunks; 92 | output_chunk *output_chunks; 93 | size_t start_gen; 94 | size_t start_idx; 95 | bool sigchld; 96 | pthread_t thread; 97 | pid_t pid; 98 | }; 99 | 100 | static struct worker_config workers[MISHEGOS_MAX_NWORKERS]; 101 | 102 | static void *alloc_shared(size_t size) { 103 | void *res 
= mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON | MAP_POPULATE, -1, 0); 104 | if (res == MAP_FAILED) { 105 | perror("mmap"); 106 | exit(1); 107 | } 108 | return res; 109 | } 110 | 111 | static void *worker(void *wc_vp) { 112 | const struct worker_config *wc = wc_vp; 113 | void *so = dlopen(wc->soname, RTLD_LAZY); 114 | if (!so) { 115 | perror(wc->soname); 116 | return NULL; 117 | } 118 | 119 | void (*worker_ctor)() = (void (*)())dlsym(so, "worker_ctor"); 120 | void (*worker_dtor)() = (void (*)())dlsym(so, "worker_dtor"); 121 | typedef void (*try_decode_t)(output_slot * result, uint8_t * raw_insn, uint8_t length); 122 | try_decode_t try_decode = (try_decode_t)dlsym(so, "try_decode"); 123 | char *worker_name = *((char **)dlsym(so, "worker_name")); 124 | 125 | if (worker_ctor != NULL) { 126 | worker_ctor(); 127 | } 128 | 129 | uint32_t gen = wc->start_gen; 130 | size_t idx = wc->start_idx; 131 | 132 | input_chunk *input_chunks = wc->input_chunks; 133 | output_chunk *output_chunks = wc->output_chunks; 134 | while (1) { 135 | mish_atomic_wait_for(&input_chunks[idx].generation, gen); 136 | 137 | /* Track remaining slots; if we crash, we know where we are. If we start 138 | * with a non-zero remaining count, we continue where we left, but skip the 139 | * slot that caused us to crash. */ 140 | size_t old_remaining = mish_atomic_load(&output_chunks[idx].remaining); 141 | size_t start = old_remaining == 0 ? 0 : input_chunks[idx].input_count - old_remaining + 1; 142 | mish_atomic_store(&output_chunks[idx].remaining, input_chunks[idx].input_count - start); 143 | for (size_t i = start; i < input_chunks[idx].input_count; i++) { 144 | output_chunks[idx].outputs[i].len = 0; 145 | output_chunks[idx].outputs[i].ndecoded = 0; 146 | try_decode(&output_chunks[idx].outputs[i], input_chunks[idx].inputs[i].raw_insn, 147 | input_chunks[idx].inputs[i].len); 148 | /* Note: this is no atomic subtraction. It atomic, however, to ensure that 149 | * the decode result is written to memory before we decrement the counter */ 150 | mish_atomic_store(&output_chunks[idx].remaining, input_chunks[idx].input_count - i - 1); 151 | } 152 | 153 | if (mish_atomic_fetch_add(&input_chunks[idx].remaining_workers, -1) == 1) 154 | mish_atomic_notify(&input_chunks[idx].remaining_workers); 155 | 156 | /* Not getting a full chunk indicates that we are exiting. */ 157 | if (input_chunks[idx].input_count != MISHEGOS_NUM_SLOTS_PER_CHUNK) 158 | break; 159 | 160 | idx++; 161 | if (idx == MISHEGOS_NUM_CHUNKS) { 162 | idx = 0; 163 | gen++; 164 | } 165 | } 166 | 167 | if (worker_dtor != NULL) { 168 | worker_dtor(); 169 | } 170 | dlclose(so); 171 | 172 | return NULL; 173 | } 174 | 175 | /* By default, filter all inputs which all decoders identify as invalid. 
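* (So with the defaults below, a sample is kept as soon as at least one
* decoder reports success; see the -s handling in main for how negative
* bounds are remapped relative to nworkers.)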
*/ 176 | static int filter_min_success = 1; 177 | static int filter_max_success = MISHEGOS_MAX_NWORKERS; 178 | static bool filter_ndecoded_same = false; 179 | 180 | static void process(size_t slot, size_t idx, input_chunk *input_chunks, int nworkers, 181 | struct worker_config *workers) { 182 | int num_success = 0; 183 | bool ndecoded_same = true; 184 | int last_ndecoded = -1; 185 | for (int j = 0; j < nworkers; j++) { 186 | output_slot *output = &workers[j].output_chunks[slot].outputs[idx]; 187 | num_success += output->status == S_SUCCESS; 188 | if (output->status == S_SUCCESS) { 189 | if (last_ndecoded == -1) 190 | last_ndecoded = output->ndecoded; 191 | else if (last_ndecoded != output->ndecoded) 192 | ndecoded_same = false; 193 | } 194 | } 195 | if (num_success >= filter_min_success && num_success <= filter_max_success) 196 | goto keep; 197 | if (filter_ndecoded_same && !ndecoded_same) 198 | goto keep; 199 | return; 200 | 201 | keep:; 202 | fwrite(&nworkers, sizeof(nworkers), 1, stdout); 203 | 204 | input_slot *input = &input_chunks[slot].inputs[idx]; 205 | fwrite(input, sizeof(*input), 1, stdout); 206 | for (int j = 0; j < nworkers; j++) { 207 | fwrite(&workers[j].soname_len, sizeof(workers[j].soname_len), 1, stdout); 208 | fwrite(workers[j].soname, 1, workers[j].soname_len, stdout); 209 | 210 | output_slot *output = &workers[j].output_chunks[slot].outputs[idx]; 211 | static_assert(offsetof(output_slot, result) == sizeof(output_slot) - MISHEGOS_DEC_MAXLEN, 212 | "expect result buffer to be at end of slot"); 213 | fwrite(output, sizeof(*output) - MISHEGOS_DEC_MAXLEN + output->len, 1, stdout); 214 | } 215 | } 216 | 217 | static int worker_for_pid(pid_t pid) { 218 | for (int i = 0; i < MISHEGOS_MAX_NWORKERS; i++) { 219 | if (workers[i].pid == pid) { 220 | return i; 221 | } 222 | } 223 | return -1; 224 | } 225 | 226 | static bool thread_mode = false; 227 | 228 | static void worker_start(struct worker_config *wc) { 229 | if (thread_mode) { 230 | pthread_create(&wc->thread, NULL, worker, wc); 231 | } else { 232 | /* pipe to notify child that we are ready. */ 233 | int pipe_fds[2]; 234 | char tmp = 0; 235 | if (pipe(pipe_fds) < 0) { 236 | perror("pipe"); 237 | exit(1); 238 | } 239 | 240 | pid_t child = fork(); 241 | if (child < 0) { 242 | perror("fork"); 243 | exit(1); 244 | } else if (child == 0) { 245 | prctl(PR_SET_PDEATHSIG, SIGHUP); 246 | close(pipe_fds[1]); 247 | if (read(pipe_fds[0], &tmp, 1) != 1) { 248 | /* parent died without us being killed by SIGHUP -- so exit. */ 249 | exit(1); 250 | } 251 | close(pipe_fds[0]); 252 | worker(wc); 253 | exit(0); 254 | } 255 | wc->pid = child; 256 | close(pipe_fds[0]); 257 | write(pipe_fds[1], &tmp, 1); 258 | close(pipe_fds[1]); 259 | } 260 | } 261 | 262 | static void sigchld_handler(int sig) { 263 | (void)sig; 264 | 265 | /* Multiple children might have died at the same time, but we get only one signal. */ 266 | int wstatus; 267 | pid_t wpid; 268 | while ((wpid = waitpid(-1, &wstatus, WNOHANG)) > 0) { 269 | int workerno = worker_for_pid(wpid); 270 | assert(workerno >= 0); 271 | if (workerno < 0) { 272 | /* worker died before we even had the chance to store its pid. */ 273 | abort(); 274 | } 275 | input_chunk *ic = workers[workerno].input_chunks; 276 | output_chunk *oc = workers[workerno].output_chunks; 277 | for (size_t widx = 0; widx < MISHEGOS_NUM_CHUNKS; widx++) { 278 | uint32_t remaining = mish_atomic_load(&oc[widx].remaining); 279 | if (remaining == 0) 280 | continue; 281 | /* we found the position where the worker crashed. 
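* (ic[widx].input_count - remaining indexes the slot that was in flight:
* remaining counts down as slots complete, so the first unfinished slot
* is the one that killed the worker.)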
*/ 282 | oc[widx].outputs[ic[widx].input_count - remaining].status = S_CRASH; 283 | /* update generation and chunk index so that worker can restart. */ 284 | workers[workerno].start_gen = mish_atomic_load(&ic[widx].generation); 285 | workers[workerno].start_idx = widx; 286 | /* Mark worker as sigchld-received s.t. we can restart them. We obviously 287 | * can't do that in a signal handler. */ 288 | workers[workerno].sigchld = true; 289 | /* Reduce remaining_workers temporarily s.t. we always wake up. No need to 290 | * explicitly wake, however: the futex syscall will be restarted and 291 | * detect that the value changed */ 292 | mish_atomic_fetch_add(&ic[widx].remaining_workers, -1); 293 | break; 294 | } 295 | /* We might get here because the worker terminated ordinarily -- ignore. 296 | * There's also the case that the worker crashed outside decoding. This must 297 | * be a bug and therefore should never happen(TM). Ignore this case, too. */ 298 | } 299 | } 300 | 301 | int main(int argc, char **argv) { 302 | const char *mutator_name = NULL; 303 | 304 | int opt; 305 | while ((opt = getopt(argc, argv, "htm:s:n")) != -1) { 306 | switch (opt) { 307 | case 't': 308 | thread_mode = false; 309 | break; 310 | case 'm': 311 | mutator_name = optarg; 312 | break; 313 | case 's': { 314 | char *next; 315 | /* Both values are capped to nworkers below, s.t. -1 => nworkers - 1. */ 316 | filter_min_success = strtol(optarg, &next, 0); 317 | if (*next == ':') 318 | filter_max_success = strtol(next + 1, &next, 0); 319 | if (*next != '\0') 320 | errx(1, "-s needs format or :"); 321 | break; 322 | } 323 | case 'n': 324 | filter_ndecoded_same = true; 325 | break; 326 | case 'h': 327 | default: 328 | fprintf(stderr, "usage: %s [-t] [-m mutator] [-s min[:max]] [-n]\n", argv[0]); 329 | fprintf(stderr, " -t: use thread mode\n"); 330 | fprintf(stderr, " -m: specify mutator\n"); 331 | fprintf(stderr, " -s: keep samples where success count is in range; default is 1:-1\n"); 332 | fprintf(stderr, " (0 = all; 1 = #success >= 1; -1 = #success = nworkers - 1;\n"); 333 | fprintf(stderr, " 1:-2 = #success >= 1 && <= nworkers - 1;\n"); 334 | fprintf(stderr, " 1:0 = filter all (e.g., for use with -n); etc.)\n"); 335 | fprintf(stderr, " -n: keep samples where successful ndecoded differs\n"); 336 | return 1; 337 | } 338 | } 339 | 340 | if (optind + 1 != argc) { 341 | fprintf(stderr, "expected worker file as positional argument\n"); 342 | return 1; 343 | } 344 | 345 | if (!thread_mode) { 346 | struct sigaction sigchld_action = {0}; 347 | sigchld_action.sa_handler = sigchld_handler; 348 | sigchld_action.sa_flags = SA_NOCLDSTOP; 349 | if (sigaction(SIGCHLD, &sigchld_action, NULL)) { 350 | perror("sigaction"); 351 | return 1; 352 | } 353 | } 354 | 355 | mutator_t mutator = mutator_create(mutator_name); 356 | 357 | FILE *file = fopen(argv[optind], "r"); 358 | if (file == NULL) { 359 | perror(argv[optind]); 360 | return 1; 361 | } 362 | 363 | input_chunk *input_chunks = alloc_shared(sizeof(input_chunk) * MISHEGOS_NUM_CHUNKS); 364 | 365 | int nworkers = 0; 366 | uint64_t gen = 1; 367 | uint64_t idx = 0; 368 | 369 | while (nworkers < MISHEGOS_MAX_NWORKERS) { 370 | size_t size = 0; 371 | char *line = NULL; 372 | if (getline(&line, &size, file) < 0 || feof(file) != 0) { 373 | break; 374 | } 375 | if (line[0] == '#') { 376 | continue; 377 | } 378 | 379 | /* getline retains the newline if present, so chop it off. 
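* (strcspn(line, "\n") is the length of the prefix before the first
* newline, so writing '\0' there trims the trailing newline in place.)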
*/ 380 | line[strcspn(line, "\n")] = '\0'; 381 | if (access(line, R_OK) < 0) { 382 | perror(line); 383 | return 1; 384 | } 385 | 386 | workers[nworkers].soname_len = strlen(line); 387 | workers[nworkers].soname = line; 388 | workers[nworkers].workerno = nworkers; 389 | workers[nworkers].input_chunks = input_chunks; 390 | workers[nworkers].output_chunks = alloc_shared(sizeof(output_chunk) * MISHEGOS_NUM_CHUNKS); 391 | workers[nworkers].start_gen = gen; 392 | workers[nworkers].start_idx = idx; 393 | worker_start(&workers[nworkers]); 394 | nworkers++; 395 | } 396 | 397 | if (filter_min_success < 0) { 398 | filter_min_success += nworkers + 1; 399 | } 400 | if (filter_max_success < 0) { 401 | filter_max_success += nworkers + 1; 402 | } 403 | fprintf(stderr, "filter min=%d max=%d\n", filter_min_success, filter_max_success); 404 | 405 | uint64_t total = 0; 406 | uint64_t exit_idx = MISHEGOS_NUM_CHUNKS; 407 | while (true) { 408 | mish_atomic_wait_for(&input_chunks[idx].remaining_workers, 0); 409 | 410 | if (!thread_mode) { 411 | bool worker_restarted = false; 412 | for (int i = 0; i < nworkers; i++) { 413 | if (workers[i].sigchld) { 414 | /* undo hack to forcefully wake us up. */ 415 | mish_atomic_fetch_add(&input_chunks[workers[i].start_idx].remaining_workers, 1); 416 | workers[i].sigchld = false; 417 | worker_start(&workers[i]); 418 | worker_restarted = true; 419 | } 420 | } 421 | if (worker_restarted) { 422 | /* if we restarted a worker for current idx, wait for it again. */ 423 | continue; 424 | } 425 | } 426 | 427 | if (gen > 1) { 428 | for (size_t i = 0; i < input_chunks[idx].input_count; i++) { 429 | process(idx, i, input_chunks, nworkers, workers); 430 | } 431 | } 432 | 433 | if (idx == exit_idx) { 434 | break; 435 | } 436 | 437 | // Not yet exiting, so fill another chunk. 438 | if (exit_idx == MISHEGOS_NUM_CHUNKS) { 439 | size_t count = 0; 440 | for (size_t i = 0; i < MISHEGOS_NUM_SLOTS_PER_CHUNK; i++) { 441 | bool filled = mutator(&input_chunks[idx].inputs[i]); 442 | if (filled) { 443 | count++; 444 | } else { // no more mutations 445 | exit_idx = idx; 446 | break; 447 | } 448 | } 449 | 450 | input_chunks[idx].input_count = count; 451 | mish_atomic_store(&input_chunks[idx].remaining_workers, nworkers); 452 | mish_atomic_store(&input_chunks[idx].generation, gen); 453 | mish_atomic_notify(&input_chunks[idx].generation); 454 | } 455 | 456 | idx++; 457 | if (idx == MISHEGOS_NUM_CHUNKS) { 458 | idx = 0; 459 | gen++; 460 | } 461 | } 462 | } 463 | -------------------------------------------------------------------------------- /src/mishegos/mutator.c: -------------------------------------------------------------------------------- 1 | #include "mutator.h" 2 | 3 | #include "mish_common.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | /* An x86 instruction's opcode is no longer than 3 bytes. 11 | */ 12 | typedef struct __attribute__((packed)) { 13 | uint8_t len; 14 | uint8_t op[3]; 15 | } opcode; 16 | static_assert(sizeof(opcode) == 4, "opcode should be 4 bytes"); 17 | 18 | /* An x86 instruction is no longer than 15 bytes, 19 | * but the longest (potentially) structurally valid x86 instruction 20 | * is 26 bytes: 21 | * 4 byte legacy prefix 22 | * 1 byte prefix 23 | * 3 byte opcode 24 | * 1 byte ModR/M 25 | * 1 byte SIB 26 | * 8 byte displacement 27 | * 8 byte immediate 28 | * 29 | * We want to be able to "slide" around inside of a structurally valid 30 | * instruction in order to find errors, so we give ourselves enough space 31 | * here. 
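* (Worked sum, matching the list above: 4 + 1 + 3 + 1 + 1 + 8 + 8 = 26,
* hence the 26-byte insn buffer below.)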
32 | */ 33 | typedef struct { 34 | uint8_t off; 35 | uint8_t len; 36 | uint8_t insn[26]; 37 | } insn_candidate; 38 | 39 | static uint64_t rng_state[4]; 40 | static insn_candidate insn_cand; 41 | 42 | /* An x86 instruction can have up to 4 legacy prefixes, 43 | * in any order, with no more than 1 prefix from each group. 44 | */ 45 | static uint8_t legacy_prefixes[] = { 46 | // Prefix group 1. 47 | 0xf0, // repeat/lock 48 | 0xf3, // rep, repe 49 | 0xf2, // repne 50 | // Prefix group 2. 51 | 0x2e, // segment override, cs 52 | 0x36, // segment override, ss 53 | 0x3e, // segment override, ds 54 | 0x26, // segment override, es 55 | 0x64, // segment override, fs 56 | 0x65, // segment override, gs 57 | // Prefix group 3. 58 | 0x66, // operand size override 59 | // Prefix group 4. 60 | 0x67, // address size override 61 | }; 62 | 63 | /* REX prefixes apply in long (64-bit) mode, and are made up 64 | * of a fixed 4-bit pattern + extension bits for operand size, 65 | * ModR/M and SIB. 66 | * 67 | * Each instruction should only have one REX prefix. 68 | */ 69 | static uint8_t rex_prefixes[] = { 70 | 0b01000000, // ---- 71 | 0b01000001, // ---B 72 | 0b01000010, // --X- 73 | 0b01000011, // --BX 74 | 0b01000100, // -R-- 75 | 0b01000101, // -R-B 76 | 0b01000110, // -RX- 77 | 0b01000111, // -RXB 78 | 0b01001000, // W--- 79 | 0b01001001, // W--B 80 | 0b01001010, // W-X- 81 | 0b01001011, // W-XB 82 | 0b01001100, // WR-- 83 | 0b01001101, // WR-B 84 | 0b01001110, // WRX- 85 | 0b01001111, // WRXB 86 | }; 87 | 88 | #if defined __GLIBC__ && defined __linux__ 89 | 90 | #include 91 | static int mish_getrandom(void *buf, size_t buflen, unsigned int flags) { 92 | return getrandom(buf, buflen, flags); 93 | } 94 | 95 | #elif defined __APPLE__ && defined __MACH__ 96 | 97 | #include 98 | static int mish_getrandom(void *buf, size_t buflen, unsigned int flags) { 99 | return getentropy(buf, buflen); 100 | } 101 | 102 | #else 103 | #error "we only support linux + glibc at the moment; help us out!" 104 | #endif 105 | 106 | #ifndef NO_XOROSHIRO_RNG 107 | static inline uint64_t xoroshiro256_rotl(const uint64_t x, int k) { 108 | return (x << k) | (x >> (64 - k)); 109 | } 110 | 111 | uint64_t xoroshiro256_next(void) { 112 | const uint64_t result_starstar = xoroshiro256_rotl(rng_state[1] * 5, 7) * 9; 113 | 114 | const uint64_t t = rng_state[1] << 17; 115 | 116 | rng_state[2] ^= rng_state[0]; 117 | rng_state[3] ^= rng_state[1]; 118 | rng_state[1] ^= rng_state[2]; 119 | rng_state[0] ^= rng_state[3]; 120 | 121 | rng_state[2] ^= t; 122 | 123 | rng_state[3] = xoroshiro256_rotl(rng_state[3], 45); 124 | 125 | return result_starstar; 126 | } 127 | 128 | static inline uint64_t rand_long() { 129 | return xoroshiro256_next(); 130 | } 131 | 132 | static inline uint8_t rand_byte() { 133 | return (uint8_t)rand_long(); 134 | } 135 | #else 136 | static uint64_t rand_long() { 137 | uint64_t it; 138 | mish_getrandom(&it, sizeof(it), 0); 139 | return it; 140 | } 141 | 142 | static uint8_t rand_byte() { 143 | uint8_t it; 144 | mish_getrandom(&it, sizeof(it), 0); 145 | return it; 146 | } 147 | #endif 148 | 149 | /* Creates a random (potentially invalid) opcode. 150 | * Opcodes are 1-3 bytes long, and come in three formats: 151 | * 1. Single byte (raw opcode) 152 | * 2. Two bytes (escape byte, opcode) 153 | * 3. 
Three bytes (escape byte 1, escape byte 2, opcode) 154 | */ 155 | static void rand_opcode(opcode *opc) { 156 | switch (rand_byte() % 4) { 157 | case 0: { 158 | opc->len = 1; 159 | opc->op[0] = rand_byte(); 160 | break; 161 | } 162 | case 1: { 163 | opc->len = 2; 164 | opc->op[0] = 0x0f; 165 | opc->op[1] = rand_byte(); 166 | break; 167 | } 168 | case 2: { 169 | opc->len = 3; 170 | opc->op[0] = 0x0f; 171 | opc->op[1] = 0x38; 172 | opc->op[2] = rand_byte(); 173 | break; 174 | } 175 | case 3: { 176 | opc->len = 3; 177 | opc->op[0] = 0x0f; 178 | opc->op[1] = 0x3a; 179 | opc->op[2] = rand_byte(); 180 | break; 181 | } 182 | } 183 | } 184 | 185 | static void build_sliding_candidate() { 186 | memset(&insn_cand, 0, sizeof(insn_candidate)); 187 | 188 | /* 4 random legacy prefixes. 189 | * 190 | * Observe that we don't attempt to enforce the "1 prefix from each group" rule. 191 | */ 192 | for (int i = 0; i < 4; ++i) { 193 | insn_cand.insn[i] = legacy_prefixes[rand_byte() % sizeof(legacy_prefixes)]; 194 | } 195 | insn_cand.len += 4; 196 | 197 | /* REX prefix choices: 198 | * 0. Random prefix from rex_prefixes table 199 | * 1. Completely randomized prefix 200 | * 3. No REX prefix 201 | */ 202 | switch (rand_byte() % 3) { 203 | case 0: { 204 | insn_cand.insn[insn_cand.len] = rex_prefixes[rand_byte() % sizeof(rex_prefixes)]; 205 | insn_cand.len++; 206 | break; 207 | } 208 | case 1: { 209 | insn_cand.insn[insn_cand.len] = rand_byte(); 210 | insn_cand.len++; 211 | break; 212 | } 213 | case 2: { 214 | break; 215 | } 216 | } 217 | 218 | /* Opcode, up to 3 bytes. 219 | */ 220 | opcode opc; 221 | rand_opcode(&opc); 222 | memcpy(insn_cand.insn + insn_cand.len, opc.op, opc.len); 223 | insn_cand.len += opc.len; 224 | 225 | /* ModR/M and SIB. For now, just two random bytes. 226 | */ 227 | insn_cand.insn[insn_cand.len++] = rand_byte(); 228 | insn_cand.insn[insn_cand.len++] = rand_byte(); 229 | 230 | /* Displacement. Either none, or 1, 2, 4, or 8 bytes. 231 | */ 232 | if (rand_byte() % 2 == 0) { 233 | uint8_t displen = 1 << (rand_byte() % 4); 234 | uint64_t disp = rand_long(); 235 | memcpy(insn_cand.insn + insn_cand.len, &disp, displen); 236 | insn_cand.len += displen; 237 | } 238 | 239 | /* Immediate. Either none, or 1, 2, 4, or 8 bytes. 240 | */ 241 | if (rand_byte() % 2 == 0) { 242 | uint8_t immlen = 1 << (rand_byte() % 4); 243 | uint64_t imm = rand_long(); 244 | memcpy(insn_cand.insn + insn_cand.len, &imm, immlen); 245 | insn_cand.len += immlen; 246 | } 247 | } 248 | 249 | /* Havoc: generate a random instruction candidate. 250 | */ 251 | static bool havoc_candidate(input_slot *slot) { 252 | slot->len = (rand_byte() % MISHEGOS_INSN_MAXLEN) + 1; 253 | uint64_t lower = rand_long(); 254 | uint64_t upper = rand_long(); 255 | memcpy(slot->raw_insn, &lower, 8); 256 | memcpy(slot->raw_insn + 8, &upper, 7); 257 | 258 | return true; 259 | } 260 | 261 | /* Sliding: generate an instruction candidate with the 262 | * "sliding" approach. 263 | */ 264 | static bool sliding_candidate(input_slot *slot) { 265 | /* An offset of zero into our sliding candidate indicates that we've slid 266 | * all the way through and need to build a new candidate. 267 | */ 268 | if (insn_cand.off == 0) { 269 | build_sliding_candidate(); 270 | } 271 | 272 | /* If our sliding candidate is less than the maximum instruction size, 273 | * then we have nothing to slide. Just copy it try a new candidate on the next 274 | * call. 275 | * 276 | * Otherwise, take the maximum instruction size from our sliding 277 | * candidate, plus the current offset. 
This gives us a bunch of 278 | * high quality instruction "windows". 279 | */ 280 | if (insn_cand.len <= MISHEGOS_INSN_MAXLEN) { 281 | memcpy(slot->raw_insn, insn_cand.insn, insn_cand.len); 282 | slot->len = insn_cand.len; 283 | insn_cand.off = 0; // Shouldn't be necessary, but just to be explicit. 284 | } else { 285 | memcpy(slot->raw_insn, insn_cand.insn + insn_cand.off, MISHEGOS_INSN_MAXLEN); 286 | slot->len = MISHEGOS_INSN_MAXLEN; 287 | insn_cand.off = (insn_cand.off + 1) % (insn_cand.len - MISHEGOS_INSN_MAXLEN + 1); 288 | } 289 | 290 | return true; 291 | } 292 | 293 | /* Structured: generate an instruction candidate with the 294 | * "structured" approach. 295 | */ 296 | static bool structured_candidate(input_slot *slot) { 297 | /* We mirror build_sliding_candidate here, but with the constraint that 298 | * we never overapproximate: we constrain ourselves to trying 299 | * to build something that looks like an instruction of no more 300 | * than 15 bytes. 301 | */ 302 | 303 | uint8_t len = 0; 304 | 305 | /* Up to 4 legacy prefixes. Like sliding, we don't try to enforce group rules. 306 | * Unlike sliding, we allow for the possibility of no legacy prefixes. 307 | * Running max: 4. 308 | */ 309 | uint8_t prefix_count = (rand_byte() % 5); 310 | for (int i = 0; i < prefix_count; ++i) { 311 | slot->raw_insn[i] = legacy_prefixes[rand_byte() % sizeof(legacy_prefixes)]; 312 | } 313 | len = prefix_count; 314 | 315 | /* One or none REX prefixes. 316 | * Always choose a valid REX prefix if we're inserting one. 317 | * Running max: 5. 318 | */ 319 | if (rand_byte() % 2) { 320 | slot->raw_insn[len] = rex_prefixes[rand_byte() % sizeof(rex_prefixes)]; 321 | len++; 322 | } 323 | 324 | /* Random (but structured) opcode. Same as sliding. 325 | * Running max: 8 326 | */ 327 | opcode opc; 328 | rand_opcode(&opc); 329 | memcpy(slot->raw_insn + len, opc.op, opc.len); 330 | len += opc.len; 331 | 332 | /* One or none ModR/M bytes, and one or none SIB bytes. 333 | * Both of these are just 8-bit LUTs, so they can be fully random. 334 | * Running max: 10. 335 | */ 336 | if (rand_byte() % 2) { 337 | slot->raw_insn[len] = rand_byte(); 338 | len++; 339 | } 340 | 341 | if (rand_byte() % 2) { 342 | slot->raw_insn[len] = rand_byte(); 343 | len++; 344 | } 345 | 346 | /* Finally, we have up to 5 bytes to play with for the immediate and 347 | * displacement. Fill some amount of that (maybe not all) with randomness. 348 | */ 349 | uint64_t tail = rand_long(); 350 | uint8_t tail_size = rand_byte() % 6; 351 | memcpy(slot->raw_insn + len, &tail, tail_size); 352 | len += tail_size; 353 | 354 | slot->len = len; 355 | 356 | return true; 357 | } 358 | 359 | /* Dummy: Generates a single NOP for debugging purposes. 360 | */ 361 | static bool dummy_candidate(input_slot *slot) { 362 | slot->raw_insn[0] = 0x90; 363 | slot->len = 1; 364 | 365 | /* NOTE(ww): We only ever want to fill one input slot with our dummy candidate, 366 | * since other parts of mishegos disambiguate worker outputs by keying on the input. 367 | */ 368 | return false; 369 | } 370 | 371 | static void hex2bytes(uint8_t *outbuf, const char *const input, size_t input_len) { 372 | for (size_t i = 0, j = 0; j < input_len / 2; i += 2, ++j) { 373 | outbuf[j] = (input[i] % 32 + 9) % 25 * 16 + (input[i + 1] % 32 + 9) % 25; 374 | } 375 | } 376 | 377 | /* Manual: reads instruction candidates from stdin, one per line. 378 | * Candidates are expected to be in hex format, with no 0x or \x prefix. 
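*
* For instance (illustrative input, not a required test vector): feeding the
* line "4889c8" yields the three candidate bytes 48 89 c8, which the workers
* should render as mov rax, rcx.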
379 | */ 380 | static bool manual_candidate(input_slot *slot) { 381 | char *line = NULL; 382 | size_t size; 383 | if (getline(&line, &size, stdin) < 0) { 384 | /* Input exhausted. 385 | */ 386 | return false; 387 | } 388 | 389 | line[strcspn(line, "\n")] = '\0'; 390 | size_t linelen = strlen(line); 391 | if (linelen == 0 || linelen > MISHEGOS_INSN_MAXLEN * 2) { 392 | return false; 393 | } 394 | 395 | hex2bytes(slot->raw_insn, line, linelen); 396 | slot->len = linelen / 2; 397 | 398 | return true; 399 | } 400 | 401 | mutator_t mutator_create(const char *name) { 402 | mish_getrandom(rng_state, sizeof(rng_state), 0); 403 | 404 | if (name == NULL) // default is sliding candidate 405 | return sliding_candidate; 406 | if (!strcmp(name, "dummy")) 407 | return dummy_candidate; 408 | else if (!strcmp(name, "sliding")) 409 | return sliding_candidate; 410 | else if (!strcmp(name, "structured")) 411 | return structured_candidate; 412 | else if (!strcmp(name, "havoc")) 413 | return havoc_candidate; 414 | else if (!strcmp(name, "manual")) 415 | return manual_candidate; 416 | errx(1, "invalid mutator: %s", name); 417 | } 418 | -------------------------------------------------------------------------------- /src/mishegos/mutator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mish_common.h" 4 | 5 | #include 6 | 7 | /* Generate a single fuzzing candidate and populate the given input slot with it. 8 | * Returns false if the configured mutation mode has been exhausted. 9 | */ 10 | typedef bool (*mutator_t)(input_slot *); 11 | 12 | mutator_t mutator_create(const char *name); 13 | -------------------------------------------------------------------------------- /src/mishmat/mishmat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # mishmat: Generate a matrix visualization of mishegos results in HTML 5 | 6 | require "erb" 7 | require "optparse" 8 | require "json" 9 | 10 | COLOR_TABLE = Hash.new("#808080").update( 11 | "success" => "#00FF00", 12 | "failure" => "#FF0000", 13 | "none" => "#0000FF", 14 | "partial" => "#808080", 15 | "unknown" => "#4B0082" 16 | ) 17 | 18 | HEADER = ERB.new <<~HTML 19 | 20 | 21 | 22 | 23 | 24 | 25 | 44 | 45 | 46 | 47 |

<h1>mishmat (mishegos)</h1> 48 | 49 | <h2>legend</h2> 50 | <table> 51 | <% COLOR_TABLE.each do |color, hex| %> 52 | <tr> 53 | <td style="background-color: <%= hex %>"> 54 | <%= color %> 55 | </td></tr> 56 | <% end %> 57 | </table> 58 | 59 | <table> 60 | 61 | 62 | HTML 63 | 64 | HEADER_ROWS = ERB.new <<~HTML 65 | <tr> 66 | <th> 67 | worker 68 | / 69 | input 70 | </th> 71 | 72 | <% row[:outputs].each do |col| %> 73 | <th> 74 | <%= col[:worker_so] %> 75 | </th> 76 | <% end %> 77 | </tr> 78 | HTML 79 | 80 | ENTRY_ROW = ERB.new <<~HTML 81 | <tr> 82 | <td> 83 | <%= row[:input] %> 84 | </td> 85 | <% row[:outputs].each do |col| %> 86 | <td style="background-color: <%= COLOR_TABLE[col[:status][:name]] %>"> 87 | <%= col[:result] %> (<%= col[:ndecoded] %> / <%= col[:len] %>) 88 | </td> 89 | <% end %> 90 | </tr> 91 | HTML 92 | 93 | FOOTER = <<~HTML 94 | </table></body></html>
95 | 96 | 97 | HTML 98 | 99 | opts = { 100 | limit: Float::INFINITY, 101 | } 102 | 103 | def write_header!(row) 104 | STDOUT.puts HEADER.result(binding) 105 | STDOUT.puts HEADER_ROWS.result(binding) 106 | end 107 | 108 | def write_footer! 109 | STDOUT.puts FOOTER 110 | end 111 | 112 | def write_row!(row) 113 | STDOUT.puts ENTRY_ROW.result(binding) 114 | end 115 | 116 | OptionParser.new do |o| 117 | o.banner = "Usage: mishmat [options]" 118 | 119 | o.on "-l", "--limit LIMIT", Integer, "Entry cap" do |limit| 120 | opts[:limit] = limit 121 | end 122 | end.parse! 123 | 124 | # Special-case the first row: We need to grab the figure out the appropriate number of 125 | # columns and their headers. 126 | row = JSON.parse STDIN.gets, symbolize_names: true 127 | write_header! row 128 | write_row! row 129 | 130 | STDIN.each_line.with_index do |line, i| 131 | break if i >= opts[:limit] 132 | 133 | write_row! JSON.parse(line, symbolize_names: true) 134 | end 135 | 136 | write_footer! 137 | -------------------------------------------------------------------------------- /src/worker/Makefile: -------------------------------------------------------------------------------- 1 | WORKERS = bfd \ 2 | capstone \ 3 | dynamorio \ 4 | fadec \ 5 | xed \ 6 | zydis \ 7 | bddisasm \ 8 | iced \ 9 | yaxpeax-x86 \ 10 | ghidra \ 11 | llvm 12 | 13 | .PHONY: all 14 | all: $(WORKERS) 15 | 16 | .PHONY: $(WORKERS) 17 | $(WORKERS): 18 | $(MAKE) \ 19 | CFLAGS="$(CFLAGS) -fPIC" \ 20 | LDFLAGS="-shared -Wl,-z,defs" \ 21 | -C $@/ 22 | 23 | .PHONY: clean 24 | clean: 25 | for dir in $(WORKERS); do \ 26 | $(MAKE) -C $$dir/ clean; \ 27 | done 28 | -------------------------------------------------------------------------------- /src/worker/bddisasm/Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -Ibddisasm/inc 2 | override LDFLAGS := $(LDFLAGS) -Lbddisasm/build 3 | override LDLIBS := $(LDLIBS) -lbddisasm 4 | 5 | .PHONY: all 6 | all: bddisasm.so 7 | 8 | bddisasm/build/bddisasm.a: 9 | cmake -B bddisasm/build -S bddisasm -DCMAKE_BUILD_TYPE=Release -DBDD_INCLUDE_TOOL=OFF -DBDD_INCLUDE_ISAGENERATOR=OFF 10 | cmake --build bddisasm/build --target bddisasm --parallel 11 | 12 | bddisasm.so: bddisasm/build/bddisasm.a bddisasm.o 13 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) bddisasm.o $(LDLIBS) -o $@ 14 | 15 | .PHONY: clean 16 | clean: 17 | rm -rf bddisasm/build 18 | rm -rf *.o *.so 19 | -------------------------------------------------------------------------------- /src/worker/bddisasm/bddisasm.c: -------------------------------------------------------------------------------- 1 | #include "bddisasm/inc/bddisasm.h" 2 | #include "../worker.h" 3 | 4 | char *worker_name = "bddisasm"; 5 | 6 | static const char *bddisasm_strerror(NDSTATUS ndstatus) { 7 | switch (ndstatus) { 8 | case ND_STATUS_BUFFER_TOO_SMALL: { 9 | return "The provided input buffer is too small."; 10 | } 11 | case ND_STATUS_INVALID_ENCODING: { 12 | return "Invalid encoding/instruction."; 13 | } 14 | case ND_STATUS_INSTRUCTION_TOO_LONG: { 15 | return "Instruction exceeds the maximum 15 bytes."; 16 | } 17 | case ND_STATUS_INVALID_PREFIX_SEQUENCE: { 18 | return "Invalid prefix sequence is present."; 19 | } 20 | case ND_STATUS_INVALID_REGISTER_IN_INSTRUCTION: { 21 | return "The instruction uses an invalid register."; 22 | } 23 | case ND_STATUS_XOP_WITH_PREFIX: { 24 | return "XOP is present, but also a legacy prefix."; 25 | } 26 | case ND_STATUS_VEX_WITH_PREFIX: { 27 | return "VEX is present, but also a legacy 
prefix."; 28 | } 29 | case ND_STATUS_EVEX_WITH_PREFIX: { 30 | return "EVEX is present, but also a legacy prefix."; 31 | } 32 | case ND_STATUS_INVALID_ENCODING_IN_MODE: { 33 | return "Invalid encoding/instruction."; 34 | } 35 | case ND_STATUS_BAD_LOCK_PREFIX: { 36 | return "Invalid usage of LOCK."; 37 | } 38 | case ND_STATUS_CS_LOAD: { 39 | return "An attempt to load the CS register."; 40 | } 41 | case ND_STATUS_66_NOT_ACCEPTED: { 42 | return "0x66 prefix is not accepted."; 43 | } 44 | case ND_STATUS_16_BIT_ADDRESSING_NOT_SUPPORTED: { 45 | return "16 bit addressing mode not supported."; 46 | } 47 | case ND_STATUS_RIP_REL_ADDRESSING_NOT_SUPPORTED: { 48 | return "RIP-relative addressing not supported."; 49 | } 50 | case ND_STATUS_VSIB_WITHOUT_SIB: { 51 | return "Instruction uses VSIB, but SIB is not present."; 52 | } 53 | case ND_STATUS_INVALID_VSIB_REGS: { 54 | return "VSIB addressing, same vector reg used more than once."; 55 | } 56 | case ND_STATUS_VEX_VVVV_MUST_BE_ZERO: { 57 | return "VEX.VVVV field must be zero."; 58 | } 59 | case ND_STATUS_MASK_NOT_SUPPORTED: { 60 | return "Masking is not supported."; 61 | } 62 | case ND_STATUS_MASK_REQUIRED: { 63 | return "Masking is mandatory."; 64 | } 65 | case ND_STATUS_ER_SAE_NOT_SUPPORTED: { 66 | return "Embedded rounding/SAE not supported."; 67 | } 68 | case ND_STATUS_ZEROING_NOT_SUPPORTED: { 69 | return "Zeroing not supported."; 70 | } 71 | case ND_STATUS_ZEROING_ON_MEMORY: { 72 | return "Zeroing on memory."; 73 | } 74 | case ND_STATUS_ZEROING_NO_MASK: { 75 | return "Zeroing without masking."; 76 | } 77 | case ND_STATUS_BROADCAST_NOT_SUPPORTED: { 78 | return "Broadcast not supported."; 79 | } 80 | case ND_STATUS_INVALID_PARAMETER: { 81 | return "An invalid parameter was provided."; 82 | } 83 | case ND_STATUS_INVALID_INSTRUX: { 84 | return "The INSTRUX contains unexpected values."; 85 | } 86 | case ND_STATUS_BUFFER_OVERFLOW: { 87 | return "Not enough space is available to format textual disasm."; 88 | } 89 | case ND_STATUS_INTERNAL_ERROR: { 90 | return "Internal error occurred."; 91 | } 92 | default: { 93 | return "unknown"; 94 | } 95 | } 96 | } 97 | 98 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 99 | _unused(bddisasm_strerror); 100 | 101 | INSTRUX instruction; 102 | NDSTATUS ndstatus; 103 | 104 | ndstatus = NdDecodeEx(&instruction, raw_insn, length, ND_CODE_64, ND_DATA_64); 105 | if (!ND_SUCCESS(ndstatus)) { 106 | DLOG("bddisasm decoding failed: %s", bddisasm_strerror(ndstatus)); 107 | result->status = S_FAILURE; 108 | return; 109 | } 110 | 111 | ndstatus = NdToText(&instruction, 0, sizeof(result->result), result->result); 112 | if (!ND_SUCCESS(ndstatus)) { 113 | DLOG("bddisasm formatting failed: %s", bddisasm_strerror(ndstatus)); 114 | result->status = S_FAILURE; 115 | return; 116 | } 117 | 118 | result->status = S_SUCCESS; 119 | result->len = strlen(result->result); 120 | result->ndecoded = instruction.Length; 121 | } 122 | -------------------------------------------------------------------------------- /src/worker/bfd/Makefile: -------------------------------------------------------------------------------- 1 | UNAME := $(shell uname) 2 | 3 | ifeq ($(UNAME), Darwin) 4 | # I can't even begin to describe how annoying this is: 5 | # 1. libbfd's headers error out if some defines provided by a config.h are 6 | # missing. But config.h is also missing, because libbfd is considered 7 | # an "internal" library by the GNU binutils maintainers. 8 | # So we stub those in. 9 | # 2. 
By default, binutils isn't packaged with libiberty on macOS. 10 | # It has to be built manually with `--enable-install-libiberty`, which 11 | # then needs to be manually linked in. The code below assumes the manual 12 | # build was done with Homebrew. 13 | SNEAKY_MAKE_BFD_INCLUDES_WORK_DEFINES := -DPACKAGE=nice-try-bfd-maintainers -DPACKAGE_VERSION=1 14 | override CPPFLAGS := $(CPPFLAGS) \ 15 | $(SNEAKY_MAKE_BFD_INCLUDES_WORK_DEFINES) \ 16 | -I/usr/local/opt/binutils/include 17 | override LDFLAGS := $(LDFLAGS) -L/usr/local/opt/binutils/lib -liberty -lz 18 | endif 19 | 20 | override LDLIBS := $(LDLIBS) -lbfd -lopcodes 21 | 22 | .PHONY: all 23 | all: bfd.so 24 | 25 | bfd.so: bfd.o 26 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ 27 | 28 | .PHONY: clean 29 | clean: 30 | rm -rf *.o *.so 31 | -------------------------------------------------------------------------------- /src/worker/bfd/bfd.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../worker.h" 5 | 6 | /* BFD (libopcodes) adapter for mishegos. 7 | * 8 | * Some notes: 9 | * 1. libopcodes is (almost) completely undocumented. As such, a lot of the calls 10 | * here are educated guesses + me reading the header file + reading the objdump 11 | * source. 12 | * 2. This code is terrible. My original idea was to use a memfd to slurp the data 13 | * coming from the fprintf callback, but didn't work for reasons. 14 | * So, I switched it to a big old buffer + (v)snprintf, and that 15 | * seems to mostly work. libopcodes isn't nice enough to print newlines 16 | * after each instruction for us, so I do that manually. 17 | */ 18 | 19 | static char disasm_buf[MISHEGOS_DEC_MAXLEN]; 20 | static size_t disasm_off; 21 | 22 | static disassembler_ftype disasm; 23 | static struct disassemble_info disasm_info; 24 | 25 | char *worker_name = "bfd"; 26 | 27 | static int dis_fprintf(void *_stream, const char *fmt, ...) { 28 | assert(disasm_off <= MISHEGOS_DEC_MAXLEN && "disassembly buffer overrun?"); 29 | 30 | size_t remaining_size = MISHEGOS_DEC_MAXLEN - disasm_off; 31 | assert(remaining_size > 0); 32 | 33 | va_list arg; 34 | va_start(arg, fmt); 35 | size_t bytes_written = vsnprintf(disasm_buf + disasm_off, remaining_size, fmt, arg); 36 | disasm_off += bytes_written; 37 | va_end(arg); 38 | return 0; 39 | } 40 | 41 | static void init_dis() { 42 | disasm = disassembler(bfd_arch_i386, false, bfd_mach_x86_64, NULL); 43 | if (disasm == NULL) { 44 | errx(1, "disassembler creation failed"); 45 | } 46 | } 47 | 48 | void worker_ctor() { 49 | init_dis(); 50 | } 51 | 52 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 53 | /* dis_fprintf doesn't actually use the stream argument, it just takes 54 | * disasm_buf from the module scope. 55 | * 56 | * I'm pretty sure most of this setup could go in init_dis, but it's here 57 | * because I ran into problems with the original memfd implementation. 58 | * Worth re-trying later. 
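*
* (If that re-try happens, one plausible -- untested -- shape is to hoist
* the init_disassemble_info/disassemble_init_for_target calls into init_dis
* and only repoint disasm_info.buffer and disasm_info.buffer_length here
* per call.)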
59 | */ 60 | init_disassemble_info(&disasm_info, NULL, dis_fprintf); 61 | disasm_info.disassembler_options = "intel-mnemonic"; 62 | disasm_info.arch = bfd_arch_i386; 63 | disasm_info.mach = bfd_mach_x86_64; 64 | disasm_info.read_memory_func = buffer_read_memory; 65 | disasm_info.buffer = raw_insn; 66 | disasm_info.buffer_vma = 0; 67 | disasm_info.buffer_length = length; 68 | disassemble_init_for_target(&disasm_info); 69 | 70 | memset(disasm_buf, 0, MISHEGOS_DEC_MAXLEN); 71 | 72 | disasm_off = 0; 73 | int pc = disasm(0, &disasm_info); /* disassembler_ftype returns int: negative on error, so don't widen it into a size_t */ 74 | 75 | /* Make sure each instruction is on its own line in the disassembly buffer. 76 | */ 77 | size_t nl = snprintf(disasm_buf + disasm_off, MISHEGOS_DEC_MAXLEN - disasm_off, "\n"); 78 | assert(nl == 1 && "should have written exactly one byte"); 79 | _unused(nl); 80 | disasm_off++; 81 | 82 | if (pc <= 0 || strstr(disasm_buf, "(bad)") != NULL) { 83 | result->status = S_FAILURE; 84 | } else { 85 | result->status = S_SUCCESS; 86 | } 87 | 88 | memcpy(result->result, disasm_buf, disasm_off); 89 | result->len = disasm_off; 90 | result->ndecoded = pc > 0 ? pc : 0; 91 | } 92 | -------------------------------------------------------------------------------- /src/worker/capstone/Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -Icapstone/include 2 | override LDFLAGS := $(LDFLAGS) -L./capstone 3 | override LDLIBS := $(LDLIBS) -lcapstone 4 | 5 | .PHONY: all 6 | all: capstone.so 7 | 8 | # This is some stupidity thanks to Capstone's misbehaving Makefile. 9 | MAKEOVERRIDES := $(filter-out CFLAGS=%,$(MAKEOVERRIDES)) 10 | MAKEOVERRIDES := $(filter-out CPPFLAGS=%,$(MAKEOVERRIDES)) 11 | MAKEOVERRIDES := $(filter-out LDFLAGS=%,$(MAKEOVERRIDES)) 12 | MAKEOVERRIDES := $(filter-out LDLIBS=%,$(MAKEOVERRIDES)) 13 | capstone/libcapstone.so.5: 14 | $(MAKE) -C capstone/ \ 15 | CAPSTONE_ARCHS="x86" CAPSTONE_X86_ATT_DISABLE=yes CAPSTONE_BUILD_CORE_ONLY=yes V=1 16 | 17 | capstone.so: capstone/libcapstone.so.5 capstone.o 18 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) capstone.o $(LDLIBS) -o $@ 19 | 20 | .PHONY: clean 21 | clean: 22 | rm -rf *.o *.so 23 | $(MAKE) -C capstone/ clean 24 | -------------------------------------------------------------------------------- /src/worker/capstone/capstone.c: -------------------------------------------------------------------------------- 1 | #include <capstone/capstone.h> 2 | 3 | #include "../worker.h" 4 | 5 | static csh cs_hnd; 6 | 7 | char *worker_name = "capstone"; 8 | 9 | void worker_ctor() { 10 | if (cs_open(CS_ARCH_X86, CS_MODE_64, &cs_hnd) != CS_ERR_OK) { 11 | errx(1, "cs_open"); 12 | } 13 | } 14 | 15 | void worker_dtor() { 16 | cs_close(&cs_hnd); 17 | } 18 | 19 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 20 | cs_insn *insn; 21 | size_t count = cs_disasm(cs_hnd, raw_insn, length, 0, 1, &insn); 22 | if (count > 0) { 23 | result->status = S_SUCCESS; 24 | result->len = 25 | snprintf(result->result, MISHEGOS_DEC_MAXLEN, "%s %s\n", insn[0].mnemonic, insn[0].op_str); 26 | result->ndecoded = insn[0].size; 27 | 28 | cs_free(insn, count); 29 | } else { 30 | result->status = S_FAILURE; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/worker/dynamorio/.gitignore: -------------------------------------------------------------------------------- 1 | obj/ 2 | -------------------------------------------------------------------------------- /src/worker/dynamorio/Makefile: 
-------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -DLINUX -DX86_64 -DDR_FAST_IR -Iobj/include 2 | override LDFLAGS := $(LDFLAGS) -Lobj/lib64 3 | override LDLIBS := $(LDLIBS) -ldrdecode -ldrlibc 4 | 5 | DYNAMORIO_BUILD_OPTS := -DBUILD_DRSTATS=NO \ 6 | -DBUILD_SAMPLES=NO \ 7 | -DBUILD_EXT=NO \ 8 | -DBUILD_CLIENTS=NO 9 | 10 | .PHONY: all 11 | all: dynamorio.so 12 | 13 | dynamorio.so: dynamorio.o 14 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) dynamorio.o $(LDLIBS) -o $@ 15 | 16 | obj/lib64/libdrdecode.a: 17 | mkdir -p obj && \ 18 | cd obj && \ 19 | env -u CFLAGS -u CXXFLAGS \ 20 | cmake $(DYNAMORIO_BUILD_OPTS) ../dynamorio && \ 21 | cmake --build . -- -j4 22 | 23 | dynamorio.o: dynamorio.c obj/lib64/libdrdecode.a 24 | 25 | .PHONY: clean 26 | clean: 27 | rm -rf *.so 28 | rm -rf *.o 29 | rm -rf obj 30 | -------------------------------------------------------------------------------- /src/worker/dynamorio/dynamorio.c: -------------------------------------------------------------------------------- 1 | #include <dr_api.h> 2 | 3 | #include "../worker.h" 4 | 5 | char *worker_name = "dynamorio"; 6 | 7 | void worker_ctor() { 8 | disassemble_set_syntax(DR_DISASM_INTEL); 9 | } 10 | 11 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 12 | instr_t instr; 13 | instr_init(GLOBAL_DCONTEXT, &instr); 14 | uint8_t *next_pc = decode(GLOBAL_DCONTEXT, raw_insn, &instr); 15 | if (next_pc == NULL) { 16 | DLOG("dr decode failed"); 17 | result->status = S_FAILURE; 18 | return; 19 | } 20 | 21 | size_t len = 22 | instr_disassemble_to_buffer(GLOBAL_DCONTEXT, &instr, result->result, MISHEGOS_DEC_MAXLEN); 23 | instr_free(GLOBAL_DCONTEXT, &instr); 24 | result->status = S_SUCCESS; 25 | result->len = len; 26 | result->ndecoded = next_pc - raw_insn; 27 | } 28 | -------------------------------------------------------------------------------- /src/worker/fadec/Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -Ifadec -Ifadec/build 2 | override LDFLAGS := $(LDFLAGS) -Lfadec/build 3 | override LDLIBS := $(LDLIBS) -lfadec 4 | 5 | .PHONY: all 6 | all: fadec.so 7 | 8 | fadec/build/libfadec.a: 9 | mkdir -p fadec/build && cd fadec && \ 10 | env -u CPPFLAGS -u LDFLAGS -u LDLIBS meson build -Dbuildtype=release -Darchmode=only64 && \ 11 | ninja -C build -v 12 | 13 | fadec.so: fadec.c fadec/build/libfadec.a 14 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@ 15 | 16 | .PHONY: clean 17 | clean: 18 | rm -rf fadec/build 19 | rm -rf *.o *.so 20 | 21 | -------------------------------------------------------------------------------- /src/worker/fadec/fadec.c: -------------------------------------------------------------------------------- 1 | #include <fadec.h> 2 | 3 | #include "../worker.h" 4 | 5 | char *worker_name = "fadec"; 6 | 7 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 8 | FdInstr inst; 9 | int r = fd_decode(raw_insn, length, 64, 0, &inst); 10 | if (r > 0) { 11 | result->status = S_SUCCESS; 12 | fd_format(&inst, result->result, MISHEGOS_DEC_MAXLEN); 13 | result->len = strlen(result->result); 14 | result->ndecoded = FD_SIZE(&inst); 15 | } else { 16 | result->status = r == FD_ERR_PARTIAL ? 
S_PARTIAL : S_FAILURE; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/worker/ghidra/.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /ghidra.so 3 | -------------------------------------------------------------------------------- /src/worker/ghidra/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | project(mishegos_ghidra) 4 | 5 | # Pull in the CMake sleigh build support 6 | set(sleigh_BUILD_SLEIGHSPECS ON CACHE BOOL "" FORCE) 7 | add_subdirectory(sleigh-cmake EXCLUDE_FROM_ALL) 8 | 9 | add_library(mishegos_ghidra SHARED ghidra.cc sleighMishegos.cc) 10 | set_target_properties(mishegos_ghidra 11 | PROPERTIES 12 | OUTPUT_NAME ghidra 13 | PREFIX "" 14 | ) 15 | target_compile_features(mishegos_ghidra PUBLIC cxx_std_11) 16 | 17 | target_link_libraries(mishegos_ghidra PRIVATE sleigh::sla sleigh::decomp) 18 | 19 | # TODO: Not sure how to get this into the project for only linking against the 20 | # mishegos shared library 21 | get_filename_component(mishegos_include_dir "../../include" 22 | ABSOLUTE BASE_DIR ${PROJECT_SOURCE_DIR} 23 | ) 24 | target_include_directories(mishegos_ghidra 25 | PRIVATE "$<BUILD_INTERFACE:${mishegos_include_dir}>" 26 | ) 27 | 28 | add_dependencies(mishegos_ghidra 29 | sleigh_spec_x86-64 30 | ) 31 | -------------------------------------------------------------------------------- /src/worker/ghidra/Makefile: -------------------------------------------------------------------------------- 1 | override LDFLAGS := 2 | override CXXFLAGS := 3 | 4 | .PHONY: all 5 | all: ghidra.so 6 | 7 | ghidra.so: 8 | cmake -B build -S . \ 9 | -DCMAKE_BUILD_TYPE=Release \ 10 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ 11 | -Dsleigh_RELEASE_TYPE=HEAD \ 12 | "-DFETCHCONTENT_SOURCE_DIR_GHIDRASOURCE=./ghidra" && \ 13 | cmake --build build --verbose && \ 14 | cp build/ghidra.$(SO_SUFFIX) ./ghidra.so 15 | 16 | # Uncomment to build with address sanitizer. Remember to remove 'build' 17 | # directory if switching between the two 18 | #.PHONY: ghidra.so 19 | #.ONESHELL: 20 | #ghidra.so: 21 | # cmake -B build -S . 
\ 22 | # -DCMAKE_BUILD_TYPE=Debug \ 23 | # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ 24 | # -Dsleigh_RELEASE_TYPE=HEAD \ 25 | # "-DCMAKE_C_FLAGS=-Wall -Wpedantic -Wextra -fsanitize=address" \ 26 | # "-DCMAKE_CXX_FLAGS=-Wall -Wpedantic -Wextra -Wconversion -Wsign-conversion -Wcast-qual -Wshadow -Wformat=2 -Wundef -fsanitize=address" \ 27 | # "-DCMAKE_MODULE_LINKER_FLAGS=-fsanitize=address" \ 28 | # "-DFETCHCONTENT_SOURCE_DIR_GHIDRASOURCE=./ghidra" 29 | # cmake --build build -j --verbose 30 | # cp build/ghidra.so ./ghidra.so 31 | 32 | .PHONY: clean 33 | clean: 34 | rm -rf build 35 | rm -f ./ghidra.so 36 | -------------------------------------------------------------------------------- /src/worker/ghidra/ghidra.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is based loosely on Ghidra's sleighexample.cc file for 3 | * initializing and loading bytes for sleigh to disassemble 4 | * https://github.com/NationalSecurityAgency/ghidra/blob/47f76c78d6b7d5c56a9256b0666620863805ff30/Ghidra/Features/Decompiler/src/decompile/cpp/sleighexample.cc 5 | */ 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "sleighMishegos.hh" 16 | #include "../worker.h" 17 | #include "mish_common.h" 18 | 19 | using namespace ghidra; 20 | 21 | extern "C" { 22 | const char *worker_name = (const char *)"ghidra"; 23 | 24 | void worker_ctor(); 25 | 26 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length); 27 | } 28 | 29 | // This is a tiny LoadImage class which feeds the executable bytes to the translator 30 | class MyLoadImage : public LoadImage { 31 | uintb baseaddr; 32 | int4 length; 33 | uint1 *data; 34 | 35 | public: 36 | // "nofile" doesn't have any special meaning. Just doing what was done in 37 | // sleighExample.cc 38 | MyLoadImage(uintb ad, uint1 *ptr, int4 sz) 39 | : LoadImage("nofile"), baseaddr{ad}, length{sz}, data{ptr} { 40 | } 41 | virtual void loadFill(uint1 *ptr, int4 size, const Address &addr) override; 42 | string getArchType(void) const override { 43 | return "x86:LE:64:default"; 44 | } 45 | virtual void adjustVma(long) override { 46 | } 47 | virtual void setData(uint1 *ptr, int4 sz) { 48 | this->data = ptr; 49 | this->length = sz; 50 | } 51 | }; 52 | 53 | // This is the only important method for the LoadImage. It returns bytes from the static array 54 | // depending on the address range requested 55 | void MyLoadImage::loadFill(uint1 *ptr, int4 size, const Address &addr) { 56 | uintb start = addr.getOffset(); 57 | uintb max = baseaddr + (length - 1); 58 | for (int4 i = 0; i < size; ++i) { // For every byte requested 59 | uintb curoff = start + i; // Calculate offset of byte 60 | if ((curoff < baseaddr) || (curoff > max)) { // If byte does not fall in window 61 | ptr[i] = 0; // return 0 62 | continue; 63 | } 64 | uintb diff = curoff - baseaddr; 65 | ptr[i] = data[(int4)diff]; // Otherwise return data from our window 66 | } 67 | } 68 | 69 | // Here is a simple class for emitting assembly. In this case, we send the strings straight 70 | // to the result.
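// (printAssembly() in sleighMishegos.cc invokes dump() exactly once per decoded
// instruction, so a single snprintf into result->result is all we need here.)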
71 | class AssemblyMishegos : public AssemblyEmit { 72 | decode_result *result; 73 | 74 | public: 75 | AssemblyMishegos(decode_result *dr) : result(dr){}; 76 | virtual void dump(const Address &, const string &mnem, const string &body) { 77 | result->status = S_SUCCESS; 78 | result->len = 79 | snprintf(result->result, MISHEGOS_DEC_MAXLEN, "%s %s\n", mnem.c_str(), body.c_str()); 80 | } 81 | }; 82 | 83 | static const uintb START_ADDRESS = 0x0; 84 | 85 | // Storing data files 86 | DocumentStorage &g_docstorage() { 87 | static DocumentStorage docstorage; 88 | return docstorage; 89 | } 90 | 91 | // Context for disassembly 92 | ContextInternal &g_context() { 93 | static ContextInternal context; 94 | return context; 95 | } 96 | 97 | // Loader for reading instruction bytes 98 | MyLoadImage &g_loader() { 99 | static MyLoadImage loader(START_ADDRESS, nullptr, 0); 100 | return loader; 101 | } 102 | 103 | // Translator for doing disassembly 104 | SleighMishegos &g_trans() { 105 | static SleighMishegos trans(&g_loader(), &g_context()); 106 | return trans; 107 | } 108 | 109 | void worker_ctor() { 110 | AttributeId::initialize(); 111 | ElementId::initialize(); 112 | 113 | SleighMishegos &trans = g_trans(); 114 | 115 | // Set up the assembler/pcode-translator 116 | string sleighfilename = "src/worker/ghidra/build/sleigh-cmake/specfiles/Ghidra/Processors/x86/" 117 | "data/languages/x86-64.sla"; 118 | // Need this for correctly setting up the 64 bit x86 mode 119 | string pspecfilename = "src/worker/ghidra/build/sleigh-cmake/specfiles/Ghidra/Processors/x86/" 120 | "data/languages/x86-64.pspec"; 121 | 122 | // Read sleigh and spec file into DOM 123 | DocumentStorage &docstorage = g_docstorage(); 124 | Element *sleighroot = docstorage.openDocument(sleighfilename)->getRoot(); 125 | docstorage.registerTag(sleighroot); 126 | Element *specroot = docstorage.openDocument(pspecfilename)->getRoot(); 127 | docstorage.registerTag(specroot); 128 | 129 | trans.initialize(docstorage); // Initialize the translator 130 | 131 | // Now that context symbol names are loaded by the translator 132 | // we can set the default context 133 | // This imitates what is done in 134 | // void Architecture::parseProcessorConfig(DocumentStorage &store) 135 | const Element *el = docstorage.getTag("processor_spec"); 136 | if (el == (const Element *)0) 137 | throw LowlevelError("No processor configuration tag found"); 138 | XmlDecode decoder(&trans, el); 139 | 140 | uint4 elemId = decoder.openElement(ELEM_PROCESSOR_SPEC); 141 | for (;;) { 142 | uint4 subId = decoder.peekElement(); 143 | if (subId == 0) 144 | break; 145 | else if (subId == ELEM_CONTEXT_DATA) { 146 | g_context().decodeFromSpec(decoder); 147 | break; 148 | } else { 149 | decoder.openElement(); 150 | decoder.closeElementSkipping(subId); 151 | } 152 | } 153 | decoder.closeElement(elemId); 154 | 155 | // Single instruction disasm. Prevent instructions from messing up future 156 | // instruction disassembly 157 | trans.allowContextSet(false); 158 | } 159 | 160 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 161 | MyLoadImage &loader = g_loader(); 162 | const SleighMishegos &trans = g_trans(); 163 | 164 | loader.setData(raw_insn, length); 165 | 166 | // Set up the disassembly dumper 167 | AssemblyMishegos assememit(result); 168 | 169 | // Starting disassembly address 170 | Address addr(trans.getDefaultCodeSpace(), START_ADDRESS); 171 | 172 | try { 173 | result->ndecoded = trans.printAssembly(assememit, addr); 174 | } catch (...) 
{ 175 | // Uncomment for debugging exception info 176 | // std::exception_ptr p = std::current_exception(); 177 | // std::cout << (p ? p.__cxa_exception_type()->name() : "null") << std::endl; 178 | 179 | result->status = S_FAILURE; 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/worker/ghidra/sleighMishegos.cc: -------------------------------------------------------------------------------- 1 | /* ### 2 | * IP: GHIDRA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * This file was copied from upstream 17 | * https://github.com/NationalSecurityAgency/ghidra/blob/2536099c0eb2683ee0e416a127f8a8795f8de853/Ghidra/Features/Decompiler/src/decompile/cpp/sleigh.cc 18 | * 19 | * Modified by Eric Kilmer at Trail of Bits 2022 20 | * Modified to better support mishegos single-shot disassembly by not using the 21 | * disassembly cache. This allows us to get new disassembly results at the same 22 | * address without having to reinitialize everything. 23 | * 24 | * This file has been modified in a way to minimize the diff from upstream. 25 | * There is dead code and other code artifacts that probably wouldn't be 26 | * written in the same way had this functionality been written fresh. 27 | */ 28 | #include "sleighMishegos.hh" 29 | #include 30 | 31 | namespace ghidra { 32 | 33 | PcodeCacher::PcodeCacher(void) 34 | 35 | { 36 | // We aim to allocate this array only once 37 | uint4 maxsize = 600; 38 | poolstart = new VarnodeData[ maxsize ]; 39 | endpool = poolstart + maxsize; 40 | curpool = poolstart; 41 | } 42 | 43 | PcodeCacher::~PcodeCacher(void) 44 | 45 | { 46 | delete [] poolstart; 47 | } 48 | 49 | /// Expand the VarnodeData pool so that \e size more elements fit, and return 50 | /// a pointer to first available element. 51 | /// \param size is the number of elements to expand the pool by 52 | /// \return the first available VarnodeData 53 | VarnodeData *PcodeCacher::expandPool(uint4 size) 54 | 55 | { 56 | uint4 curmax = endpool - poolstart; 57 | uint4 cursize = curpool - poolstart; 58 | if (cursize + size <= curmax) 59 | return curpool; // No expansion necessary 60 | uint4 increase = (cursize + size) - curmax; 61 | if (increase < 100) // Increase by at least 100 62 | increase = 100; 63 | 64 | uint4 newsize = curmax + increase; 65 | 66 | VarnodeData *newpool = new VarnodeData[newsize]; 67 | for(uint4 i=0;i<cursize;++i) 68 | newpool[i] = poolstart[i]; // Copy old data into the new pool 69 | 70 | // Update references into the old pool to point at the new pool 71 | vector<PcodeData>::iterator piter; 72 | for(piter=issued.begin();piter!=issued.end();++piter) { 73 | VarnodeData *outvar = (*piter).outvar; 74 | if (outvar != (VarnodeData *)0) 75 | (*piter).outvar = newpool + (outvar - poolstart); 76 | VarnodeData *invar = (*piter).invar; 77 | if (invar != (VarnodeData *)0) 78 | (*piter).invar = newpool + (invar - poolstart); 79 | } 80 | 81 | // Update references to labels as well 82 | list<RelativeRecord>::iterator iter; 83 | for(iter=label_refs.begin();iter!=label_refs.end();++iter) { 84 | VarnodeData *ref = (*iter).dataptr; 85 | (*iter).dataptr = newpool + (ref - poolstart); 86 | } 87 | 88 | delete [] poolstart; // Free up old pool 89 | poolstart = newpool; 90 | curpool = newpool + (cursize + size); 91 | endpool = newpool + newsize; 92 | return newpool + cursize; 93 | } 94 | 95 | /// Store off a reference to the Varnode and the absolute index of the next 96 | /// instruction. The Varnode must be an operand of the current instruction.
97 | /// \param ptr is the Varnode reference 98 | void PcodeCacher::addLabelRef(VarnodeData *ptr) 99 | 100 | { 101 | label_refs.emplace_back(); 102 | label_refs.back().dataptr = ptr; 103 | label_refs.back().calling_index = issued.size(); 104 | } 105 | 106 | /// The label has an id that is referred to by Varnodes holding 107 | /// intra-instruction branch targets, prior to converting 108 | /// them to a \e relative \e branch offset. The label is associated with 109 | /// the absolute index of the next PcodeData object to be issued, 110 | /// facilitating this conversion. 111 | /// \param id is the given id of the label 112 | void PcodeCacher::addLabel(uint4 id) 113 | 114 | { 115 | while(labels.size() <= id) 116 | labels.push_back(0xbadbeef); 117 | labels[ id ] = issued.size(); 118 | } 119 | 120 | void PcodeCacher::clear(void) 121 | 122 | { 123 | curpool = poolstart; 124 | issued.clear(); 125 | label_refs.clear(); 126 | labels.clear(); 127 | } 128 | 129 | /// Assuming all the PcodeData has been generated for an 130 | /// instruction, go resolve any relative offsets and back 131 | /// patch their value(s) into the PcodeData 132 | void PcodeCacher::resolveRelatives(void) 133 | 134 | { 135 | list<RelativeRecord>::const_iterator iter; 136 | for(iter=label_refs.begin();iter!=label_refs.end();++iter) { 137 | VarnodeData *ptr = (*iter).dataptr; 138 | uint4 id = ptr->offset; 139 | if ((id >= labels.size())||(labels[id] == 0xbadbeef)) 140 | throw LowlevelError("Reference to non-existent sleigh label"); 141 | // Calculate the relative index given the two absolute indices 142 | uintb res = labels[id] - (*iter).calling_index; 143 | res &= calc_mask( ptr->size ); 144 | ptr->offset = res; 145 | } 146 | } 147 | 148 | /// Each p-code operation is presented to the emitter via its dump() method. 149 | /// \param addr is the Address associated with the p-code operation 150 | /// \param emt is the emitter 151 | void PcodeCacher::emit(const Address &addr,PcodeEmit *emt) const 152 | 153 | { 154 | vector<PcodeData>::const_iterator iter; 155 | 156 | for(iter=issued.begin();iter!=issued.end();++iter) 157 | emt->dump(addr,(*iter).opc,(*iter).outvar,(*iter).invar,(*iter).isize); 158 | } 159 | 160 | /// \brief Generate a concrete VarnodeData object from the given template (VarnodeTpl) 161 | /// 162 | /// \param vntpl is the template to reference 163 | /// \param vn is the object to fill in with concrete values 164 | void SleighBuilder::generateLocation(const VarnodeTpl *vntpl,VarnodeData &vn) 165 | 166 | { 167 | vn.space = vntpl->getSpace().fixSpace(*walker); 168 | vn.size = vntpl->getSize().fix(*walker); 169 | if (vn.space == const_space) 170 | vn.offset = vntpl->getOffset().fix(*walker) & calc_mask(vn.size); 171 | else if (vn.space == uniq_space) { 172 | vn.offset = vntpl->getOffset().fix(*walker); 173 | vn.offset |= uniqueoffset; 174 | } 175 | else 176 | vn.offset = vn.space->wrapOffset(vntpl->getOffset().fix(*walker)); 177 | } 178 | 179 | /// \brief Generate a pointer VarnodeData from a dynamic template (VarnodeTpl) 180 | /// 181 | /// The symbol represents a value referenced through a dynamic pointer. 182 | /// This method generates the varnode representing the pointer itself and also 183 | /// returns the address space in anticipation of generating the LOAD or STORE 184 | /// that actually manipulates the value.
185 | /// \param vntpl is the dynamic template to reference 186 | /// \param vn is the object to fill with concrete values 187 | /// \return the address space being pointed to 188 | AddrSpace *SleighBuilder::generatePointer(const VarnodeTpl *vntpl,VarnodeData &vn) 189 | 190 | { 191 | const FixedHandle &hand(walker->getFixedHandle(vntpl->getOffset().getHandleIndex())); 192 | vn.space = hand.offset_space; 193 | vn.size = hand.offset_size; 194 | if (vn.space == const_space) 195 | vn.offset = hand.offset_offset & calc_mask(vn.size); 196 | else if (vn.space == uniq_space) 197 | vn.offset = hand.offset_offset | uniqueoffset; 198 | else 199 | vn.offset = vn.space->wrapOffset(hand.offset_offset); 200 | return hand.space; 201 | } 202 | 203 | void SleighBuilder::generatePointerAdd(PcodeData *op,const VarnodeTpl *vntpl) 204 | 205 | { 206 | uintb offsetPlus = vntpl->getOffset().getReal() & 0xffff; 207 | if (offsetPlus == 0) { 208 | return; 209 | } 210 | PcodeData *nextop = cache->allocateInstruction(); 211 | nextop->opc = op->opc; 212 | nextop->invar = op->invar; 213 | nextop->isize = op->isize; 214 | nextop->outvar = op->outvar; 215 | op->isize = 2; 216 | op->opc = CPUI_INT_ADD; 217 | VarnodeData *newparams = op->invar = cache->allocateVarnodes(2); 218 | newparams[0] = nextop->invar[1]; 219 | newparams[1].space = const_space; // Add in V_OFFSET_PLUS 220 | newparams[1].offset = offsetPlus; 221 | newparams[1].size = newparams[0].size; 222 | op->outvar = nextop->invar + 1; // Output of ADD is input to original op 223 | op->outvar->space = uniq_space; // Result of INT_ADD in special runtime temp 224 | op->outvar->offset = uniq_space->getTrans()->getUniqueStart(Translate::RUNTIME_BITRANGE_EA); 225 | } 226 | 227 | void SleighBuilder::dump(OpTpl *op) 228 | 229 | { // Dump one op through the low-level dump interface, 230 | // filling in dynamic loads and stores if necessary 231 | PcodeData *thisop; 232 | VarnodeData *invars; 233 | VarnodeData *loadvars; 234 | VarnodeData *storevars; 235 | VarnodeTpl *vn,*outvn; 236 | int4 isize = op->numInput(); 237 | // First build all the inputs 238 | invars = cache->allocateVarnodes(isize); 239 | for(int4 i=0;i<isize;++i) { 240 | vn = op->getIn(i); 241 | if (vn->isDynamic(*walker)) { 242 | generateLocation(vn,invars[i]); // Input of -op- is really temporary storage 243 | PcodeData *load_op = cache->allocateInstruction(); 244 | load_op->opc = CPUI_LOAD; 245 | load_op->outvar = invars + i; 246 | load_op->isize = 2; 247 | loadvars = load_op->invar = cache->allocateVarnodes(2); 248 | AddrSpace *spc = generatePointer(vn,loadvars[1]); 249 | loadvars[0].space = const_space; 250 | loadvars[0].offset = (uintb)(uintp)spc; 251 | loadvars[0].size = sizeof(spc); 252 | if (vn->getOffset().getSelect() == ConstTpl::v_offset_plus) 253 | generatePointerAdd(load_op, vn); 254 | } 255 | else 256 | generateLocation(vn,invars[i]); 257 | } 258 | if ((isize>0)&&(op->getIn(0)->isRelative())) { 259 | invars->offset += getLabelBase(); 260 | cache->addLabelRef(invars); 261 | } 262 | thisop = cache->allocateInstruction(); 263 | thisop->opc = op->getOpcode(); 264 | thisop->invar = invars; 265 | thisop->isize = isize; 266 | outvn = op->getOut(); 267 | if (outvn != (VarnodeTpl *)0) { 268 | if (outvn->isDynamic(*walker)) { 269 | storevars = cache->allocateVarnodes(3); 270 | generateLocation(outvn,storevars[2]); // Output of -op- is really temporary storage 271 | thisop->outvar = storevars+2; 272 | PcodeData *store_op = cache->allocateInstruction(); 273 | store_op->opc = CPUI_STORE; 274 | store_op->isize = 3; 275 | // store_op->outvar = 
(VarnodeData *)0; 276 | store_op->invar = storevars; 277 | AddrSpace *spc = generatePointer(outvn,storevars[1]); // pointer 278 | storevars[0].space = const_space; 279 | storevars[0].offset = (uintb)(uintp)spc; // space in which to store 280 | storevars[0].size = sizeof(spc); 281 | if (outvn->getOffset().getSelect() == ConstTpl::v_offset_plus) 282 | generatePointerAdd(store_op,outvn); 283 | } 284 | else { 285 | thisop->outvar = cache->allocateVarnodes(1); 286 | generateLocation(outvn,*thisop->outvar); 287 | } 288 | } 289 | } 290 | 291 | /// \brief Build a named p-code section of a constructor that contains only implied BUILD directives 292 | /// 293 | /// If a named section of a constructor is empty, we still need to walk 294 | /// through any subtables that might contain p-code in their named sections. 295 | /// This method treats each subtable operand as an implied \e build directive, 296 | /// in the otherwise empty section. 297 | /// \param ct is the Constructor currently being built 298 | /// \param secnum is the particular \e named section number to build 299 | void SleighBuilder::buildEmpty(Constructor *ct,int4 secnum) 300 | 301 | { 302 | int4 numops = ct->getNumOperands(); 303 | 304 | for(int4 i=0;i<numops;++i) { 305 | SubtableSymbol *sym = (SubtableSymbol *)ct->getOperand(i)->getDefiningSymbol(); 306 | if (sym == (SubtableSymbol *)0) continue; 307 | if (sym->getType() != SleighSymbol::subtable_symbol) continue; 308 | 309 | walker->pushOperand(i); 310 | ConstructTpl *construct = walker->getConstructor()->getNamedTempl(secnum); 311 | if (construct == (ConstructTpl *)0) 312 | buildEmpty(walker->getConstructor(),secnum); 313 | else 314 | build(construct,secnum); 315 | walker->popOperand(); 316 | } 317 | } 318 | 319 | /// Bits used to make temporary registers unique across multiple instructions 320 | /// are generated based on the given address.
321 | /// \param addr is the given Address 322 | void SleighBuilder::setUniqueOffset(const Address &addr) 323 | 324 | { 325 | uniqueoffset = (addr.getOffset() & uniquemask)<<4; 326 | } 327 | 328 | /// \brief Constructor 329 | /// 330 | /// \param w is the parsed instruction 331 | /// \param dcache is a cache of nearby instruction parses 332 | /// \param pc will hold the PcodeData and VarnodeData objects produced by \b this builder 333 | /// \param cspc is the constant address space 334 | /// \param uspc is the unique address space 335 | /// \param umask is the mask to use to find unique bits within an Address 336 | SleighBuilder::SleighBuilder(ParserWalker *w,DisassemblyCache *dcache,PcodeCacher *pc,AddrSpace *cspc, 337 | AddrSpace *uspc,uint4 umask) 338 | : PcodeBuilder(0) 339 | { 340 | walker = w; 341 | discache = dcache; 342 | cache = pc; 343 | const_space = cspc; 344 | uniq_space = uspc; 345 | uniquemask = umask; 346 | uniqueoffset = (walker->getAddr().getOffset() & uniquemask)<<4; 347 | } 348 | 349 | void SleighBuilder::appendBuild(OpTpl *bld,int4 secnum) 350 | 351 | { 352 | // Append p-code for a particular build statement 353 | int4 index = bld->getIn(0)->getOffset().getReal(); // Recover operand index from build statement 354 | // Check if operand is a subtable 355 | SubtableSymbol *sym = (SubtableSymbol *)walker->getConstructor()->getOperand(index)->getDefiningSymbol(); 356 | if ((sym==(SubtableSymbol *)0)||(sym->getType() != SleighSymbol::subtable_symbol)) return; 357 | 358 | walker->pushOperand(index); 359 | Constructor *ct = walker->getConstructor(); 360 | if (secnum >=0) { 361 | ConstructTpl *construct = ct->getNamedTempl(secnum); 362 | if (construct == (ConstructTpl *)0) 363 | buildEmpty(ct,secnum); 364 | else 365 | build(construct,secnum); 366 | } 367 | else { 368 | ConstructTpl *construct = ct->getTempl(); 369 | build(construct,-1); 370 | } 371 | walker->popOperand(); 372 | } 373 | 374 | void SleighBuilder::delaySlot(OpTpl *op) 375 | 376 | { 377 | // Append pcode for an entire instruction (delay slot) 378 | // in the middle of the current instruction 379 | ParserWalker *tmp = walker; 380 | uintb olduniqueoffset = uniqueoffset; 381 | 382 | Address baseaddr = tmp->getAddr(); 383 | int4 fallOffset = tmp->getLength(); 384 | int4 delaySlotByteCnt = tmp->getParserContext()->getDelaySlot(); 385 | int4 bytecount = 0; 386 | do { 387 | Address newaddr = baseaddr + fallOffset; 388 | setUniqueOffset(newaddr); 389 | const ParserContext *pos = discache->getParserContext(newaddr); 390 | if (pos->getParserState() != ParserContext::pcode) 391 | throw LowlevelError("Could not obtain cached delay slot instruction"); 392 | int4 len = pos->getLength(); 393 | 394 | ParserWalker newwalker( pos ); 395 | walker = &newwalker; 396 | walker->baseState(); 397 | build(walker->getConstructor()->getTempl(),-1); // Build the whole delay slot 398 | fallOffset += len; 399 | bytecount += len; 400 | } while(bytecount < delaySlotByteCnt); 401 | walker = tmp; // Restore original context 402 | uniqueoffset = olduniqueoffset; 403 | } 404 | 405 | void SleighBuilder::setLabel(OpTpl *op) 406 | 407 | { 408 | cache->addLabel( op->getIn(0)->getOffset().getReal()+getLabelBase() ); 409 | } 410 | 411 | void SleighBuilder::appendCrossBuild(OpTpl *bld,int4 secnum) 412 | 413 | { 414 | // Weave in the p-code section from an instruction at another address 415 | // bld-param(0) contains the address of the instruction 416 | // bld-param(1) contains the section number 417 | if (secnum>=0) 418 | throw LowlevelError("CROSSBUILD 
directive within a named section"); 419 | secnum = bld->getIn(1)->getOffset().getReal(); 420 | VarnodeTpl *vn = bld->getIn(0); 421 | AddrSpace *spc = vn->getSpace().fixSpace(*walker); 422 | uintb addr = spc->wrapOffset( vn->getOffset().fix(*walker) ); 423 | 424 | ParserWalker *tmp = walker; 425 | uintb olduniqueoffset = uniqueoffset; 426 | 427 | Address newaddr(spc,addr); 428 | setUniqueOffset(newaddr); 429 | const ParserContext *pos = discache->getParserContext( newaddr ); 430 | if (pos->getParserState() != ParserContext::pcode) 431 | throw LowlevelError("Could not obtain cached crossbuild instruction"); 432 | 433 | ParserWalker newwalker( pos, tmp->getParserContext() ); 434 | walker = &newwalker; 435 | 436 | walker->baseState(); 437 | Constructor *ct = walker->getConstructor(); 438 | ConstructTpl *construct = ct->getNamedTempl(secnum); 439 | if (construct == (ConstructTpl *)0) 440 | buildEmpty(ct,secnum); 441 | else 442 | build(construct,secnum); 443 | walker = tmp; 444 | uniqueoffset = olduniqueoffset; 445 | } 446 | 447 | /// \param min is the minimum number of allocations before a reuse is expected 448 | /// \param hashsize is the number of elements in the hash-table 449 | void DisassemblyCache::initialize(int4 min,int4 hashsize) 450 | 451 | { 452 | minimumreuse = min; 453 | mask = hashsize-1; 454 | uintb masktest = coveringmask((uintb)mask); 455 | if (masktest != (uintb)mask) // -hashsize- must be a power of 2 456 | throw LowlevelError("Bad windowsize for disassembly cache"); 457 | list = new ParserContext *[minimumreuse]; 458 | nextfree = 0; 459 | hashtable = new ParserContext *[hashsize]; 460 | for(int4 i=0;i<minimumreuse;++i) { 461 | ParserContext *pos = new ParserContext(contextcache,translate); 462 | pos->initialize(75,20,constspace); 463 | list[i] = pos; 464 | } 465 | ParserContext *pos = list[0]; 466 | for(int4 i=0;i<hashsize;++i) // Initialize the hashtable so every slot points at a valid ParserContext 467 | hashtable[i] = pos; 468 | } 469 | 470 | /// Free all the ParserContext objects along with the hash-table 471 | void DisassemblyCache::free(void) 472 | 473 | { 474 | for(int4 i=0;i<minimumreuse;++i) 475 | delete list[i]; 476 | delete [] list; 477 | delete [] hashtable; 478 | } 479 | 480 | /// \param trans is the Translate object instantiating this cache 481 | /// \param ccache is the ContextCache front-end for the ContextDatabase 482 | /// \param cspace is the constant address space 483 | /// \param cachesize is the number of ParserContext objects to allocate 484 | /// \param windowsize is the size of the ParserContext hash-table 485 | DisassemblyCache::DisassemblyCache(Translate *trans,ContextCache *ccache,AddrSpace *cspace,int4 cachesize,int4 windowsize) 486 | 487 | { 488 | translate = trans; 489 | contextcache = ccache; 490 | constspace = cspace; 491 | initialize(cachesize,windowsize); // Set default settings for the cache 492 | } 493 | 494 | /// Return a (possibly cached) ParserContext for the given address. If the 495 | /// address hits in the hash-table, the existing (already parsed) context is 496 | /// returned; otherwise the least recently used slot in the circular list is 497 | /// recycled and reset for the new address. 498 | /// \param addr is the given Address 499 | /// \return the ParserContext for that address 500 | ParserContext *DisassemblyCache::getParserContext(const Address &addr) 501 | 502 | { 503 | int4 hashindex = ((int4)addr.getOffset()) & mask; 504 | ParserContext *res = hashtable[ hashindex ]; 505 | 506 | if (res->getAddr() == addr) 507 | return res; 508 | res = list[ nextfree ]; 509 | nextfree += 1; // Advance the circular index 510 | if (nextfree >= minimumreuse) 511 | nextfree = 0; 512 | res->setAddr(addr); 513 | res->setParserState(ParserContext::uninitialized); // Need to start over with parsing 514 | hashtable[ hashindex ] = res; // Stick it into the hashtable 515 | return res; 516 | } 517 | 518 | /// \param ld is the LoadImage to draw program bytes from 519 | /// \param c_db is the context database 520 | SleighMishegos::SleighMishegos(LoadImage *ld,ContextDatabase *c_db) 521 | : SleighBase() 522 | 523 | { 524 | loader = ld; 525 | context_db = c_db; 526 | cache = new ContextCache(c_db); 527 | discache = (DisassemblyCache *)0; 528 | pos = (ParserContext *)0; 529 | } 530 | 531 | void SleighMishegos::clearForDelete(void) 532 | 533 | { 534 | delete cache; 535 | if (discache != (DisassemblyCache *)0) 536 | delete discache; 537 | if (pos != (ParserContext *)0) 538 | delete pos; 539 | } 540 | 541 | SleighMishegos::~SleighMishegos(void) 542 | 543 | { 544 | clearForDelete(); 545 | } 546 | 547 | /// Completely clear everything except the base and reconstruct 548 | /// with a new LoadImage and ContextDatabase 549 | /// \param ld is the new LoadImage 550 | /// \param c_db is the new ContextDatabase 551 | void SleighMishegos::reset(LoadImage *ld,ContextDatabase *c_db) 552 | 553 | { 554 | clearForDelete(); 555 | pcode_cache.clear(); 556 | loader = ld; 557 | context_db = c_db; 558 | cache = new ContextCache(c_db); 559 | discache = (DisassemblyCache *)0; 560 | pos = (ParserContext *)0; 561 | } 562 | 563 | /// The .sla file from the document store is loaded and cache objects are prepared 564 | /// \param store is the document store containing the main \<sleigh> tag.
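/// (Unlike upstream Sleigh::initialize, this variant builds no DisassemblyCache;
/// it allocates the single reusable ParserContext \e pos that obtainContext()
/// hands back for every request.)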
565 | void SleighMishegos::initialize(DocumentStorage &store) 566 | 567 | { 568 | if (!isInitialized()) { // Initialize the base if not already 569 | const Element *el = store.getTag("sleigh"); 570 | if (el == (const Element *)0) 571 | throw LowlevelError("Could not find sleigh tag"); 572 | restoreXml(el); 573 | } 574 | else 575 | reregisterContext(); 576 | uint4 parser_cachesize = 2; 577 | uint4 parser_windowsize = 32; 578 | if ((maxdelayslotbytes > 1)||(unique_allocatemask != 0)) { 579 | parser_cachesize = 8; 580 | parser_windowsize = 256; 581 | } 582 | pos = new ParserContext(cache,this); 583 | // Values taken from (now removed) DisassemblyCache::initialize. No 584 | // explanation for magic values 585 | pos->initialize(75, 20, getConstantSpace()); 586 | } 587 | 588 | /// \brief Obtain a parse tree for the instruction at the given address 589 | /// 590 | /// The tree may be cached from a previous access. If the address 591 | /// has not been parsed, disassembly is performed, and a new parse tree 592 | /// is prepared. Depending on the desired \e state, the parse tree 593 | /// can be prepared either for disassembly or for p-code generation. 594 | /// \param addr is the given address of the instruction 595 | /// \param state is the desired parse state. 596 | /// \return the parse tree object (ParseContext) 597 | ParserContext *SleighMishegos::obtainContext(const Address &addr,int4 state) const 598 | 599 | { 600 | pos->setAddr(addr); 601 | pos->setParserState(ParserContext::uninitialized); 602 | int4 curstate = pos->getParserState(); 603 | if (curstate >= state) 604 | return pos; 605 | if (curstate == ParserContext::uninitialized) { 606 | resolve(*pos); 607 | if (state == ParserContext::disassembly) 608 | return pos; 609 | } 610 | // If we reach here, state must be ParserContext::pcode 611 | resolveHandles(*pos); 612 | return pos; 613 | } 614 | 615 | /// Resolve \e all the constructors involved in the instruction at the indicated address 616 | /// \param pos is the parse object that will hold the resulting tree 617 | void SleighMishegos::resolve(ParserContext &pos) const 618 | 619 | { 620 | loader->loadFill(pos.getBuffer(),16,pos.getAddr()); 621 | ParserWalkerChange walker(&pos); 622 | pos.deallocateState(walker); // Clear the previous resolve and initialize the walker 623 | Constructor *ct,*subct; 624 | uint4 off; 625 | int4 oper,numoper; 626 | 627 | pos.setDelaySlot(0); 628 | walker.setOffset(0); // Initial offset 629 | pos.clearCommits(); // Clear any old context commits 630 | pos.loadContext(); // Get context for current address 631 | ct = root->resolve(walker); // Base constructor 632 | walker.setConstructor(ct); 633 | ct->applyContext(walker); 634 | while(walker.isState()) { 635 | ct = walker.getConstructor(); 636 | oper = walker.getOperand(); 637 | numoper = ct->getNumOperands(); 638 | while(oper < numoper) { 639 | OperandSymbol *sym = ct->getOperand(oper); 640 | off = walker.getOffset(sym->getOffsetBase()) + sym->getRelativeOffset(); 641 | pos.allocateOperand(oper,walker); // Descend into new operand and reserve space 642 | walker.setOffset(off); 643 | TripleSymbol *tsym = sym->getDefiningSymbol(); 644 | if (tsym != (TripleSymbol *)0) { 645 | subct = tsym->resolve(walker); 646 | if (subct != (Constructor *)0) { 647 | walker.setConstructor(subct); 648 | subct->applyContext(walker); 649 | break; 650 | } 651 | } 652 | walker.setCurrentLength(sym->getMinimumLength()); 653 | walker.popOperand(); 654 | oper += 1; 655 | } 656 | if (oper >= numoper) { // Finished processing constructor 
657 | walker.calcCurrentLength(ct->getMinimumLength(),numoper); 658 | walker.popOperand(); 659 | // Check for use of delayslot 660 | ConstructTpl *templ = ct->getTempl(); 661 | if ((templ != (ConstructTpl *)0)&&(templ->delaySlot() > 0)) 662 | pos.setDelaySlot(templ->delaySlot()); 663 | } 664 | } 665 | pos.setNaddr(pos.getAddr()+pos.getLength()); // Update Naddr to pointer after instruction 666 | pos.setParserState(ParserContext::disassembly); 667 | } 668 | 669 | /// Resolve handle templates for the given parse tree, assuming Constructors 670 | /// are already resolved. 671 | /// \param pos is the given parse tree 672 | void SleighMishegos::resolveHandles(ParserContext &pos) const 673 | 674 | { 675 | TripleSymbol *triple; 676 | Constructor *ct; 677 | int4 oper,numoper; 678 | 679 | ParserWalker walker(&pos); 680 | walker.baseState(); 681 | while(walker.isState()) { 682 | ct = walker.getConstructor(); 683 | oper = walker.getOperand(); 684 | numoper = ct->getNumOperands(); 685 | while(oper < numoper) { 686 | OperandSymbol *sym = ct->getOperand(oper); 687 | walker.pushOperand(oper); // Descend into node 688 | triple = sym->getDefiningSymbol(); 689 | if (triple != (TripleSymbol *)0) { 690 | if (triple->getType() == SleighSymbol::subtable_symbol) 691 | break; 692 | else // Some other kind of symbol as an operand 693 | triple->getFixedHandle(walker.getParentHandle(),walker); 694 | } 695 | else { // Must be an expression 696 | PatternExpression *patexp = sym->getDefiningExpression(); 697 | intb res = patexp->getValue(walker); 698 | FixedHandle &hand(walker.getParentHandle()); 699 | hand.space = pos.getConstSpace(); // Result of expression is a constant 700 | hand.offset_space = (AddrSpace *)0; 701 | hand.offset_offset = (uintb)res; 702 | hand.size = 0; // This size should not get used 703 | } 704 | walker.popOperand(); 705 | oper += 1; 706 | } 707 | if (oper >= numoper) { // Finished processing constructor 708 | ConstructTpl *templ = ct->getTempl(); 709 | if (templ != (ConstructTpl *)0) { 710 | HandleTpl *res = templ->getResult(); 711 | if (res != (HandleTpl *)0) // Pop up handle to containing operand 712 | res->fix(walker.getParentHandle(),walker); 713 | // If we need an indicator that the constructor exports nothing try 714 | // else 715 | // walker.getParentHandle().setInvalid(); 716 | } 717 | walker.popOperand(); 718 | } 719 | } 720 | pos.setParserState(ParserContext::pcode); 721 | } 722 | 723 | int4 SleighMishegos::instructionLength(const Address &baseaddr) const 724 | 725 | { 726 | ParserContext *pos = obtainContext(baseaddr,ParserContext::disassembly); 727 | return pos->getLength(); 728 | } 729 | 730 | int4 SleighMishegos::printAssembly(AssemblyEmit &emit,const Address &baseaddr) const 731 | 732 | { 733 | int4 sz; 734 | 735 | ParserContext *pos = obtainContext(baseaddr,ParserContext::disassembly); 736 | ParserWalker walker(pos); 737 | walker.baseState(); 738 | 739 | Constructor *ct = walker.getConstructor(); 740 | ostringstream mons; 741 | ct->printMnemonic(mons,walker); 742 | ostringstream body; 743 | ct->printBody(body,walker); 744 | emit.dump(baseaddr,mons.str(),body.str()); 745 | sz = pos->getLength(); 746 | return sz; 747 | } 748 | 749 | int4 SleighMishegos::oneInstruction(PcodeEmit &emit,const Address &baseaddr) const 750 | 751 | { 752 | throw UnimplError("Unimplemented oneInstruction", 0); 753 | int4 fallOffset; 754 | if (alignment != 1) { 755 | if ((baseaddr.getOffset() % alignment)!=0) { 756 | ostringstream s; 757 | s << "Instruction address not aligned: " << baseaddr; 758 | throw 
UnimplError(s.str(),0); 759 | } 760 | } 761 | 762 | ParserContext *pos = obtainContext(baseaddr,ParserContext::pcode); 763 | pos->applyCommits(); 764 | fallOffset = pos->getLength(); 765 | 766 | if (pos->getDelaySlot()>0) { 767 | int4 bytecount = 0; 768 | do { 769 | // Do not pass pos->getNaddr() to obtainContext, as pos may have been previously cached and had naddr adjusted 770 | ParserContext *delaypos = obtainContext(pos->getAddr() + fallOffset,ParserContext::pcode); 771 | delaypos->applyCommits(); 772 | int4 len = delaypos->getLength(); 773 | fallOffset += len; 774 | bytecount += len; 775 | } while(bytecount < pos->getDelaySlot()); 776 | pos->setNaddr(pos->getAddr()+fallOffset); 777 | } 778 | ParserWalker walker(pos); 779 | walker.baseState(); 780 | pcode_cache.clear(); 781 | SleighBuilder builder(&walker,discache,&pcode_cache,getConstantSpace(),getUniqueSpace(),unique_allocatemask); 782 | try { 783 | builder.build(walker.getConstructor()->getTempl(),-1); 784 | pcode_cache.resolveRelatives(); 785 | pcode_cache.emit(baseaddr,&emit); 786 | } catch(UnimplError &err) { 787 | ostringstream s; 788 | s << "Instruction not implemented in pcode:\n "; 789 | ParserWalker *cur = builder.getCurrentWalker(); 790 | cur->baseState(); 791 | Constructor *ct = cur->getConstructor(); 792 | cur->getAddr().printRaw(s); 793 | s << ": "; 794 | ct->printMnemonic(s,*cur); 795 | s << " "; 796 | ct->printBody(s,*cur); 797 | err.explain = s.str(); 798 | err.instruction_length = fallOffset; 799 | throw err; 800 | } 801 | return fallOffset; 802 | } 803 | 804 | void SleighMishegos::registerContext(const string &name,int4 sbit,int4 ebit) 805 | 806 | { 807 | context_db->registerVariable(name,sbit,ebit); 808 | } 809 | 810 | void SleighMishegos::setContextDefault(const string &name,uintm val) 811 | 812 | { 813 | context_db->setVariableDefault(name,val); 814 | } 815 | 816 | void SleighMishegos::allowContextSet(bool val) const 817 | 818 | { 819 | cache->allowSet(val); 820 | } 821 | 822 | } // End namespace ghidra 823 | -------------------------------------------------------------------------------- /src/worker/ghidra/sleighMishegos.hh: -------------------------------------------------------------------------------- 1 | /* ### 2 | * IP: GHIDRA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * This file was copied from upstream 17 | * https://github.com/NationalSecurityAgency/ghidra/blob/2536099c0eb2683ee0e416a127f8a8795f8de853/Ghidra/Features/Decompiler/src/decompile/cpp/sleigh.hh 18 | * 19 | * Modified by Eric Kilmer at Trail of Bits 2022 20 | * Modified to better support mishegos single-shot disassembly by not using the 21 | * disassembly cache. This allows us to get new disassembly results at the same 22 | * address without having to reinitialize everything. 23 | * 24 | * This file has been modified in a way to minimize the diff from upstream. 
25 | * There is dead code and other code artifacts that probably wouldn't be 26 | * written in the same way had this functionality been written fresh. 27 | */ 28 | /// \file sleigh.hh 29 | /// \brief Classes and utilities for the main SLEIGH engine 30 | 31 | #ifndef __SLEIGHMISHEGOS__ 32 | #define __SLEIGHMISHEGOS__ 33 | 34 | #include 35 | 36 | namespace ghidra { 37 | 38 | class LoadImage; 39 | 40 | /// \brief Class for describing a relative p-code branch destination 41 | /// 42 | /// An intra-instruction p-code branch takes a \e relative operand. 43 | /// The actual value produced during p-code generation is calculated at 44 | /// the last second using \b this. It stores the index of the BRANCH 45 | /// instruction and a reference to its destination operand. This initially 46 | /// holds a reference to a destination \e label symbol, but is later updated 47 | /// with the final relative value. 48 | struct RelativeRecord { 49 | VarnodeData *dataptr; ///< Varnode indicating relative offset 50 | uintb calling_index; ///< Index of instruction containing relative offset 51 | }; 52 | 53 | /// \brief Data for building one p-code instruction 54 | /// 55 | /// Raw data used by the emitter to produce a single PcodeOp 56 | struct PcodeData { 57 | OpCode opc; ///< The op code 58 | VarnodeData *outvar; ///< Output Varnode data (or null) 59 | VarnodeData *invar; ///< Array of input Varnode data 60 | int4 isize; ///< Number of input Varnodes 61 | }; 62 | 63 | /// \brief Class for caching a chunk of p-code, prior to emitting 64 | /// 65 | /// The engine accumulates PcodeData and VarnodeData objects for 66 | /// a single instruction. Once the full instruction is constructed, 67 | /// the objects are passed to the emitter (PcodeEmit) via the emit() method. 68 | /// The class acts as a pool of memory for PcodeData and VarnodeData objects 69 | /// that can be reused repeatedly to emit multiple instructions. 
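/// A typical cycle, as driven by SleighMishegos::oneInstruction() in
/// sleighMishegos.cc, is: clear(), then allocateInstruction() and
/// allocateVarnodes() while walking the parse tree, then resolveRelatives(),
/// and finally emit().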
70 | class PcodeCacher { 71 | VarnodeData *poolstart; ///< Start of the pool of VarnodeData objects 72 | VarnodeData *curpool; ///< First unused VarnodeData 73 | VarnodeData *endpool; ///< End of the pool of VarnodeData objects 74 | vector<PcodeData> issued; ///< P-code ops issued for the current instruction 75 | list<RelativeRecord> label_refs; ///< References to labels 76 | vector<uint4> labels; ///< Locations of labels 77 | VarnodeData *expandPool(uint4 size); ///< Expand the memory pool 78 | public: 79 | PcodeCacher(void); ///< Constructor 80 | ~PcodeCacher(void); ///< Destructor 81 | 82 | /// \brief Allocate data objects for a new set of Varnodes 83 | /// 84 | /// \param size is the number of objects to allocate 85 | /// \return a pointer to the array of available VarnodeData objects 86 | VarnodeData *allocateVarnodes(uint4 size) { 87 | VarnodeData *newptr = curpool + size; 88 | if (newptr <= endpool) { 89 | VarnodeData *res = curpool; 90 | curpool = newptr; 91 | return res; 92 | } 93 | return expandPool(size); 94 | } 95 | 96 | /// \brief Allocate a data object for a new p-code operation 97 | /// 98 | /// \return the new PcodeData object 99 | PcodeData *allocateInstruction(void) { 100 | issued.emplace_back(); 101 | PcodeData *res = &issued.back(); 102 | res->outvar = (VarnodeData *)0; 103 | res->invar = (VarnodeData *)0; 104 | return res; 105 | } 106 | void addLabelRef(VarnodeData *ptr); ///< Denote a Varnode holding a \e relative \e branch offset 107 | void addLabel(uint4 id); ///< Attach a label to the \e next p-code instruction 108 | void clear(void); ///< Reset the cache so that all objects are unallocated 109 | void resolveRelatives(void); ///< Rewrite branch target Varnodes as \e relative offsets 110 | void emit(const Address &addr,PcodeEmit *emt) const; ///< Pass the cached p-code data to the emitter 111 | }; 112 | 113 | /// \brief A container for disassembly context used by the SLEIGH engine 114 | /// 115 | /// This acts as a factory for the ParserContext objects which are used to disassemble 116 | /// a single instruction. These all share a ContextCache which is a front end for 117 | /// accessing the ContextDatabase and resolving context variables from the SLEIGH spec. 118 | /// ParserContext objects are stored in a hash-table keyed by the address of the instruction.
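/// (In this mishegos variant the class is effectively dead code, retained to
/// minimize the upstream diff: SleighMishegos never constructs a
/// DisassemblyCache, and its obtainContext() reuses a single ParserContext
/// instead.)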
119 | class DisassemblyCache { 120 | Translate *translate; ///< The Translate object that owns this cache 121 | ContextCache *contextcache; ///< Cached values from the ContextDatabase 122 | AddrSpace *constspace; ///< The constant address space 123 | int4 minimumreuse; ///< Can call getParserContext this many times, before a ParserContext is reused 124 | uint4 mask; ///< Size of the hashtable in form 2^n-1 125 | ParserContext **list; ///< (circular) array of currently cached ParserContext objects 126 | int4 nextfree; ///< Current end/beginning of circular list 127 | ParserContext **hashtable; ///< Hashtable for looking up ParserContext via Address 128 | void initialize(int4 min,int4 hashsize); ///< Initialize the hash-table of ParserContexts 129 | void free(void); ///< Free the hash-table of ParserContexts 130 | public: 131 | DisassemblyCache(Translate *trans,ContextCache *ccache,AddrSpace *cspace,int4 cachesize,int4 windowsize); ///< Constructor 132 | ~DisassemblyCache(void) { free(); } ///< Destructor 133 | ParserContext *getParserContext(const Address &addr); ///< Get the parser for a particular Address 134 | }; 135 | 136 | /// \brief Build p-code from a pre-parsed instruction 137 | /// 138 | /// Through the build() method, \b this walks the parse tree and prepares data 139 | /// for final emission as p-code. (The final emitting is done separately through the 140 | /// PcodeCacher.emit() method). Generally, only p-code for one instruction is prepared. 141 | /// But, through the \b delay-slot mechanism, build() may recursively visit 142 | /// additional instructions. 143 | class SleighBuilder : public PcodeBuilder { 144 | virtual void dump( OpTpl *op ); 145 | AddrSpace *const_space; ///< The constant address space 146 | AddrSpace *uniq_space; ///< The unique address space 147 | uintb uniquemask; ///< Mask of address bits to use to uniquify temporary registers 148 | uintb uniqueoffset; ///< Uniquifier bits for \b this instruction 149 | DisassemblyCache *discache; ///< Cache of disassembled instructions 150 | PcodeCacher *cache; ///< Cache accumulating p-code data for the instruction 151 | void buildEmpty(Constructor *ct,int4 secnum); 152 | void generateLocation(const VarnodeTpl *vntpl,VarnodeData &vn); 153 | AddrSpace *generatePointer(const VarnodeTpl *vntpl,VarnodeData &vn); 154 | void generatePointerAdd(PcodeData *op,const VarnodeTpl *vntpl); 155 | void setUniqueOffset(const Address &addr); ///< Set uniquifying bits for the current instruction 156 | public: 157 | SleighBuilder(ParserWalker *w,DisassemblyCache *dcache,PcodeCacher *pc,AddrSpace *cspc,AddrSpace *uspc,uint4 umask); 158 | virtual void appendBuild(OpTpl *bld,int4 secnum); 159 | virtual void delaySlot(OpTpl *op); 160 | virtual void setLabel(OpTpl *op); 161 | virtual void appendCrossBuild(OpTpl *bld,int4 secnum); 162 | }; 163 | 164 | /// \brief A full SLEIGH engine 165 | /// 166 | /// It's provided with a LoadImage of the bytes to be disassembled and 167 | /// a ContextDatabase. 168 | /// 169 | /// Assembly is produced via the printAssembly() method, provided with an 170 | /// AssemblyEmit object and an Address. 171 | /// 172 | /// P-code is produced via the oneInstruction() method, provided with a PcodeEmit 173 | /// object and an Address.
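/// (Here only the printAssembly() path is actually exercised by the worker;
/// oneInstruction() is stubbed to throw UnimplError immediately — see
/// sleighMishegos.cc.)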
174 | class SleighMishegos : public SleighBase { 175 | LoadImage *loader; ///< The mapped bytes in the program 176 | ContextDatabase *context_db; ///< Database of context values steering disassembly 177 | ContextCache *cache; ///< Cache of recently used context values 178 | mutable DisassemblyCache *discache; ///< Cache of recently parsed instructions 179 | mutable PcodeCacher pcode_cache; ///< Cache of p-code data just prior to emitting 180 | ParserContext *pos; 181 | void clearForDelete(void); ///< Delete the context and disassembly caches 182 | protected: 183 | ParserContext *obtainContext(const Address &addr,int4 state) const; 184 | void resolve(ParserContext &pos) const; ///< Generate a parse tree suitable for disassembly 185 | void resolveHandles(ParserContext &pos) const; ///< Prepare the parse tree for p-code generation 186 | public: 187 | SleighMishegos(LoadImage *ld,ContextDatabase *c_db); ///< Constructor 188 | virtual ~SleighMishegos(void); ///< Destructor 189 | void reset(LoadImage *ld,ContextDatabase *c_db); ///< Reset the engine for a new program 190 | virtual void initialize(DocumentStorage &store); 191 | virtual void registerContext(const string &name,int4 sbit,int4 ebit); 192 | virtual void setContextDefault(const string &nm,uintm val); 193 | virtual void allowContextSet(bool val) const; 194 | virtual int4 instructionLength(const Address &baseaddr) const; 195 | virtual int4 oneInstruction(PcodeEmit &emit,const Address &baseaddr) const; 196 | virtual int4 printAssembly(AssemblyEmit &emit,const Address &baseaddr) const; 197 | }; 198 | 199 | } // End namespace ghidra 200 | 201 | /** \page sleigh SLEIGH 202 | 203 | \section sleightoc Table of Contents 204 | 205 | - \ref sleighoverview 206 | - \ref sleighbuild 207 | - \ref sleighuse 208 | - \subpage sleighAPIbasic 209 | - \subpage sleighAPIemulate 210 | 211 | \b Key \b Classes 212 | - \ref Translate 213 | - \ref AssemblyEmit 214 | - \ref PcodeEmit 215 | - \ref LoadImage 216 | - \ref ContextDatabase 217 | 218 | \section sleighoverview Overview 219 | 220 | Welcome to \b SLEIGH, a machine language translation and 221 | disassembly engine. SLEIGH is both a processor 222 | specification language and the associated library and 223 | tools for using such a specification to generate assembly 224 | and to generate \b pcode, a reverse engineering Register 225 | Transfer Language (RTL), from binary machine instructions. 226 | 227 | SLEIGH was originally based on \b SLED, a 228 | \e Specification \e Language \e for \e Encoding \e and 229 | \e Decoding, designed by Norman Ramsey and Mary F. Fernandez, 230 | which performed disassembly (and assembly). SLEIGH 231 | extends SLED by providing semantic descriptions (via the 232 | RTL) of machine instructions and other practical enhancements 233 | for doing real world reverse engineering. 234 | 235 | SLEIGH is part of Project \b GHIDRA. It provides the core 236 | of the GHIDRA disassembler and the data-flow and 237 | decompilation analysis. However, SLEIGH can serve as a 238 | standalone library for use in other applications for 239 | providing a generic disassembly and RTL translation interface. 240 | 241 | \section sleighbuild Building SLEIGH 242 | 243 | There are a couple of \e make targets for building the SLEIGH 244 | library from source.
These are: 245 | 246 | \code 247 | make libsla.a # Build the main library 248 | 249 | make libsla_dbg.a # Build the library with debug symbols 250 | \endcode 251 | 252 | The source code file \e sleighexample.cc has a complete example 253 | of initializing the Translate engine and using it to generate 254 | assembly and pcode. The source has a hard-coded file name, 255 | \e x86testcode, as the example binary executable it attempts 256 | to decode, but this can easily be changed. It also needs 257 | a SLEIGH specification file (\e .sla) to be present. 258 | 259 | Building the example application can be done with something 260 | similar to the following makefile fragment. 261 | 262 | \code 263 | # The C++ compiler 264 | CXX=g++ 265 | 266 | # Debug flags 267 | DBG_CXXFLAGS=-g -Wall -Wno-sign-compare 268 | 269 | OPT_CXXFLAGS=-O2 -Wall -Wno-sign-compare 270 | 271 | # libraries 272 | INCLUDES=-I./src 273 | 274 | LNK=src/libsla_dbg.a 275 | 276 | sleighexample: sleighexample.o 277 | $(CXX) $(DBG_CXXFLAGS) -o sleighexample sleighexample.o $(LNK) 278 | 279 | clean: 280 | rm -rf *.o sleighexample 281 | \endcode 282 | 283 | \section sleighuse Using SLEIGH 284 | 285 | SLEIGH is a generic reverse engineering tool in the sense 286 | that the API is designed to be completely processor 287 | independent. In order to process binary executables for a 288 | specific processor, the library reads in a \e 289 | specification \e file, which describes how instructions 290 | are encoded and how they are interpreted by the processor. 291 | An application which needs to do disassembly or generate 292 | \b pcode can design to the SLEIGH API once, and then the 293 | application will automatically support any processor for 294 | which there is a specification. 295 | 296 | For working with a single processor, the SLEIGH library 297 | needs to load a single \e compiled form of the processor 298 | specification, which is traditionally given a ".sla" suffix. 299 | Most common processors already have a ".sla" file available. 300 | So to use SLEIGH with these processors, the library merely 301 | needs to be made aware of the desired file. This documentation 302 | covers the use of the SLEIGH API, assuming that this 303 | specification file is available. 304 | 305 | The ".sla" files themselves are created by running 306 | the \e compiler on a file written in the formal SLEIGH 307 | language. These files traditionally have the suffix ".slaspec". 308 | For those who want to design such a specification for a new 309 | processor, please refer to the document, "SLEIGH: A Language 310 | for Rapid Processor Specification." 311 | 312 | */ 313 | 314 | /** 315 | \page sleighAPIbasic The Basic SLEIGH Interface 316 | 317 | To use SLEIGH as a library within an application, there 318 | are basically five classes that you need to be aware of. 319 | 320 | - \ref sleightranslate 321 | - \ref sleighassememit 322 | - \ref sleighpcodeemit 323 | - \ref sleighloadimage 324 | - \ref sleighcontext 325 | 326 | \section sleightranslate Translate (or Sleigh) 327 | 328 | The core SLEIGH class is Sleigh, which is derived from the 329 | interface, Translate. In order to instantiate it in your code, 330 | you need a LoadImage object, and a ContextDatabase object. 331 | The load image is responsible for retrieving instruction 332 | bytes, based on address, from a binary executable. The context 333 | database provides the library extra mode information that may 334 | be necessary to do the disassembly or translation.
This can 335 | be used, for instance, to specify that an x86 binary is running 336 | in 32-bit mode, or to specify that an ARM processor is running 337 | in THUMB mode. Once these objects are built, the Sleigh 338 | object can be immediately instantiated. 339 | 340 | \code 341 | LoadImageBfd *loader; 342 | ContextDatabase *context; 343 | Translate *trans; 344 | 345 | // Set up the loadimage 346 | // Providing an executable name and architecture 347 | string loadimagename = "x86testcode"; 348 | string bfdtarget= "default"; 349 | 350 | loader = new LoadImageBfd(loadimagename,bfdtarget); 351 | loader->open(); // Load the executable from file 352 | 353 | context = new ContextInternal(); // Create a processor context 354 | 355 | trans = new Sleigh(loader,context); // Instantiate the translator 356 | \endcode 357 | 358 | Once the Sleigh object is in hand, the only required 359 | initialization step left is to inform it of the ".sla" file. 360 | The file is in XML format and needs to be read in using 361 | SLEIGH's built-in XML parser. The following code accomplishes 362 | this. 363 | 364 | \code 365 | string sleighfilename = "specfiles/x86.sla"; 366 | DocumentStorage docstorage; 367 | Element *sleighroot = docstorage.openDocument(sleighfilename)->getRoot(); 368 | docstorage.registerTag(sleighroot); 369 | trans->initialize(docstorage); // Initialize the translator 370 | \endcode 371 | 372 | \section sleighassememit AssemblyEmit 373 | 374 | In order to do disassembly, you need to derive a class from 375 | AssemblyEmit, and implement the method \e dump. The library 376 | will call this method exactly once, for each instruction 377 | disassembled. 378 | 379 | This routine simply needs to decide how (and where) to print 380 | the corresponding portion of the disassembly. For instance, 381 | 382 | \code 383 | class AssemblyRaw : public AssemblyEmit { 384 | public: 385 | virtual void dump(const Address &addr,const string &mnem,const string &body) { 386 | addr.printRaw(cout); 387 | cout << ": " << mnem << ' ' << body << endl; 388 | } 389 | }; 390 | \endcode 391 | 392 | This is a minimal implementation that simply dumps the 393 | disassembly straight to standard out. Once this object is 394 | instantiated, the Sleigh object can use it to write out 395 | assembly via the Translate::printAssembly() method. 396 | 397 | \code 398 | AssemblyEmit *assememit = new AssemblyRaw(); 399 | 400 | Address addr(trans->getDefaultCodeSpace(),0x80484c0); 401 | int4 length; // Length of instruction in bytes 402 | 403 | length = trans->printAssembly(*assememit,addr); 404 | addr = addr + length; // Advance to next instruction 405 | length = trans->printAssembly(*assememit,addr); 406 | addr = addr + length; 407 | length = trans->printAssembly(*assememit,addr); 408 | \endcode 409 | 410 | \section sleighpcodeemit PcodeEmit 411 | 412 | In order to generate a \b pcode translation of a machine 413 | instruction, you need to derive a class from PcodeEmit and 414 | implement the virtual method \e dump. This method will be 415 | invoked once for each \b pcode operation in the translation 416 | of a machine instruction. There will likely be multiple calls 417 | per instruction. Each call passes in a single \b pcode 418 | operation, complete with its possible varnode output, and 419 | all of its varnode inputs. Here is an example of a PcodeEmit 420 | object that simply prints out the \b pcode. 
421 | 422 | \code 423 | class PcodeRawOut : public PcodeEmit { 424 | public: 425 | virtual void dump(const Address &addr,OpCode opc,VarnodeData *outvar,VarnodeData *vars,int4 isize); 426 | }; 427 | 428 | static void print_vardata(ostream &s,VarnodeData &data) 429 | 430 | { 431 | s << '(' << data.space->getName() << ','; 432 | data.space->printOffset(s,data.offset); 433 | s << ',' << dec << data.size << ')'; 434 | } 435 | 436 | void PcodeRawOut::dump(const Address &addr,OpCode opc,VarnodeData *outvar,VarnodeData *vars,int4 isize) 437 | 438 | { 439 | if (outvar != (VarnodeData *)0) { // The output is optional 440 | print_vardata(cout,*outvar); 441 | cout << " = "; 442 | } 443 | cout << get_opname(opc); 444 | // Possibly check for a code reference or a space reference 445 | for(int4 i=0;i<isize;++i) { 446 | cout << ' '; 447 | print_vardata(cout,vars[i]); 448 | } 449 | cout << endl; 450 | } 451 | \endcode 452 | 453 | For each varnode, this routine prints a triple: the address 454 | space the varnode lives in, its offset within that space, 455 | and its size in bytes. The opcode itself is converted to 456 | a string using the built-in function get_opname(). 457 | 458 | Keep in mind that, unlike the AssemblyEmit callback, this 459 | dump method is usually invoked several times for a single 460 | machine instruction, once for each pcode operation in its 461 | translation. 462 | 463 | Once this object is instantiated, the Sleigh object can 464 | use it to generate pcode, one machine instruction at a 465 | time, using the Translate::oneInstruction() method. As 466 | with printAssembly(), the method returns the length of 467 | the decoded instruction in bytes, which can be used to 468 | advance the address to the next instruction. 469 | 470 | \code 471 | PcodeEmit *pcodeemit = new PcodeRawOut(); 472 | 473 | Address addr(trans->getDefaultCodeSpace(),0x80484c0); 474 | int4 length; // Length of instruction in bytes 475 | 476 | length = trans->oneInstruction(*pcodeemit,addr); 477 | addr = addr + length; // Advance to next instruction 478 | length = trans->oneInstruction(*pcodeemit,addr); 479 | addr = addr + length; 480 | length = trans->oneInstruction(*pcodeemit,addr); 481 | \endcode 482 | 483 | For an application to properly \e follow \e flow, while translating 484 | machine instructions into pcode, the emitted pcode must be 485 | inspected for the various branch operations. 486 | 487 | \section sleighloadimage LoadImage 488 | 489 | A LoadImage holds all the binary data from an executable file 490 | in a format similar to how it would exist when being executed 491 | by a real processor. The interface to this from SLEIGH is 492 | actually very simple, although it can hide a complicated 493 | structure. One method does most of the work, LoadImage::loadFill(). 494 | It takes a byte pointer, a size, and an Address. The method 495 | is expected to fill in the \e ptr array with \e size bytes 496 | taken from the load image, corresponding to the address \e addr. 497 | There are two more virtual methods that are required for a 498 | complete implementation of LoadImage, \e getArchType and 499 | \e adjustVma, but these do not need to be implemented fully. 500 | 501 | \code 502 | class MyLoadImage : public LoadImage { 503 | public: 504 | MyLoadImage(const string &nm) : LoadImage(nm) {} 505 | virtual void loadFill(uint1 *ptr,int4 size,const Address &addr); 506 | virtual string getArchType(void) const { return "mytype"; } 507 | virtual void adjustVma(long adjust) {} 508 | }; 509 | \endcode 510 | 511 | \section sleighcontext ContextDatabase 512 | 513 | The ContextDatabase needs to keep track of any possible 514 | context variable and its value, over different address ranges. 515 | In most cases, you probably don't need to override the class 516 | yourself, but can use the built-in class, ContextInternal. 517 | This provides the basic functionality required and will work 518 | for different architectures. What you may need to do is 519 | set values for certain variables, depending on the processor 520 | and the environment it is running in. For instance, for 521 | the x86 platform, you need to set the \e addrsize and \e opsize 522 | bits, to indicate the processor would be running in 32-bit 523 | mode. The context variables specific to a particular processor 524 | are established by the SLEIGH spec. So the variables can 525 | only be set \e after the spec has been loaded. 526 | 527 | \code 528 | ...
529 | context = new ContextInternal(); 530 | trans = new Sleigh(loader,context); 531 | DocumentStorage docstorage; 532 | Element *root = docstorage.openDocument("specfiles/x86.sla")->getRoot(); 533 | docstorage.registerTag(root); 534 | trans->initialize(docstorage); 535 | 536 | context->setVariableDefault("addrsize",1); // Address size is 32-bits 537 | context->setVariableDefault("opsize",1); // Operand size is 32-bits 538 | \endcode 539 | 540 | 541 | */ 542 | #endif 543 | -------------------------------------------------------------------------------- /src/worker/iced/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "iced" 3 | version = "1.0.0" 4 | authors = ["mishegos"] 5 | edition = "2018" 6 | 7 | [lib] 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies.iced-x86] 11 | default-features = false 12 | features = ["std", "decoder", "intel"] 13 | path = "./iced/src/rust/iced-x86" 14 | 15 | [build-dependencies] 16 | bindgen = "*" 17 | 18 | [profile.release] 19 | codegen-units = 1 20 | lto = true 21 | opt-level = 3 22 | -------------------------------------------------------------------------------- /src/worker/iced/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: iced.so 3 | 4 | iced.so: target/release/libiced.$(SO_SUFFIX) 5 | cp target/release/libiced.$(SO_SUFFIX) $@ 6 | 7 | target/release/libiced.$(SO_SUFFIX): 8 | cargo test --release 9 | cargo build --release 10 | 11 | .PHONY: clean 12 | clean: 13 | cargo clean --release 14 | rm -f *.so 15 | -------------------------------------------------------------------------------- /src/worker/iced/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | fn main() { 5 | println!("cargo:rerun-if-changed=build.rs"); 6 | println!("cargo:rerun-if-changed=wrapper.h"); 7 | 8 | let clang_args = env::var("RUST_BINDGEN_CLANG_ARGS").unwrap(); 9 | let bindings = bindgen::Builder::default() 10 | .header("wrapper.h") 11 | .clang_args(clang_args.split_ascii_whitespace()) 12 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 13 | .generate() 14 | .expect("Unable to generate bindings"); 15 | 16 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 17 | bindings 18 | .write_to_file(out_path.join("bindings.rs")) 19 | .expect("Couldn't write bindings!"); 20 | } 21 | -------------------------------------------------------------------------------- /src/worker/iced/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod mishegos; 2 | 3 | use iced_x86::*; 4 | use mishegos::{ 5 | decode_result, decode_status_S_FAILURE, decode_status_S_PARTIAL, decode_status_S_SUCCESS, 6 | }; 7 | 8 | // This is pretty ugly and assumes sizeof(char) == 1 9 | #[no_mangle] 10 | pub static mut worker_name: *const std::os::raw::c_char = 11 | WORKER_NAME.as_ptr() as *const std::os::raw::c_char; 12 | static WORKER_NAME: &str = "iced\0"; 13 | 14 | #[allow(clippy::missing_safety_doc)] 15 | #[no_mangle] 16 | pub unsafe extern "C" fn try_decode(result: *mut decode_result, raw_insn: *const u8, length: u8) { 17 | assert!(!result.is_null()); 18 | assert!(!raw_insn.is_null()); 19 | let data = std::slice::from_raw_parts(raw_insn, length as usize); 20 | let result = &mut *result; 21 | let error = match try_decode_safe(64, 0, data) { 22 | Err(error) => error, 23 | Ok((instr_len, output)) => { 24 | result.ndecoded = instr_len as u16; 25 
| assert_eq!(std::mem::size_of::<std::os::raw::c_char>(), 1); 26 | if output.len() > result.result.len() { 27 | decode_status_S_FAILURE 28 | } else { 29 | std::ptr::copy( 30 | output.as_ptr(), 31 | result.result.as_mut_ptr() as *mut u8, 32 | output.len(), 33 | ); 34 | result.len = output.len() as u16; 35 | decode_status_S_SUCCESS 36 | } 37 | } 38 | }; 39 | result.status = error; 40 | } 41 | 42 | fn try_decode_safe(bitness: u32, ip: u64, data: &[u8]) -> Result<(usize, String), u32> { 43 | const DECODER_OPTIONS: u32 = DecoderOptions::NONE; 44 | 45 | let mut decoder = Decoder::new(bitness, data, DECODER_OPTIONS); 46 | decoder.set_ip(ip); 47 | let instr = decoder.decode(); 48 | 49 | if instr.is_invalid() { 50 | match decoder.last_error() { 51 | DecoderError::None => unreachable!(), 52 | DecoderError::NoMoreBytes => Err(decode_status_S_PARTIAL), 53 | _ => Err(decode_status_S_FAILURE), 54 | } 55 | } else { 56 | let mut formatter = IntelFormatter::new(); 57 | // Try to match default XED output 58 | formatter.options_mut().set_hex_suffix(""); 59 | formatter.options_mut().set_hex_prefix("0x"); 60 | formatter.options_mut().set_uppercase_hex(false); 61 | formatter 62 | .options_mut() 63 | .set_space_after_operand_separator(true); 64 | formatter 65 | .options_mut() 66 | .set_memory_size_options(MemorySizeOptions::Always); 67 | formatter.options_mut().set_always_show_scale(true); 68 | formatter.options_mut().set_rip_relative_addresses(true); 69 | formatter 70 | .options_mut() 71 | .set_small_hex_numbers_in_decimal(false); 72 | formatter.options_mut().set_cc_ge(CC_ge::nl); 73 | formatter.options_mut().set_cc_a(CC_a::nbe); 74 | formatter.options_mut().set_cc_e(CC_e::z); 75 | formatter.options_mut().set_cc_ne(CC_ne::nz); 76 | formatter.options_mut().set_cc_ae(CC_ae::nb); 77 | formatter.options_mut().set_cc_g(CC_g::nle); 78 | formatter.options_mut().set_show_branch_size(false); 79 | formatter.options_mut().set_branch_leading_zeroes(false); 80 | formatter.options_mut().set_use_pseudo_ops(false); 81 | 82 | let mut output = String::new(); 83 | formatter.format(&instr, &mut output); 84 | 85 | Ok((instr.len(), output)) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/worker/iced/src/mishegos.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | #![allow(dead_code)] 5 | // "warning: `extern` block uses type `u128`, which is not FFI-safe" 6 | // "note: 128-bit integers don't currently have a known stable ABI" 7 | #![allow(improper_ctypes)] 8 | #![allow(clippy::redundant_static_lifetimes)] 9 | 10 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 11 | -------------------------------------------------------------------------------- /src/worker/iced/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "../worker.h" 2 | -------------------------------------------------------------------------------- /src/worker/llvm/Makefile: -------------------------------------------------------------------------------- 1 | LLVM_CONFIG=llvm-config 2 | override CPPFLAGS := $(CPPFLAGS) $(shell $(LLVM_CONFIG) --cppflags) 3 | override LDFLAGS := $(LDFLAGS) $(shell $(LLVM_CONFIG) --ldflags) -Wl,-z,defs 4 | override LDLIBS := $(LDLIBS) $(shell $(LLVM_CONFIG) --libs) 5 | 6 | .PHONY: all 7 | all: llvm.so 8 | 9 | llvm.so: llvm.c 10 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@ 11 | 12 | .PHONY: clean 13
| clean: 14 | rm -rf *.o *.so 15 | 16 | -------------------------------------------------------------------------------- /src/worker/llvm/llvm.c: -------------------------------------------------------------------------------- 1 | #include <llvm-c/Disassembler.h> 2 | #include <llvm-c/Target.h> 3 | 4 | #include "../worker.h" 5 | 6 | static LLVMDisasmContextRef dis; 7 | 8 | char *worker_name = "llvm"; 9 | 10 | void worker_ctor() { 11 | LLVMInitializeX86TargetInfo(); 12 | LLVMInitializeX86Target(); 13 | LLVMInitializeX86TargetMC(); 14 | LLVMInitializeX86Disassembler(); 15 | dis = LLVMCreateDisasm("x86_64-linux-gnu", NULL, 0, NULL, NULL); 16 | if (!dis) { 17 | errx(1, "LLVMCreateDisasm"); 18 | } 19 | // Hex immediates and Intel syntax 20 | // The first option doesn't seem to have an effect, though. 21 | if (!LLVMSetDisasmOptions(dis, LLVMDisassembler_Option_PrintImmHex | 22 | LLVMDisassembler_Option_AsmPrinterVariant)) 23 | errx(1, "LLVMSetDisasmOptions"); 24 | } 25 | 26 | void worker_dtor() { 27 | LLVMDisasmDispose(dis); 28 | } 29 | 30 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 31 | size_t len = LLVMDisasmInstruction(dis, raw_insn, length, 0, result->result, MISHEGOS_DEC_MAXLEN); 32 | if (len > 0) { 33 | result->status = S_SUCCESS; 34 | result->len = strlen(result->result); 35 | result->ndecoded = len; 36 | } else { 37 | result->status = S_FAILURE; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/worker/worker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mish_common.h" 4 | 5 | /* frequently needed by workers. */ 6 | #include <err.h> 7 | #include <string.h> 8 | 9 | /* This is fine for now. */ 10 | typedef output_slot decode_result; 11 | 12 | typedef void (*try_decode_t)(decode_result *result, uint8_t *raw_insn, uint8_t length); 13 | -------------------------------------------------------------------------------- /src/worker/xed/Makefile: -------------------------------------------------------------------------------- 1 | # NOTE(ww): I don't fully understand why I need the RPATH here 2 | # but not in the capstone build.
3 | override CFLAGS := $(CFLAGS) -Wl,-rpath,$(shell pwd)/xed/kits/xed-mishegos/lib 4 | override CPPFLAGS := $(CPPFLAGS) -Ixed/kits/xed-mishegos/include 5 | override LDFLAGS := $(LDFLAGS) -Lxed/kits/xed-mishegos/lib 6 | override LDLIBS := $(LDLIBS) -lxed 7 | 8 | .PHONY: all 9 | all: xed.so 10 | 11 | xed/kits/xed-mishegos/libxed.so: 12 | cd xed && \ 13 | python3 ./mfile.py install --shared --install-dir=kits/xed-mishegos -j 4 14 | 15 | xed.so: xed.o 16 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) xed.o $(LDLIBS) -o $@ 17 | 18 | xed.o: xed/kits/xed-mishegos/libxed.so xed.c 19 | 20 | .PHONY: clean 21 | clean: 22 | cd xed && python3 ./mfile.py clean && rm -rf kits 23 | rm -rf *.o *.so 24 | -------------------------------------------------------------------------------- /src/worker/xed/xed.c: -------------------------------------------------------------------------------- 1 | #include <xed/xed-interface.h> 2 | 3 | #include "../worker.h" 4 | 5 | char *worker_name = "xed"; 6 | 7 | void worker_ctor() { 8 | xed_tables_init(); 9 | } 10 | 11 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 12 | xed_decoded_inst_t xedd; 13 | xed_decoded_inst_zero(&xedd); 14 | xed_decoded_inst_set_mode(&xedd, XED_MACHINE_MODE_LONG_64, XED_ADDRESS_WIDTH_64b); 15 | xed_decoded_inst_set_input_chip(&xedd, XED_CHIP_ALL); 16 | 17 | xed_error_enum_t xed_error = xed_decode(&xedd, raw_insn, length); 18 | if (xed_error != XED_ERROR_NONE) { 19 | DLOG("xed_decode failed: %s", xed_error_enum_t2str(xed_error)); 20 | 21 | /* Special-case XED_ERROR_BUFFER_TOO_SHORT, since it's something 22 | * we have a status for beyond generic failure. 23 | */ 24 | if (xed_error == XED_ERROR_BUFFER_TOO_SHORT) { 25 | result->status = S_PARTIAL; 26 | } else { 27 | result->status = S_FAILURE; 28 | } 29 | return; 30 | } 31 | 32 | /* TODO(ww): Figure out whether xed_format_context decodes up to MISHEGOS_DEC_MAXLEN, 33 | * or saves space for the NULL terminator. It probably doesn't matter in either case 34 | * since nothing will be nearly that long, but it'd be good to know. 35 | */ 36 | if (!xed_format_context(XED_SYNTAX_INTEL, &xedd, result->result, MISHEGOS_DEC_MAXLEN, 0, 0, 0)) { 37 | DLOG("xed_format_context failed!"); 38 | /* TODO(ww): Maybe distinguish this formatting failure from the decoding 39 | * failure above.
40 | */ 41 | result->status = S_FAILURE; 42 | return; 43 | } 44 | 45 | result->status = S_SUCCESS; 46 | result->len = strlen(result->result); 47 | result->ndecoded = xed_decoded_inst_get_length(&xedd); 48 | } 49 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | libyaxpeax_x86_mishegos.so 3 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "yaxpeax-x86-mishegos" 3 | version = "0.1.0" 4 | authors = ["iximeow <me@iximeow.net>"] 5 | edition = "2018" 6 | 7 | [lib] 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies] 11 | yaxpeax-x86 = { version = "1.0.0" } 12 | yaxpeax-arch = { version = "0.2.0" } 13 | 14 | [build-dependencies] 15 | bindgen = "*" 16 | 17 | [profile.release] 18 | lto = true 19 | opt-level = 3 20 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: libyaxpeax_x86_mishegos.so 3 | 4 | libyaxpeax_x86_mishegos.so: target/release/libyaxpeax_x86_mishegos.$(SO_SUFFIX) 5 | cp target/release/libyaxpeax_x86_mishegos.$(SO_SUFFIX) $@ 6 | 7 | target/release/libyaxpeax_x86_mishegos.$(SO_SUFFIX): src/lib.rs Cargo.toml 8 | cargo build --release 9 | 10 | .PHONY: clean 11 | clean: 12 | cargo clean 13 | rm -f *.so 14 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | fn main() { 5 | println!("cargo:rerun-if-changed=build.rs"); 6 | println!("cargo:rerun-if-changed=wrapper.h"); 7 | 8 | let clang_args = env::var("RUST_BINDGEN_CLANG_ARGS").unwrap_or_else(|_| "-I../../include".to_string()); 9 | 10 | let bindings = PathBuf::from(env::var("OUT_DIR").unwrap()).join("mishegos.rs"); 11 | bindgen::Builder::default() 12 | .header("../worker.h") 13 | .clang_args(clang_args.split_ascii_whitespace()) 14 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 15 | .generate() 16 | .expect("failed to generate bindings") 17 | .write_to_file(bindings) 18 | .expect("failed to write bindings"); 19 | } 20 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::os::raw::c_char; 2 | use yaxpeax_x86::long_mode as amd64; 3 | use yaxpeax_arch::{AddressBase, Decoder, LengthedInstruction}; 4 | 5 | // shhh no warnings please 6 | #[allow(warnings)] 7 | mod mishegos { 8 | include!(concat!(env!("OUT_DIR"), "/mishegos.rs")); 9 | } 10 | 11 | use crate::mishegos::{decode_result, MISHEGOS_DEC_MAXLEN, decode_status_S_SUCCESS, decode_status_S_FAILURE, decode_status_S_PARTIAL}; 12 | 13 | #[no_mangle] 14 | pub static mut worker_name: *const c_char = b"yaxpeax-x86-mishegos\x00".as_ptr() as *const i8; 15 | 16 | static mut INSTR: Option<amd64::Instruction> = None; 17 | #[no_mangle] 18 | pub extern "C" fn try_decode(decode_result: *mut decode_result, bytes: *const u8, length: u8) { 19 | unsafe { 20 | if INSTR.is_none() { 21 | INSTR = Some(amd64::Instruction::default()); 22 | } 23 | } 24 | let decode_result = unsafe {
decode_result.as_mut().expect("decode_result is not null") }; 25 | let data = unsafe { 26 | std::slice::from_raw_parts(bytes.as_ref().expect("bytes is not null"), length as usize) 27 | }; 28 | let decoder = amd64::InstDecoder::default(); 29 | let mut reader = yaxpeax_arch::U8Reader::new(data); 30 | 31 | match decoder.decode_into(unsafe { INSTR.as_mut().unwrap() }, &mut reader) { 32 | Err(amd64::DecodeError::ExhaustedInput) => { 33 | decode_result.status = decode_status_S_PARTIAL; 34 | } 35 | Err(_error) => { 36 | decode_result.status = decode_status_S_FAILURE; 37 | } 38 | Ok(()) => { 39 | let instr = unsafe { INSTR.as_ref().unwrap() }; 40 | decode_result.ndecoded = 0u64.wrapping_offset(instr.len()) as u16; 41 | let text = instr.to_string(); 42 | assert!(text.len() < MISHEGOS_DEC_MAXLEN as usize); 43 | for (i, x) in text.as_bytes().iter().enumerate() { 44 | decode_result.result[i] = *x as i8; 45 | } 46 | decode_result.len = text.len() as u16; 47 | decode_result.status = decode_status_S_SUCCESS; 48 | } 49 | }; 50 | } 51 | -------------------------------------------------------------------------------- /src/worker/zydis/Makefile: -------------------------------------------------------------------------------- 1 | # Include stupidity. 2 | override CPPFLAGS := $(CPPFLAGS) \ 3 | -DZYDIS_STATIC_BUILD \ 4 | -Izydis/include \ 5 | -Izydis/dependencies/zycore/include \ 6 | -Izydis/build \ 7 | -Izydis/build/zycore 8 | 9 | .PHONY: all 10 | all: zydis.so 11 | 12 | zydis/build/libZydis.a: 13 | cd zydis && \ 14 | mkdir build && \ 15 | cd build && \ 16 | cmake -DZYDIS_BUILD_TOOLS=OFF -DZYDIS_BUILD_EXAMPLES=OFF -DZYDIS_STATIC_DEFINE=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo .. && \ 17 | cmake --build . -- -j4 18 | 19 | zydis.so: zydis.o 20 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) \ 21 | -Wl,--whole-archive zydis/build/libZydis.a -Wl,--no-whole-archive \ 22 | zydis.o $(LDLIBS) -o $@ 23 | 24 | zydis.o: zydis/build/libZydis.a zydis.c 25 | 26 | .PHONY: clean 27 | clean: 28 | rm -rf *.o *.so 29 | rm -rf zydis/build 30 | -------------------------------------------------------------------------------- /src/worker/zydis/zydis.c: -------------------------------------------------------------------------------- 1 | /* Dumbness. */ 2 | #define ZYDIS_STATIC_BUILD 3 | #include <Zydis/Zydis.h> 4 | 5 | #include "../worker.h" 6 | 7 | char *worker_name = "zydis"; 8 | 9 | static ZydisDecoder zdecoder; 10 | static ZydisFormatter zformatter; 11 | 12 | /* I couldn't find this defined anywhere in zycore/zydis.
13 | */ 14 | static const char *ZyanStatus_strerror(ZyanStatus zstatus) { 15 | switch (zstatus) { 16 | case ZYDIS_STATUS_NO_MORE_DATA: { 17 | return "no more data"; 18 | } 19 | case ZYDIS_STATUS_DECODING_ERROR: { 20 | return "general decoding error"; 21 | } 22 | case ZYDIS_STATUS_INSTRUCTION_TOO_LONG: { 23 | return "instruction too long"; 24 | } 25 | case ZYDIS_STATUS_BAD_REGISTER: { 26 | return "invalid register"; 27 | } 28 | case ZYDIS_STATUS_ILLEGAL_LOCK: { 29 | return "illegal lock prefix"; 30 | } 31 | case ZYDIS_STATUS_ILLEGAL_LEGACY_PFX: { 32 | return "illegal legacy prefix"; 33 | } 34 | case ZYDIS_STATUS_ILLEGAL_REX: { 35 | return "illegal REX prefix"; 36 | } 37 | case ZYDIS_STATUS_INVALID_MAP: { 38 | return "illegal opcode map value"; 39 | } 40 | case ZYDIS_STATUS_MALFORMED_EVEX: { 41 | return "illegal EVEX prefix"; 42 | } 43 | case ZYDIS_STATUS_MALFORMED_MVEX: { 44 | return "illegal MVEX prefix"; 45 | } 46 | case ZYDIS_STATUS_INVALID_MASK: { 47 | return "invalid write mask"; 48 | } 49 | default: { 50 | return "unknown"; 51 | } 52 | } 53 | } 54 | 55 | void worker_ctor() { 56 | ZydisDecoderInit(&zdecoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); 57 | ZydisFormatterInit(&zformatter, ZYDIS_FORMATTER_STYLE_INTEL); 58 | 59 | /* TODO(ww): Zydis has a bunch of formatter options; we probably 60 | * want to set some of them to make its output easier to normalize. 61 | */ 62 | } 63 | 64 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 65 | _unused(ZyanStatus_strerror); 66 | 67 | ZydisDecodedInstruction insn; 68 | ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE]; 69 | ZyanStatus zstatus = 70 | ZydisDecoderDecodeFull(&zdecoder, raw_insn, length, &insn, operands, 71 | ZYDIS_MAX_OPERAND_COUNT_VISIBLE, ZYDIS_DFLAG_VISIBLE_OPERANDS_ONLY); 72 | if (!ZYAN_SUCCESS(zstatus)) { 73 | DLOG("zydis decoding failed: %s", ZyanStatus_strerror(zstatus)); 74 | 75 | if (zstatus == ZYDIS_STATUS_NO_MORE_DATA) { 76 | result->status = S_PARTIAL; 77 | } else { 78 | result->status = S_FAILURE; 79 | } 80 | return; 81 | } 82 | 83 | zstatus = 84 | ZydisFormatterFormatInstruction(&zformatter, &insn, operands, insn.operand_count_visible, 85 | result->result, MISHEGOS_DEC_MAXLEN, 0); 86 | if (!ZYAN_SUCCESS(zstatus)) { 87 | DLOG("zydis formatting failed: %s", ZyanStatus_strerror(zstatus)); 88 | result->status = S_FAILURE; 89 | return; 90 | } 91 | 92 | result->status = S_SUCCESS; 93 | result->len = strlen(result->result); 94 | result->ndecoded = insn.length; 95 | } 96 | -------------------------------------------------------------------------------- /workers.spec: -------------------------------------------------------------------------------- 1 | ./src/worker/bfd/bfd.so 2 | ./src/worker/capstone/capstone.so 3 | ./src/worker/dynamorio/dynamorio.so 4 | ./src/worker/fadec/fadec.so 5 | ./src/worker/xed/xed.so 6 | ./src/worker/zydis/zydis.so 7 | ./src/worker/bddisasm/bddisasm.so 8 | ./src/worker/iced/iced.so 9 | ./src/worker/yaxpeax-x86/libyaxpeax_x86_mishegos.so 10 | ./src/worker/ghidra/ghidra.so 11 | ./src/worker/llvm/llvm.so 12 | --------------------------------------------------------------------------------
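A note on the worker interface that ties the listings above together: each entry in workers.spec is a shared object exporting a worker_name string and a try_decode function matching the try_decode_t typedef in src/worker/worker.h, with optional worker_ctor/worker_dtor hooks for decoder setup and teardown (llvm.c defines both, xed.c only a constructor, and the iced worker neither). The sketch below is a minimal, hypothetical worker illustrating that contract against the fields the real workers fill in (status, result, len, ndecoded); the "nop" name, the file path, and the use of snprintf are invented for illustration, and an actual worker would also need its own Makefile producing a .so plus an entry in workers.spec.

/* src/worker/nop/nop.c (hypothetical example, not part of the tree above) */
#include <stdio.h>

#include "../worker.h"

char *worker_name = "nop";

/* Optional: one-time decoder setup, cf. worker_ctor() in llvm.c and zydis.c. */
void worker_ctor() {}

/* Optional: teardown, cf. worker_dtor() in llvm.c. */
void worker_dtor() {}

void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) {
  if (length == 0) {
    /* Ran out of input before a full instruction, analogous to the
     * XED_ERROR_BUFFER_TOO_SHORT and ZYDIS_STATUS_NO_MORE_DATA cases above.
     */
    result->status = S_PARTIAL;
    return;
  }

  if (raw_insn[0] == 0x90) {
    /* "Decode" the one-byte NOP, filling the same fields every worker
     * above fills on success: the disassembly text, its length, and the
     * number of input bytes consumed.
     */
    result->len = snprintf(result->result, MISHEGOS_DEC_MAXLEN, "nop");
    result->ndecoded = 1;
    result->status = S_SUCCESS;
    return;
  }

  /* Everything that isn't a one-byte NOP is rejected. */
  result->status = S_FAILURE;
}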