├── .clang-format ├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── CODEOWNERS ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── docs ├── adding_a_worker.md └── output_cohorts_format.md ├── src ├── analysis │ ├── analysis │ ├── pass │ │ ├── dedupe │ │ │ ├── dedupe │ │ │ └── spec.yml │ │ ├── filter-all-failure │ │ │ ├── filter-all-failure │ │ │ └── spec.yml │ │ ├── filter-all-success │ │ │ ├── filter-all-success │ │ │ └── spec.yml │ │ ├── filter-any-failure │ │ │ ├── filter-any-failure │ │ │ └── spec.yml │ │ ├── filter-bddisasm-salc │ │ │ ├── filter-bddisasm-salc │ │ │ └── spec.yml │ │ ├── filter-destroy-bddisasm │ │ │ ├── filter-destroy-bddisasm │ │ │ └── spec.yml │ │ ├── filter-destroy-capstone │ │ │ ├── filter-destroy-capstone │ │ │ └── spec.yml │ │ ├── filter-destroy-ghidra │ │ │ ├── filter-destroy-ghidra │ │ │ └── spec.yml │ │ ├── filter-ghidra-lock │ │ │ ├── filter-ghidra-lock │ │ │ └── spec.yml │ │ ├── filter-incomparable │ │ │ ├── filter-incomparable │ │ │ └── spec.yml │ │ ├── filter-ndecoded-different │ │ │ ├── filter-ndecoded-different │ │ │ └── spec.yml │ │ ├── filter-ndecoded-same │ │ │ ├── filter-ndecoded-same │ │ │ └── spec.yml │ │ ├── filter-xed-find-overaccept │ │ │ ├── filter-xed-find-overaccept │ │ │ └── spec.yml │ │ ├── filter-xed-find-underaccept │ │ │ ├── filter-xed-find-underaccept │ │ │ └── spec.yml │ │ ├── find-size-discrepancies │ │ │ ├── find-size-discrepancies │ │ │ └── spec.yml │ │ ├── minimize-input │ │ │ ├── minimize-input │ │ │ └── spec.yml │ │ └── normalize │ │ │ ├── normalize │ │ │ └── spec.yml │ └── passes.yml ├── include │ └── mish_common.h ├── mish2jsonl │ ├── Makefile │ └── mish2jsonl.c ├── mishegos │ ├── Makefile │ ├── mishegos.c │ ├── mutator.c │ └── mutator.h ├── mishmat │ └── mishmat └── worker │ ├── Makefile │ ├── bddisasm │ ├── Makefile │ └── bddisasm.c │ ├── bfd │ ├── Makefile │ └── bfd.c │ ├── capstone │ ├── Makefile │ └── capstone.c │ ├── dynamorio │ ├── .gitignore │ ├── Makefile │ └── dynamorio.c │ ├── fadec │ ├── Makefile │ └── fadec.c │ ├── ghidra │ ├── .gitignore │ ├── CMakeLists.txt │ ├── Makefile │ ├── ghidra.cc │ ├── sleighMishegos.cc │ └── sleighMishegos.hh │ ├── iced │ ├── Cargo.toml │ ├── Makefile │ ├── build.rs │ ├── src │ │ ├── lib.rs │ │ └── mishegos.rs │ └── wrapper.h │ ├── llvm │ ├── Makefile │ └── llvm.c │ ├── worker.h │ ├── xed │ ├── Makefile │ └── xed.c │ ├── yaxpeax-x86 │ ├── .gitignore │ ├── Cargo.toml │ ├── Makefile │ ├── build.rs │ └── src │ │ └── lib.rs │ └── zydis │ ├── Makefile │ └── zydis.c └── workers.spec /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Right 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: None 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: false 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 
26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializersBeforeComma: false 44 | BreakConstructorInitializers: BeforeColon 45 | BreakAfterJavaFieldAnnotations: false 46 | BreakStringLiterals: true 47 | ColumnLimit: 100 48 | CommentPragmas: '^ IWYU pragma:' 49 | CompactNamespaces: false 50 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 51 | ConstructorInitializerIndentWidth: 4 52 | ContinuationIndentWidth: 4 53 | Cpp11BracedListStyle: true 54 | DerivePointerAlignment: false 55 | DisableFormat: false 56 | ExperimentalAutoDetectBinPacking: false 57 | FixNamespaceComments: true 58 | ForEachMacros: 59 | - foreach 60 | - Q_FOREACH 61 | - BOOST_FOREACH 62 | IncludeBlocks: Preserve 63 | IncludeCategories: 64 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 65 | Priority: 2 66 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 67 | Priority: 3 68 | - Regex: '.*' 69 | Priority: 1 70 | IncludeIsMainRegex: '(Test)?$' 71 | IndentCaseLabels: false 72 | IndentPPDirectives: None 73 | IndentWidth: 2 74 | IndentWrappedFunctionNames: false 75 | JavaScriptQuotes: Leave 76 | JavaScriptWrapImports: true 77 | KeepEmptyLinesAtTheStartOfBlocks: true 78 | MacroBlockBegin: '' 79 | MacroBlockEnd: '' 80 | MaxEmptyLinesToKeep: 1 81 | NamespaceIndentation: None 82 | ObjCBlockIndentWidth: 2 83 | ObjCSpaceAfterProperty: false 84 | ObjCSpaceBeforeProtocolList: true 85 | PenaltyBreakAssignment: 2 86 | PenaltyBreakBeforeFirstCallParameter: 19 87 | PenaltyBreakComment: 300 88 | PenaltyBreakFirstLessLess: 120 89 | PenaltyBreakString: 1000 90 | PenaltyExcessCharacter: 1000000 91 | PenaltyReturnTypeOnItsOwnLine: 60 92 | PointerAlignment: Right 93 | ReflowComments: true 94 | SortIncludes: false 95 | SpaceAfterCStyleCast: false 96 | SpaceBeforeAssignmentOperators: true 97 | SpaceBeforeParens: ControlStatements 98 | SpaceInEmptyParentheses: false 99 | SpacesBeforeTrailingComments: 1 100 | SpacesInAngles: false 101 | SpacesInContainerLiterals: true 102 | SpacesInCStyleCastParentheses: false 103 | SpacesInParentheses: false 104 | SpacesInSquareBrackets: false 105 | Standard: Cpp11 106 | TabWidth: 2 107 | UseTab: Never 108 | ... 
109 | 110 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | trim_trailing_whitespace = true 7 | 8 | [*.{c,h}] 9 | indent_style = space 10 | indent_size = 2 11 | 12 | [Makefile] 13 | indent_style = tab 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: gitsubmodule 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | open-pull-requests-limit: 10 8 | - package-ecosystem: cargo 9 | directory: "/src/worker/yaxpeax-x86" 10 | schedule: 11 | interval: weekly 12 | open-pull-requests-limit: 10 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | schedule: 9 | # run CI every day even if no PRs/merges occur 10 | - cron: '0 12 * * *' 11 | 12 | jobs: 13 | lint: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Install dependencies 18 | run: sudo apt install -y cppcheck clang-format-12 19 | - name: Lint 20 | run: | 21 | make fmt && git diff --exit-code 22 | make lint 23 | 24 | docker-build: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Checkout submodules 30 | run: | 31 | auth_header="$(git config --local --get http.https://github.com/.extraheader)" 32 | git submodule sync --recursive 33 | git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive 34 | 35 | - name: Docker build 36 | run: docker build -t mishegos . 
37 | 38 | - name: Docker Smoketest 39 | run: | 40 | docker run --rm mishegos bash -eo pipefail -c './src/mishegos/mishegos -m manual ./workers.spec <<< "90" | ./src/mish2jsonl/mish2jsonl' 41 | 42 | - name: Docker Test Fuzz 43 | run: | 44 | docker run --rm mishegos bash -eo pipefail -c \ 45 | '(timeout --preserve-status 5s ./src/mishegos/mishegos -s 0 ./workers.spec || true) | ./src/mish2jsonl/mish2jsonl | tail' 46 | 47 | build: 48 | runs-on: ubuntu-latest 49 | steps: 50 | - uses: actions/checkout@v4 51 | 52 | - name: Checkout submodules 53 | run: | 54 | auth_header="$(git config --local --get http.https://github.com/.extraheader)" 55 | git submodule sync --recursive 56 | git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive 57 | 58 | - name: Install dependencies 59 | run: | 60 | sudo apt-get update 61 | 62 | sudo apt-get install -y \ 63 | build-essential \ 64 | binutils-dev \ 65 | python2 \ 66 | python3 \ 67 | cmake \ 68 | meson \ 69 | ruby \ 70 | autotools-dev \ 71 | autoconf \ 72 | llvm-dev \ 73 | libtool 74 | 75 | sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 1 76 | 77 | - name: Build 78 | run: make -j$(nproc) 79 | 80 | - name: Smoketest 81 | # Disassemble NOP 82 | run: | 83 | set -eo pipefail 84 | ./src/mishegos/mishegos -m manual ./workers.spec <<< "90" | ./src/mish2jsonl/mish2jsonl 85 | 86 | - name: Test Fuzz 87 | run: | 88 | set -eo pipefail 89 | (timeout --preserve-status 5s ./src/mishegos/mishegos -s 0 ./workers.spec || true) | ./src/mish2jsonl/mish2jsonl | tail 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | src/mishegos/mishegos 4 | src/worker/worker 5 | src/mish2jsonl/mish2jsonl 6 | Cargo.lock 7 | target/ 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/worker/capstone/capstone"] 2 | path = src/worker/capstone/capstone 3 | url = https://github.com/aquynh/capstone.git 4 | [submodule "src/worker/xed/xed"] 5 | path = src/worker/xed/xed 6 | url = https://github.com/intelxed/xed.git 7 | [submodule "src/worker/xed/mbuild"] 8 | path = src/worker/xed/mbuild 9 | url = https://github.com/intelxed/mbuild.git 10 | [submodule "src/worker/zydis/zydis"] 11 | path = src/worker/zydis/zydis 12 | url = https://github.com/zyantific/zydis.git 13 | [submodule "src/worker/dynamorio/dynamorio"] 14 | path = src/worker/dynamorio/dynamorio 15 | url = https://github.com/DynamoRIO/dynamorio.git 16 | [submodule "src/worker/fadec/fadec"] 17 | path = src/worker/fadec/fadec 18 | url = https://github.com/aengelke/fadec.git 19 | [submodule "src/worker/bddisasm/bddisasm"] 20 | path = src/worker/bddisasm/bddisasm 21 | url = https://github.com/bitdefender/bddisasm.git 22 | branch = master 23 | [submodule "src/worker/iced/iced"] 24 | path = src/worker/iced/iced 25 | url = https://github.com/0xd4d/iced.git 26 | [submodule "src/worker/ghidra/ghidra"] 27 | path = src/worker/ghidra/ghidra 28 | url = https://github.com/NationalSecurityAgency/ghidra.git 29 | [submodule "src/worker/ghidra/sleigh-cmake"] 30 | path = src/worker/ghidra/sleigh-cmake 31 | url = https://github.com/lifting-bits/sleigh.git 32 | -------------------------------------------------------------------------------- /CODEOWNERS: 
-------------------------------------------------------------------------------- 1 | * @woodruffw @ekilmer 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN export DEBIAN_FRONTEND="noninteractive" && \ 4 | apt-get update && \ 5 | apt-get install -y \ 6 | gpg wget && \ 7 | wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ 8 | echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ 9 | apt-get update && \ 10 | apt-get install -y \ 11 | build-essential \ 12 | binutils-dev \ 13 | python \ 14 | python3 \ 15 | cmake \ 16 | meson \ 17 | ruby \ 18 | autotools-dev \ 19 | autoconf \ 20 | libtool \ 21 | git \ 22 | curl \ 23 | llvm-dev \ 24 | libclang-dev \ 25 | clang 26 | 27 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 28 | ENV PATH="/root/.cargo/bin:${PATH}" 29 | 30 | WORKDIR /app/mishegos 31 | COPY ./ . 32 | 33 | ARG TARGET=all 34 | RUN make "${TARGET}" -j $(nproc) 35 | 36 | CMD ["/bin/bash"] 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | UNAME := $(shell uname) 2 | 3 | CFLAGS := \ 4 | -std=gnu11 -Wall -pthread -O2 \ 5 | -I$(shell pwd)/src/include 6 | LDLIBS := -ldl -lpthread 7 | CPPFLAGS := 8 | CXXFLAGS := \ 9 | -std=c++11 -Wall -pthread -O2 \ 10 | -I$(shell pwd)/src/include 11 | # TODO(ww): https://github.com/rust-lang/rust-bindgen/issues/1651 12 | # RUSTFLAGS := -D warnings 13 | RUST_BINDGEN_CLANG_ARGS := \ 14 | -I$(shell pwd)/src/include 15 | 16 | ifeq ($(UNAME), Darwin) 17 | SO_SUFFIX := dylib 18 | else 19 | SO_SUFFIX := so 20 | # Linux needs -lrt for the POSIX shm(3) family calls. 21 | LDLIBS := $(LDLIBS) -lrt 22 | endif 23 | 24 | export UNAME 25 | export CFLAGS 26 | export LDLIBS 27 | export CPPFLAGS 28 | export CXXFLAGS 29 | export RUST_BINDGEN_CLANG_ARGS 30 | export SO_SUFFIX 31 | 32 | 33 | ALL_SRCS := $(shell \ 34 | find . -type f \ 35 | \( \ 36 | -path '*/capstone/capstone/*' -o \ 37 | -path '*/vendor/*' -o \ 38 | -path '*/dynamorio/dynamorio/*' -o \ 39 | -path '*/dynamorio/obj/*' -o \ 40 | -path '*/fadec/fadec/*' -o \ 41 | -path '*/udis86/udis86/*' -o \ 42 | -path '*/xed/xed/*' -o \ 43 | -path '*/xed/mbuild/*' -o \ 44 | -path '*/zydis/zydis/*' -o \ 45 | -path '*/bddisasm/bddisasm/*' -o \ 46 | -path '*/ghidra/sleighMishegos*' -o \ 47 | -path '*/ghidra/ghidra/*' -o \ 48 | -path '*/ghidra/build/*' -o \ 49 | -path '*/ghidra/sleigh-cmake/*' \ 50 | \) \ 51 | -prune \ 52 | -o \( \ 53 | -name 'sleighMishegos*' -o \ 54 | -name '*.c' -o \ 55 | -name '*.cc' -o \ 56 | -name '*.h' -o \ 57 | -name '*.hh' \ 58 | \) \ 59 | -print \ 60 | ) 61 | 62 | .PHONY: all 63 | all: mishegos worker mish2jsonl 64 | 65 | .PHONY: debug 66 | debug: CPPFLAGS += -DDEBUG 67 | debug: CFLAGS += -g 68 | debug: all 69 | 70 | .PHONY: mishegos 71 | mishegos: 72 | $(MAKE) -C src/mishegos 73 | 74 | .PHONY: worker 75 | worker: 76 | $(MAKE) -C src/worker $(WORKERS) 77 | 78 | .PHONY: mish2jsonl 79 | mish2jsonl: 80 | $(MAKE) -C src/mish2jsonl 81 | 82 | .PHONY: fmt 83 | fmt: 84 | clang-format -i -style=file $(ALL_SRCS) 85 | 86 | .PHONY: lint 87 | lint: 88 | cppcheck --error-exitcode=1 $(ALL_SRCS) 89 | 90 | .PHONY: edit 91 | edit: 92 | $(EDITOR) $(ALL_SRCS) 93 | 94 | .PHONY: clean 95 | clean: 96 | $(MAKE) -C src/worker clean 97 | $(MAKE) -C src/mishegos clean 98 | $(MAKE) -C src/mish2jsonl clean 99 | 100 | .PHONY: update-submodules 101 | update-submodules: 102 | git submodule foreach git pull origin master 103 | 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mishegos 2 | ======== 3 | 4 | [![CI](https://github.com/trailofbits/mishegos/actions/workflows/ci.yml/badge.svg)](https://github.com/trailofbits/mishegos/actions/workflows/ci.yml) 5 | 6 | A differential fuzzer for x86 decoders. 
7 | 
8 | ![mishegos](https://user-images.githubusercontent.com/3059210/59005797-da89b400-87ec-11e9-8274-321edfa6df45.png)
9 | 
10 | Read more about `mishegos` in its accompanying [blog post](https://blog.trailofbits.com/2019/10/31/destroying-x86_64-instruction-decoders-with-differential-fuzzing/)
11 | and academic publication ([paper](https://github.com/gangtan/LangSec-papers-and-slides/raw/main/langsec21/papers/Woodruff_LangSec21.pdf)
12 | | [recording](https://www.youtube.com/watch?v=a2q86KTZt0g)
13 | | [slides](https://github.com/trailofbits/publications/blob/master/presentations/Differential%20analysis%20of%20x86-64%20decoders/langsec-2021-slides.pdf)).
14 | 
15 | ```bibtex
16 | @InProceedings{woodruff21differential,
17 | author = "William Woodruff and Niki Carroll and Sebastiaan Peters",
18 | title = "Differential analysis of x86-64 instruction decoders",
19 | booktitle = "Proceedings of the Seventh Language-Theoretic Security Workshop~({LangSec}) at the {IEEE} Symposium on Security and Privacy",
20 | year = "2021",
21 | month = "May"
22 | }
23 | ```
24 | 
25 | ## Usage
26 | 
27 | Start with a clone, including submodules:
28 | 
29 | ```bash
30 | git clone --recurse-submodules https://github.com/trailofbits/mishegos
31 | ```
32 | 
33 | ### Building
34 | 
35 | `mishegos` is most easily built within Docker:
36 | 
37 | ```bash
38 | docker build -t mishegos .
39 | ```
40 | 
41 | Alternatively, you can try building it directly.
42 | 
43 | Make sure you have `binutils-dev` (or however your system provides `libopcodes`) installed:
44 | 
45 | ```bash
46 | make
47 | # or
48 | make debug
49 | ```
50 | 
51 | Build specific workers by passing a space-delimited list as the `WORKERS` variable:
52 | 
53 | ```bash
54 | WORKERS="bfd capstone" make worker
55 | ```
56 | 
57 | ### Running
58 | 
59 | Run the fuzzer for a bit:
60 | 
61 | ```bash
62 | ./src/mishegos/mishegos ./workers.spec > /tmp/mishegos
63 | ```
64 | 
65 | `mishegos` checks for the following environment variables:
66 | 
67 | * `V=1` enables verbose output on `stderr`
68 | * `D=1` enables the "dummy" mutation mode for debugging purposes
69 | * `M=1` enables the "manual" mutation mode (i.e., read from `stdin`)
70 | * `MODE=mode` can be used to configure the mutation mode in the absence of `D` and `M`
71 | * Valid mutation modes are `sliding` (default), `havoc`, and `structured`
72 | 
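For example, a sketch of a short, verbose fuzzing run using the `havoc` mutation mode (the output path matches the other examples in this README; `timeout` is optional):

```bash
V=1 MODE=havoc timeout 60s ./src/mishegos/mishegos ./workers.spec > /tmp/mishegos
```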
73 | Convert mishegos's raw output into JSONL suitable for analysis:
74 | 
75 | ```bash
76 | ./src/mish2jsonl/mish2jsonl /tmp/mishegos > /tmp/mishegos.jsonl
77 | ```
78 | 
79 | `mish2jsonl` checks for `V=1` to enable verbose output on `stderr`.
80 | 
81 | Run an analysis/filter pass group on the results:
82 | 
83 | ```bash
84 | ./src/analysis/analysis -p same-size-different-decodings < /tmp/mishegos.jsonl > /tmp/mishegos.interesting
85 | ```
86 | 
87 | Generate an ~ugly~ pretty visualization of the filtered results:
88 | 
89 | ```bash
90 | ./src/mishmat/mishmat < /tmp/mishegos.interesting > /tmp/mishegos.html
91 | open /tmp/mishegos.html
92 | ```
93 | 
94 | Tip: The HTML file that `mishmat` generates can be hundreds of megabytes in size, which will likely result in a bad browser viewing experience. Using the [`split`](https://man7.org/linux/man-pages/man1/split.1.html) tool, you can create multiple smaller HTML files with a specified number of entries per file (10,000 in the following example) and load each of them separately:
95 | 
96 | ```bash
97 | mkdir /tmp/mishegos-html
98 | split -d --lines=10000 - /tmp/mishegos-html/mishegos_ \
99 | --additional-suffix='.html' --filter='./src/mishmat/mishmat > $FILE' \
100 | < /tmp/mishegos.interesting
101 | ```
102 | 
103 | ### Contributing
104 | 
105 | We welcome contributors to mishegos!
106 | 
107 | A guide for adding new disassembler workers can be found [here](./docs/adding_a_worker.md).
108 | 
109 | ### Performance notes
110 | 
111 | All numbers below correspond to the following run:
112 | 
113 | ```bash
114 | V=1 timeout 60s ./src/mishegos/mishegos ./workers.spec > /tmp/mishegos
115 | ```
116 | 
117 | Outside Docker:
118 | 
119 | * On a Linux desktop (Ubuntu 20.04, Ryzen 5 3600, 32GB DDR4):
120 | * Commit [`d80063a`](https://github.com/trailofbits/mishegos/commit/d80063a575c4b10d5f787ac88f45d44c8e7f9937)
121 | * 8 workers (no `udis86`) + 1 `mishegos` fuzzer process
122 | * 8.7M outputs/minute
123 | * 9 cores pinned
124 | 
125 | ## TODO
126 | 
127 | * Performance improvements
128 | * Break cohort collection out into a separate process (requires re-addition of semaphores)
129 | * Maybe use a better data structure for input/output/cohort slots
130 | * Add a scaling factor for workers, e.g. spawn `N` of each worker
131 | * Pre-analysis normalization (whitespace, immediate representation, prefixes)
132 | * Analysis strategies:
133 | * Filter by length, decode status discrepancies
134 | * Easy: lexical comparison
135 | * Easy: reassembly + effects modeling (maybe with microx?)
136 | * Scoring ideas:
137 | * Low value: Flag/prefix discrepancies
138 | * Medium value: Decode success/failure/crash discrepancies
139 | * High value: Decode discrepancies with differing control flow, operands, maybe some immediates
140 | * Visualization ideas:
141 | * Basic but not really basic: some kind of mouse-over differential visualization
142 | 
143 | ## License
144 | 
145 | `mishegos` is licensed and distributed under the [Apache v2.0](LICENSE) license. [Contact us](mailto:opensource@trailofbits.com) if you’re looking for an exception to the terms.
-------------------------------------------------------------------------------- /docs/adding_a_worker.md:
--------------------------------------------------------------------------------
1 | Adding a mishegos worker
2 | ========================
3 | 
4 | Adding a new worker to mishegos is (relatively) straightforward.
5 | 
6 | This page attempts to document the process, but makes no guarantees about
7 | correctness or currency. When in doubt, refer to
8 | a simple worker already in the tree, like
9 | [capstone](https://github.com/trailofbits/mishegos/tree/master/src/worker/capstone).
10 | 
11 | ## Adding the worker
12 | 
13 | A good worker is self-contained within its `./src/worker/WORKERNAME/` directory.
14 | 
15 | That directory should look something like this:
16 | 
17 | ```
18 | ./src/worker/WORKERNAME/:
19 | SOME_SUBMODULE/
20 | Makefile
21 | WORKERNAME.c
22 | ```
23 | 
24 | Each member is discussed below.
25 | 
26 | ### `SOME_SUBMODULE/`
27 | 
28 | If your worker requires a disassembly library that **either** (1) is actively maintained **or**
29 | (2) is unavailable in popular package managers, then it should be submoduled within the worker
30 | directory.
Multiple submodules (or recursive submodules, if necessary) are fine; see the XED worker
31 | for an example.
32 | 
33 | ### `Makefile`
34 | 
35 | Your worker directory should include a single `Makefile` that builds both the target disassembler
36 | and the mishegos worker.
37 | 
38 | Two `make` targets are required:
39 | 
40 | * `all`: Build all dependencies and the worker's shared object
41 | * `clean`: Clean the worker's shared object and, *optionally*, the builds of all dependencies
42 | 
43 | Your `all` target should produce some reasonably named shared object (`WORKERNAME.so` is
44 | currently common in the codebase) in the worker directory. You'll need this shared object's path
45 | later.
46 | 
47 | ### `WORKERNAME.c`
48 | 
49 | `WORKERNAME.c` should implement the mishegos worker ABI, which is the following:
50 | 
51 | ```c
52 | char *worker_name;
53 | void worker_ctor();
54 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length);
55 | void worker_dtor();
56 | ```
57 | 
58 | See the existing workers and header files for type and usage examples.
59 | 
60 | `worker_name` is a static string that *uniquely identifies the worker*. Duplicating `worker_name`
61 | across different kinds of workers will cause very bad things to happen.
62 | 
63 | `worker_ctor` and `worker_dtor` are **optional** and run on worker process startup and termination,
64 | respectively.
65 | 
66 | ## Integrating into the build
67 | 
68 | Once you have a worker in place, you'll have to modify a few files to get mishegos to build
69 | and fuzz with it.
70 | 
71 | ### `./src/worker/Makefile`
72 | 
73 | This `Makefile` contains a `WORKERS` variable. Add `WORKERNAME` (or whatever you named
74 | your worker directory) to it.
75 | 
76 | ### `./Makefile`
77 | 
78 | The top-level `Makefile` contains an `ALL_SRCS` variable. This variable has a `find` expression
79 | in it that excludes submodule sources from automated linting tasks. Add glob(s) matching your
80 | worker's submodule(s) to it.
81 | 
82 | ### `./workers.spec`
83 | 
84 | This is a newline-delimited list of shared objects that `mishegos` (the main fuzzer binary)
85 | takes via an argument. Add the path to your worker shared object to it.
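Putting the pieces above together, here is a minimal sketch of a hypothetical worker. The `decode_result` field and enum names (`status`, `ndecoded`, `len`, `result`, `S_SUCCESS`, `S_FAILURE`) are assumptions based on the output format described in `docs/output_cohorts_format.md`; consult `src/include/mish_common.h` and an in-tree worker for the real definitions, and replace `my_disassemble` (a placeholder) with your library's actual API:

```c
#include <stdint.h>
#include <stdio.h>

#include "mish_common.h" /* decode_result, decode_status */

/* Hypothetical disassembler entrypoint: returns the number of bytes
 * consumed, or <= 0 on failure. Replace with your library's API. */
extern int my_disassemble(const uint8_t *buf, uint8_t len, char *out, size_t out_len);

/* Must uniquely identify this worker. */
char *worker_name = "WORKERNAME";

void worker_ctor(void) {
  /* Optional: initialize any disassembler state here. */
}

void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) {
  char text[128];
  int consumed = my_disassemble(raw_insn, length, text, sizeof(text));

  /* NOTE: the field and enum names below are illustrative assumptions. */
  if (consumed <= 0) {
    result->status = S_FAILURE;
    return;
  }

  result->status = S_SUCCESS;
  result->ndecoded = (uint16_t)consumed;
  result->len = (uint16_t)snprintf(result->result, sizeof(result->result), "%s", text);
}

void worker_dtor(void) {
  /* Optional: tear down any disassembler state here. */
}
```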
-------------------------------------------------------------------------------- /docs/output_cohorts_format.md:
--------------------------------------------------------------------------------
1 | Cohort Output Format
2 | =====================
3 | 
4 | This file briefly describes the format of `mishegos`'s binary output.
5 | 
6 | The binary format is an implementation detail and should only be of interest
7 | when working on mishegos itself; users looking to analyze mishegos's results should run
8 | `mish2jsonl` and operate on the JSONL-formatted results.
9 | 
10 | ## Motivation
11 | 
12 | Earlier versions of mishegos dumped their results directly to JSONL. This required
13 | us to do JSON serialization and internal allocations in the fuzzing lifecycle, incurring
14 | a performance hit.
15 | 
16 | ## Format
17 | 
18 | Mishegos's binary output is a sequence of "cohorts", each of which contains `N` outputs
19 | where `N` is the number of workers.
20 | 
21 | Each cohort begins with a header:
22 | 
23 | * `nworkers` (`u32`): The number of workers present in this output cohort
24 | * `input` (`u64` + `str`): A length-prefixed, pretty-printed hex string of the input handled by
25 | this cohort
26 | 
27 | After the header, each cohort contains `nworkers` output records. Each output contains:
28 | 
29 | * `status` (`u32`): A status code corresponding to the `decode_status` enum
30 | * `ndecoded` (`u16`): The number of bytes of `input` decoded
31 | * `workerno` (`u32`): The worker's identifying index
32 | * `worker_so` (`u64` + `str`): A length-prefixed string containing the path to the worker's dynamic
33 | shared object
34 | * `len` (`u16`): The string length of the decoded instruction, or `0` if none is present
35 | * `result` (`str`): A string of `len` bytes containing the decoded instruction
36 | 
37 | Visualized:
38 | 
39 | ```
40 | |-------------------------|
41 | | cohort 1: nworkers: 3 |
42 | | output 1 |
43 | | output 2 |
44 | | output 3 |
45 | |-------------------------|
46 | | cohort 2: nworkers: 3 |
47 | | output 1 |
48 | | output 2 |
49 | | output 3 |
50 | |-------------------------|
51 | | cohort ... |
52 | | .... |
53 | |_________________________|
54 | ```
55 | 
56 | ## Implementation
57 | 
58 | Mishegos's binary output is transformed into JSONL via a parser specified in
59 | [Kaitai Struct](https://kaitai.io/)'s DSL.
-------------------------------------------------------------------------------- /src/analysis/analysis:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # analysis: collect analysis passes, order them, and pipeline
5 | # mishegos results through them
6 | 
7 | require "yaml"
8 | require "ostruct"
9 | require "pathname"
10 | require "set"
11 | require "open3"
12 | require "optparse"
13 | 
14 | def hai(msg)
15 | warn "[analysis] #{msg}" if VERBOSE
16 | end
17 | 
18 | def load_pass!(dir)
19 | hai "loading pass from #{dir}"
20 | 
21 | spec = dir / "spec.yml"
22 | raise "Pass missing spec: #{spec}" unless spec.file?
23 | 
24 | pass = OpenStruct.new YAML.load_file(spec)
25 | pass.spec = spec
26 | pass.dir = dir
27 | pass.not_before ||= []
28 | pass.cmd = pass.dir / pass.run
29 | 
30 | pass
31 | end
32 | 
33 | # A mix-in for operations on all passes.
34 | module PassOperations
35 | def build_graph!
36 | graph = OpenStruct.new(nodes: [], edges: [])
37 | 
38 | each do |pass|
39 | graph.nodes << pass
40 | 
41 | pass.not_before.each do |nb|
42 | pred = find { |p| p.name == nb }
43 | raise "#{pass.name} depends on missing pass: #{nb}" unless pred
44 | 
45 | graph.edges << [pred, pass]
46 | end
47 | end
48 | 
49 | graph
50 | end
51 | 
52 | def verify!
53 | hai "verifying #{size} passes"
54 | 
55 | raise "one or more duplicate pass names" if uniq(&:name).size != size
56 | raise "one or more nonexecutable passes" unless all? { |p| p.cmd.executable? }
57 | 
58 | self
59 | end
60 | 
61 | # This is just a topological sort of our pass DAG.
62 | # Why? Nescio; sed fieri sentio et excrucior.
63 | # NOTE: Currently unused; we assume that the analysis's order is valid.
64 | def order!
65 | hai "realizing pass DAG into a concrete order"
66 | 
67 | graph = build_graph!
68 | ordered = []
69 | node_set = []
70 | 
71 | # Our initial node set consists of only nodes that don't have a predecessor.
72 | graph.nodes.each do |node|
73 | next if graph.edges.any? { |e| e[1] == node }
74 | 
75 | node_set << node
76 | end
77 | 
78 | until node_set.empty?
79 | node = node_set.shift
80 | ordered << node
81 | 
82 | succ_nodes = graph.nodes.select { |s| graph.edges.include?([node, s]) }
83 | succ_nodes.each do |succ|
84 | graph.edges.delete [node, succ]
85 | next if graph.edges.any?
{ |e| e[1] == succ } 86 | 87 | node_set << succ 88 | end 89 | end 90 | 91 | raise "pass DAG contains a cycle" unless graph.edges.empty? 92 | 93 | replace ordered 94 | self 95 | end 96 | 97 | def run! 98 | hai "running passes: #{map(&:name)}" 99 | 100 | cmds = map(&:cmd).map(&:to_s) 101 | Open3.pipeline(*cmds, in: $stdin, out: $stdout) 102 | 103 | self 104 | end 105 | end 106 | 107 | VERBOSE = ENV["VERBOSE"] || ENV["V"] 108 | PASS_DIR = Pathname.new File.expand_path("pass", __dir__) 109 | PASS_FILE = Pathname.new File.expand_path("passes.yml", __dir__) 110 | 111 | opts = { 112 | profile: "default", 113 | describe: false, 114 | } 115 | 116 | OptionParser.new do |o| 117 | o.banner = "Usage: analysis [options]" 118 | 119 | o.on "-p", "--profile PROFILE", String, "Use the given analysis profile" do |profile| 120 | opts[:profile] = profile 121 | end 122 | 123 | o.on "-d", "--describe", "Describe each step of the given profile instead of running" do 124 | opts[:describe] = true 125 | end 126 | end.parse! 127 | 128 | $stderr.sync = true 129 | 130 | hai "pass directory: #{PASS_DIR}" 131 | hai "pass spec file: #{PASS_FILE}" 132 | 133 | profile = YAML.load_file(PASS_FILE)[opts[:profile]] 134 | raise "no such profile: #{opts[:profile]}" unless profile 135 | 136 | hai "#{opts[:profile]} passes: #{profile}" 137 | 138 | passes = PASS_DIR.children.select(&:directory?).map do |pass_dir| 139 | load_pass! pass_dir 140 | end 141 | 142 | # Select only the passes defined by the profile, and sort them by their order 143 | # in the profile. 144 | passes 145 | .select! { |p| profile.include? p.name } 146 | .sort_by! { |p| profile.index p.name } 147 | passes.extend PassOperations 148 | passes.verify! 149 | 150 | if opts[:describe] 151 | puts opts[:profile] 152 | passes.each { |pass| puts "\t#{pass.name}: #{pass.desc}" } 153 | else 154 | passes.run! 155 | end 156 | -------------------------------------------------------------------------------- /src/analysis/pass/dedupe/dedupe: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # dedupe: filter out any cohorts whose inputs have already appeared at least once. 5 | 6 | require "json" 7 | require "set" 8 | 9 | warn "[+] pass: dedupe" 10 | 11 | count = 0 12 | seen = Set.new 13 | $stdin.each_line do |line| 14 | result = JSON.parse line, symbolize_names: true 15 | 16 | # add? returns nil if the element is already present, saving us 17 | # two separate operations (check + add). 18 | if seen.add?(result[:input]).nil? 
19 | count += 1
20 | next
21 | end
22 | 
23 | $stdout.puts result.to_json
24 | end
25 | 
26 | warn "[+] pass: dedupe done: #{count} filtered"
27 | 
-------------------------------------------------------------------------------- /src/analysis/pass/dedupe/spec.yml:
--------------------------------------------------------------------------------
1 | name: dedupe
2 | desc: Filter out any duplicate cohorts (by input)
3 | run: dedupe
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-failure/filter-all-failure:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-all-failure: remove all cohorts in which each worker failed
5 | 
6 | require "json"
7 | 
8 | warn "[+] pass: filter-all-failure"
9 | 
10 | count = 0
11 | $stdin.each_line do |line|
12 | result = JSON.parse line, symbolize_names: true
13 | 
14 | if result[:outputs].all? { |o| o[:status][:name] == "failure" }
15 | count += 1
16 | next
17 | end
18 | 
19 | $stdout.puts result.to_json
20 | end
21 | 
22 | warn "[+] pass: filter-all-failure done: #{count} filtered"
23 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-failure/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-all-failure
2 | desc: Remove all cohorts in which each worker failed
3 | run: filter-all-failure
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-success/filter-all-success:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-all-success: remove all cohorts in which each worker succeeded
5 | 
6 | require "json"
7 | 
8 | warn "[+] pass: filter-all-success"
9 | 
10 | count = 0
11 | $stdin.each_line do |line|
12 | result = JSON.parse line, symbolize_names: true
13 | 
14 | if result[:outputs].all? { |o| o[:status][:name] == "success" }
15 | count += 1
16 | next
17 | end
18 | 
19 | $stdout.puts result.to_json
20 | end
21 | 
22 | warn "[+] pass: filter-all-success done: #{count} filtered"
23 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-all-success/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-all-success
2 | desc: Remove all cohorts in which each worker succeeded
3 | run: filter-all-success
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-any-failure/filter-any-failure:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-any-failure: remove any cohorts in which any worker failed
5 | 
6 | require "json"
7 | 
8 | warn "[+] pass: filter-any-failure"
9 | 
10 | count = 0
11 | $stdin.each_line do |line|
12 | result = JSON.parse line, symbolize_names: true
13 | 
14 | if result[:outputs].any?
{ |o| o[:status][:name] == "failure" }
15 | count += 1
16 | next
17 | end
18 | 
19 | $stdout.puts result.to_json
20 | end
21 | 
22 | warn "[+] pass: filter-any-failure done: #{count} filtered"
23 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-any-failure/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-any-failure
2 | desc: Remove any cohorts in which any worker failed
3 | run: filter-any-failure
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-bddisasm-salc/filter-bddisasm-salc:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-bddisasm-salc: filter out results that bddisasm decodes to SALC,
5 | # an undocumented instruction that other decoders may reject
6 | 
7 | require "json"
8 | 
9 | # TODO(ww): Remove this.
10 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so"
11 | 
12 | warn "[+] pass: filter-bddisasm-salc"
13 | 
14 | count = 0
15 | $stdin.each_line do |line|
16 | result = JSON.parse line, symbolize_names: true
17 | 
18 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO }
19 | 
20 | if bddisasm[:result] == "SALC"
21 | count += 1
22 | next
23 | end
24 | 
25 | $stdout.puts result.to_json
26 | end
27 | 
28 | warn "[+] pass: filter-bddisasm-salc done: #{count} filtered"
29 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-bddisasm-salc/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-bddisasm-salc
2 | desc: Filter out bddisasm results that decode to SALC
3 | run: filter-bddisasm-salc
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-bddisasm/filter-destroy-bddisasm:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-destroy-bddisasm: find results that only bddisasm gets right (or wrong)
5 | 
6 | require "json"
7 | 
8 | # TODO(ww): Remove this.
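# NOTE: outputs are matched to their workers by the shared-object path recorded
# in each output's worker_so field; these constants mirror entries in workers.spec.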
9 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so" 10 | XED_SO = "./src/worker/xed/xed.so" 11 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 12 | 13 | warn "[+] pass: filter-destroy-bddisasm" 14 | 15 | count = 0 16 | $stdin.each_line do |line| 17 | result = JSON.parse line, symbolize_names: true 18 | 19 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO } 20 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 21 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 22 | 23 | if bddisasm[:status][:value] == xed[:status][:value] && bddisasm[:status][:value] == zydis[:status][:value] 24 | count += 1 25 | next 26 | end 27 | 28 | $stdout.puts result.to_json 29 | end 30 | 31 | warn "[+] pass: filter-destroy-bddisasm done: #{count} filtered" 32 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-bddisasm/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-destroy-bddisasm 2 | desc: Find results that only bddisasm gets right (or wrong) 3 | run: filter-destroy-bddisasm 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-capstone/filter-destroy-capstone: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-destroy-capstone: find results that only capstone gets right (or wrong) 5 | 6 | require "json" 7 | 8 | # TODO(ww): Remove this. 9 | CAPSTONE_SO = "./src/worker/capstone/capstone.so" 10 | XED_SO = "./src/worker/xed/xed.so" 11 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 12 | 13 | warn "[+] pass: filter-destroy-capstone" 14 | 15 | count = 0 16 | $stdin.each_line do |line| 17 | result = JSON.parse line, symbolize_names: true 18 | 19 | capstone = result[:outputs].find { |o| o[:worker_so] == CAPSTONE_SO } 20 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 21 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 22 | 23 | if capstone[:status][:value] == xed[:status][:value] && capstone[:status][:value] == zydis[:status][:value] 24 | count += 1 25 | next 26 | end 27 | 28 | $stdout.puts result.to_json 29 | end 30 | 31 | warn "[+] pass: filter-destroy-capstone done: #{count} filtered" 32 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-capstone/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-destroy-capstone 2 | desc: Find results that only capstone gets right (or wrong) 3 | run: filter-destroy-capstone 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-ghidra/filter-destroy-ghidra: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-destroy-ghidra: find results that only ghidra gets right (or wrong) 5 | 6 | require "json" 7 | 8 | XED_SO = "./src/worker/xed/xed.so" 9 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 10 | ICED_SO = "./src/worker/iced/iced.so" 11 | GHIDRA_SO = "./src/worker/ghidra/ghidra.so" 12 | 13 | warn "[+] pass: filter-destroy-ghidra" 14 | 15 | count = 0 16 | $stdin.each_line do |line| 17 | result = JSON.parse line, symbolize_names: true 18 | 19 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 20 | zydis = result[:outputs].find { |o| 
o[:worker_so] == ZYDIS_SO }
21 | iced = result[:outputs].find { |o| o[:worker_so] == ICED_SO }
22 | ghidra = result[:outputs].find { |o| o[:worker_so] == GHIDRA_SO }
23 | 
24 | if ghidra[:status][:value] == xed[:status][:value] &&
25 | ghidra[:status][:value] == zydis[:status][:value] &&
26 | ghidra[:status][:value] == iced[:status][:value]
27 | count += 1
28 | next
29 | end
30 | 
31 | $stdout.puts result.to_json
32 | end
33 | 
34 | warn "[+] pass: filter-destroy-ghidra done: #{count} filtered"
35 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-destroy-ghidra/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-destroy-ghidra
2 | desc: Find results that only ghidra gets right (or wrong)
3 | run: filter-destroy-ghidra
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ghidra-lock/filter-ghidra-lock:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-ghidra-lock: filter out Ghidra results that decode to the "LOCK"
5 | # instruction. "LOCK" is a prefix, not a real instruction:
6 | # https://github.com/NationalSecurityAgency/ghidra/issues/2033#issue-645334803
7 | 
8 | require "json"
9 | 
10 | GHIDRA_SO = "./src/worker/ghidra/ghidra.so"
11 | 
12 | warn "[+] pass: filter-ghidra-lock"
13 | 
14 | count = 0
15 | $stdin.each_line do |line|
16 | result = JSON.parse line, symbolize_names: true
17 | 
18 | ghidra = result[:outputs].find { |o| o[:worker_so] == GHIDRA_SO }
19 | 
20 | if ghidra[:result] == "LOCK"
21 | count += 1
22 | next
23 | end
24 | 
25 | $stdout.puts result.to_json
26 | end
27 | 
28 | warn "[+] pass: filter-ghidra-lock done: #{count} filtered"
29 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ghidra-lock/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-ghidra-lock
2 | desc: Filter out ghidra results that decode to LOCK
3 | run: filter-ghidra-lock
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-incomparable/filter-incomparable:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-incomparable: remove any cohorts whose results can't be compared,
5 | # i.e.
any cohorts that have fewer than two successful results
6 | 
7 | require "json"
8 | 
9 | warn "[+] pass: filter-incomparable"
10 | 
11 | def success?(decoder)
12 | decoder[:status][:value] == 1
13 | end
14 | 
15 | count = 0
16 | $stdin.each_line do |line|
17 | result = JSON.parse line, symbolize_names: true
18 | 
19 | outputs = result[:outputs]
20 | successes = outputs.count { |o| success?(o) }
21 | if successes < 2
22 | count += 1
23 | next
24 | end
25 | 
26 | $stdout.puts result.to_json
27 | end
28 | 
29 | warn "[+] pass: filter-incomparable done: #{count} filtered"
30 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-incomparable/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-incomparable
2 | desc: Filter out cohorts whose results can't be compared (i.e., that have fewer than two successes)
3 | run: filter-incomparable
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-different/filter-ndecoded-different:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-ndecoded-different: remove any cohorts where one or more outputs
5 | # consumed different amounts of the input
6 | #
7 | # NOTE: Observe that "decoded the same number of bytes" is *not* the same
8 | # as "decoded to the same instruction". As such, this pass will probably produce
9 | # false negatives if your goal is to find instructions of the same size that decode
10 | # to different things.
11 | 
12 | require "json"
13 | 
14 | warn "[+] pass: filter-ndecoded-different"
15 | 
16 | count = 0
17 | $stdin.each_line do |line|
18 | result = JSON.parse line, symbolize_names: true
19 | 
20 | outputs_ndecoded = result[:outputs].map { |o| o[:ndecoded] }
21 | if outputs_ndecoded.uniq.size > 1
22 | count += 1
23 | next
24 | end
25 | 
26 | $stdout.puts result.to_json
27 | end
28 | 
29 | warn "[+] pass: filter-ndecoded-different done: #{count} filtered"
30 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-different/spec.yml:
--------------------------------------------------------------------------------
1 | name: filter-ndecoded-different
2 | desc: Filter out any cohorts where one or more outputs consumed different amounts of the input
3 | run: filter-ndecoded-different
4 | 
-------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-same/filter-ndecoded-same:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 | 
4 | # filter-ndecoded-same: remove any cohorts where all outputs consumed
5 | # exactly the same number of input bytes.
6 | #
7 | # NOTE: Observe that "decoded the same number of bytes" is *not* the same
8 | # as "decoded to the same instruction". As such, this pass will probably produce
9 | # false negatives if your goal is to find instructions of the same size that decode
10 | # to different things.
11 | 
12 | require "json"
13 | 
14 | warn "[+] pass: filter-ndecoded-same"
15 | 
16 | count = 0
17 | $stdin.each_line do |line|
18 | result = JSON.parse line, symbolize_names: true
19 | 
20 | outputs = result[:outputs]
21 | if outputs.all?
{ |o| o[:ndecoded] == outputs.first[:ndecoded] } 22 | count += 1 23 | next 24 | end 25 | 26 | $stdout.puts result.to_json 27 | end 28 | 29 | warn "[+] pass: filter-ndecoded-same done: #{count} filtered" 30 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-ndecoded-same/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-ndecoded-same 2 | desc: Filter out any cohorts where all outputs consumed exactly the same number of input bytes 3 | run: filter-ndecoded-same 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-overaccept/filter-xed-find-overaccept: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-xed-find-overaccept: find inputs that XED potentially overaccepts 5 | # (i.e., inputs the other high-quality decoders think are invalid) 6 | 7 | require "json" 8 | 9 | # TODO(ww): Remove this. 10 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so" 11 | XED_SO = "./src/worker/xed/xed.so" 12 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 13 | ICED_SO = "./src/worker/iced/iced.so" 14 | 15 | def success?(decoder) 16 | decoder[:status][:value] == 1 17 | end 18 | 19 | def failure?(decoder) 20 | !success?(decoder) 21 | end 22 | 23 | def failure_by_consensus?(*decoders) 24 | nfailures = decoders.select { |d| failure?(d) }.size 25 | 26 | (nfailures / decoders.size.to_f) > 0.50 27 | end 28 | 29 | warn "[+] pass: filter-xed-find-overaccept" 30 | 31 | count = 0 32 | $stdin.each_line do |line| 33 | result = JSON.parse line, symbolize_names: true 34 | 35 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO } 36 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 37 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 38 | iced = result[:outputs].find { |o| o[:worker_so] == ICED_SO } 39 | 40 | # If XED reports success when other high-quality decoders don't, keep it; 41 | # otherwise, count it as filtered. 42 | if success?(xed) && failure_by_consensus?(bddisasm, zydis, iced) 43 | $stdout.puts result.to_json 44 | else 45 | count += 1 46 | end 47 | end 48 | 49 | warn "[+] pass: filter-xed-find-overaccept done: #{count} filtered" 50 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-overaccept/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-xed-find-overaccept 2 | desc: Find results that XED potentially overaccepts 3 | run: filter-xed-find-overaccept 4 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-underaccept/filter-xed-find-underaccept: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # filter-xed-find-underaccept: find inputs that XED potentially underaccepts 5 | # (i.e., inputs the other high-quality decoders think are valid) 6 | 7 | require "json" 8 | 9 | # TODO(ww): Remove this.
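#
# For illustration only (hypothetical hashes, not part of the pass): with
# three reference decoders, success_by_consensus? below demands a strict
# majority, so two of the three must succeed:
#
#   ok  = { status: { value: 1 } }
#   bad = { status: { value: 0 } }
#   success_by_consensus?(ok, ok, bad)  # => true  (2/3 > 0.50)
#   success_by_consensus?(ok, bad, bad) # => false (1/3 is not > 0.50)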
10 | BDDISASM_SO = "./src/worker/bddisasm/bddisasm.so" 11 | XED_SO = "./src/worker/xed/xed.so" 12 | ZYDIS_SO = "./src/worker/zydis/zydis.so" 13 | ICED_SO = "./src/worker/iced/iced.so" 14 | 15 | def success?(decoder) 16 | decoder[:status][:value] == 1 17 | end 18 | 19 | def failure?(decoder) 20 | !success?(decoder) 21 | end 22 | 23 | def success_by_consensus?(*decoders) 24 | nsuccesses = decoders.select { |d| success?(d) }.size 25 | 26 | (nsuccesses / decoders.size.to_f) > 0.50 27 | end 28 | 29 | warn "[+] pass: filter-xed-find-underaccept" 30 | 31 | count = 0 32 | $stdin.each_line do |line| 33 | result = JSON.parse line, symbolize_names: true 34 | 35 | bddisasm = result[:outputs].find { |o| o[:worker_so] == BDDISASM_SO } 36 | xed = result[:outputs].find { |o| o[:worker_so] == XED_SO } 37 | zydis = result[:outputs].find { |o| o[:worker_so] == ZYDIS_SO } 38 | iced = result[:outputs].find { |o| o[:worker_so] == ICED_SO } 39 | 40 | # If XED reports failure when other high-quality decoders don't, keep it; 41 | # otherwise, count it as filtered. 42 | if failure?(xed) && success_by_consensus?(bddisasm, zydis, iced) 43 | $stdout.puts result.to_json 44 | else 45 | count += 1 46 | end 47 | end 48 | 49 | warn "[+] pass: filter-xed-find-underaccept done: #{count} filtered" 50 | -------------------------------------------------------------------------------- /src/analysis/pass/filter-xed-find-underaccept/spec.yml: -------------------------------------------------------------------------------- 1 | name: filter-xed-find-underaccept 2 | desc: Find results that XED potentially underaccepts 3 | run: filter-xed-find-underaccept 4 | -------------------------------------------------------------------------------- /src/analysis/pass/find-size-discrepancies/find-size-discrepancies: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # find-size-discrepancies: find cohorts whose successful results differ in decoded size. 5 | 6 | require "json" 7 | 8 | warn "[+] pass: find-size-discrepancies" 9 | 10 | def success?(decoder) 11 | decoder[:status][:value] == 1 12 | end 13 | 14 | count = 0 15 | $stdin.each_line do |line| 16 | result = JSON.parse line, symbolize_names: true 17 | 18 | outputs = result[:outputs] 19 | successes = outputs.select { |o| success?
o } 20 | 21 | if successes.map { |o| o[:ndecoded] }.uniq.size == 1 22 | count += 1 23 | next 24 | end 25 | 26 | $stdout.puts result.to_json 27 | end 28 | 29 | warn "[+] pass: find-size-discrepancies done: #{count} filtered" 30 | -------------------------------------------------------------------------------- /src/analysis/pass/find-size-discrepancies/spec.yml: -------------------------------------------------------------------------------- 1 | name: find-size-discrepancies 2 | desc: Find results that have at least two successful results that disagree on size 3 | run: find-size-discrepancies 4 | -------------------------------------------------------------------------------- /src/analysis/pass/minimize-input/minimize-input: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # minimize-input: trim each cohort's input to no greater than the longest output's decode length 5 | 6 | require "json" 7 | 8 | warn "[+] pass: minimize-input" 9 | 10 | count = 0 11 | $stdin.each_line do |line| 12 | result = JSON.parse line, symbolize_names: true 13 | 14 | max_ndecoded = result[:outputs].map { |o| o[:ndecoded] }.max 15 | 16 | # If the maximum ndecoded is 0, then all are 0 and we should skip 17 | # this cohort entirely. 18 | # In effect, this is probably identical to filter-all-failure. 19 | if max_ndecoded.zero? 20 | count += 1 21 | next 22 | end 23 | 24 | # input is hex formatted, so the trimmed length is max_ndecoded * 2 25 | result[:input] = result[:input][0, max_ndecoded * 2] 26 | 27 | $stdout.puts result.to_json 28 | end 29 | 30 | warn "[+] pass: minimize-input done: #{count} filtered" 31 | -------------------------------------------------------------------------------- /src/analysis/pass/minimize-input/spec.yml: -------------------------------------------------------------------------------- 1 | name: minimize-input 2 | desc: Trim each cohort's input to no greater than the longest output's decode length 3 | run: minimize-input 4 | -------------------------------------------------------------------------------- /src/analysis/pass/normalize/normalize: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # normalize: perform some basic normalization of each worker's output 5 | 6 | require "json" 7 | 8 | warn "[+] pass: normalize" 9 | 10 | $stdin.each_line do |line| 11 | result = JSON.parse line, symbolize_names: true 12 | 13 | result[:outputs].map! do |output| 14 | next output if output[:result].nil? 
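# For illustration (hypothetical worker output): a result such as
# "lock \t add  byte ptr [rax], al" collapses below to
# "lock add byte ptr [rax], al", so outputs that differ only in
# whitespace compare equal in later passes.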
15 | 16 | normalized = output[:result].each_line.map do |dec_line| 17 | dec_line.split.join(" ") 18 | end.join("\n") 19 | 20 | output[:result] = normalized 21 | output[:len] = normalized.size 22 | output 23 | end 24 | 25 | $stdout.puts result.to_json 26 | end 27 | 28 | warn "[+] pass: normalize done" 29 | -------------------------------------------------------------------------------- /src/analysis/pass/normalize/spec.yml: -------------------------------------------------------------------------------- 1 | name: normalize 2 | desc: Normalize analysis results 3 | run: normalize 4 | -------------------------------------------------------------------------------- /src/analysis/passes.yml: -------------------------------------------------------------------------------- 1 | default: 2 | - filter-all-failure 3 | - filter-ndecoded-same 4 | - dedupe 5 | - minimize-input 6 | - normalize 7 | 8 | # Run just the dedupe pass, as a convenient filter. 9 | dedupe: 10 | - dedupe 11 | 12 | # Find inputs that all workers agree are one size, but one or more 13 | # decodes differently. 14 | same-size-different-decodings: 15 | - filter-any-failure 16 | - filter-ndecoded-different 17 | - filter-same-effects 18 | - minimize-input 19 | - normalize 20 | 21 | # Finds disagreements in size between workers. 22 | size-discrepancies: 23 | - filter-all-failure 24 | - filter-ndecoded-same 25 | - filter-incomparable 26 | - dedupe 27 | - find-size-discrepancies 28 | - minimize-input 29 | - normalize 30 | 31 | # Find inputs that not all workers either succeed or fail on. 32 | status-discrepancies: 33 | - filter-all-failure 34 | - filter-all-success 35 | - dedupe 36 | - minimize-input 37 | - normalize 38 | 39 | destroy-capstone: 40 | - filter-all-success 41 | - filter-ndecoded-same 42 | - dedupe 43 | - filter-destroy-capstone 44 | - minimize-input 45 | - normalize 46 | 47 | destroy-bddisasm: 48 | - filter-all-success 49 | - filter-ndecoded-same 50 | - dedupe 51 | - filter-destroy-bddisasm 52 | - minimize-input 53 | - normalize 54 | 55 | destroy-ghidra: 56 | - filter-all-success 57 | - filter-ndecoded-same 58 | - dedupe 59 | - normalize 60 | - filter-ghidra-lock 61 | - filter-destroy-ghidra 62 | - minimize-input 63 | 64 | xed-overaccept: 65 | - filter-all-success 66 | - filter-ndecoded-same 67 | - dedupe 68 | - filter-xed-find-overaccept 69 | - minimize-input 70 | - normalize 71 | 72 | xed-underaccept: 73 | - filter-all-success 74 | - filter-ndecoded-same 75 | - dedupe 76 | - filter-bddisasm-salc 77 | - filter-xed-find-underaccept 78 | - minimize-input 79 | - normalize 80 | -------------------------------------------------------------------------------- /src/include/mish_common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef DEBUG 4 | #define DLOG(fmt, ...) \ 5 | fprintf(stderr, "%s:%d %s: " fmt "\n", __FILE__, __LINE__, __func__, ##__VA_ARGS__); 6 | #undef NDEBUG 7 | #define _unused(x) 8 | #else 9 | #define DLOG(...) 10 | #define NDEBUG 11 | #define _unused(x) ((void)(x)) 12 | #endif 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define MISHEGOS_INSN_MAXLEN 15 19 | #define MISHEGOS_DEC_MAXLEN 248 20 | // This limit is rather arbitrary at the moment. 
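// (For context, a sketch of the size math rather than a constraint: with
// MISHEGOS_DEC_MAXLEN at 248 and the usual 4-byte enum, the packed
// output_slot below comes to 4 + 2 + 2 + 248 = 256 bytes, which the
// static_assert at the bottom of this header pins down.)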
21 | #define MISHEGOS_MAX_NWORKERS 31 22 | 23 | typedef enum { 24 | S_NONE = 0, 25 | S_SUCCESS, 26 | S_FAILURE, 27 | S_CRASH, 28 | S_PARTIAL, 29 | S_UNKNOWN, 30 | } decode_status; 31 | 32 | typedef enum { 33 | W_IGNORE_CRASHES, 34 | } worker_config_mask; 35 | 36 | typedef struct { 37 | uint8_t len; 38 | uint8_t raw_insn[MISHEGOS_INSN_MAXLEN]; 39 | } input_slot; 40 | 41 | typedef struct __attribute__((packed)) { 42 | decode_status status; 43 | uint16_t ndecoded; 44 | uint16_t len; 45 | char result[MISHEGOS_DEC_MAXLEN]; 46 | } output_slot; 47 | static_assert(sizeof(output_slot) == 256, "output_slot should be 256 bytes"); 48 | -------------------------------------------------------------------------------- /src/mish2jsonl/Makefile: -------------------------------------------------------------------------------- 1 | SRCS = $(wildcard *.c) 2 | OBJS = $(SRCS:.c=.o) 3 | 4 | PROG = mish2jsonl 5 | 6 | .PHONY: all 7 | all: $(PROG) 8 | 9 | $(PROG): $(OBJS) 10 | 11 | .PHONY: clean 12 | clean: 13 | rm -rf $(PROG) $(OBJS) 14 | 15 | -------------------------------------------------------------------------------- /src/mish2jsonl/mish2jsonl.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "mish_common.h" 9 | 10 | typedef struct m_string { 11 | uint64_t len; 12 | char *string; 13 | } m_string; 14 | 15 | typedef struct worker_output { 16 | uint32_t status; // 4 17 | uint16_t ndecoded; // 2 18 | uint32_t workerno; // 4 19 | m_string workerso; // 16 20 | m_string result; // 16 21 | } worker_output; 22 | 23 | typedef struct cohort_results { 24 | uint32_t nworkers; 25 | input_slot input; 26 | worker_output *outputs; 27 | } cohort_results; 28 | 29 | static cohort_results results; 30 | static int m_finished_parsing; 31 | 32 | static const char *status2str(decode_status status) { 33 | switch (status) { 34 | case S_NONE: 35 | return "none"; 36 | case S_SUCCESS: 37 | return "success"; 38 | case S_FAILURE: 39 | return "failure"; 40 | case S_CRASH: 41 | return "crash"; 42 | case S_PARTIAL: 43 | return "partial"; 44 | case S_UNKNOWN: 45 | default: 46 | return "unknown"; 47 | } 48 | } 49 | 50 | static void m_cohort_print_json(FILE *f, cohort_results *r) { 51 | char hexbuf[MISHEGOS_INSN_MAXLEN * 2 + 1]; 52 | for (size_t i = 0; i < r->input.len; i++) { 53 | hexbuf[i * 2] = "0123456789abcdef"[r->input.raw_insn[i] / 0x10]; 54 | hexbuf[i * 2 + 1] = "0123456789abcdef"[r->input.raw_insn[i] % 0x10]; 55 | } 56 | hexbuf[r->input.len * 2] = '\0'; 57 | fprintf(f, "{ \"nworkers\": %u, \"input\": \"%s\", \"outputs\": [", r->nworkers, hexbuf); 58 | for (int i = 0; i < r->nworkers; i++) { 59 | if (i != 0) { 60 | fprintf(f, ","); 61 | } 62 | fprintf(f, 63 | "{ \"status\": { \"value\": %u, \"name\": \"%s\" }, \"ndecoded\": %u, \"workerno\": " 64 | "%u, \"worker_so\": \"%s\",\"len\": %ld, \"result\": \"%s\" }", 65 | r->outputs[i].status, status2str(r->outputs[i].status), r->outputs[i].ndecoded, 66 | r->outputs[i].workerno, r->outputs[i].workerso.string, r->outputs[i].result.len, 67 | r->outputs[i].result.string); 68 | } 69 | 70 | fprintf(f, "]}"); 71 | return; 72 | } 73 | 74 | static void m_fread(void *ref, size_t size, size_t times, FILE *file) { 75 | size_t rd = fread(ref, size, times, file); 76 | 77 | // reading a 0 length string can be valid if disassembeling failed 78 | if (rd == 0 && size * times != 0) { 79 | m_finished_parsing = 1; 80 | } 81 | } 82 | 83 | /* 84 | There are a few subtleties that 
this function catches: 85 | 1) Not all strings we read are actually null-terminated, even 86 | when we read the entire string; the trailing junk would end 87 | up in, for example, an output file and crash the analysis 88 | tooling downstream. 89 | 90 | 2) Some disassemblers like to insert \n in their output, which 91 | keeps values from being valid single-line strings. 92 | 93 | 3) At the moment we allocate the string's memory here. Using 94 | malloc would leave old data visible behind short strings, 95 | hence the use of calloc. If this is only ever used for 96 | printing one-by-one, the allocation can be optimized out. 97 | 98 | 99 | We implicitly calloc 1 extra byte to guarantee that the string 100 | ends with a null byte. 101 | */ 102 | 103 | static void read_string(FILE *file, m_string *s, int len_size) { 104 | uint64_t string_length = 0; 105 | m_fread(&string_length, len_size, 1, file); 106 | 107 | // calloc instead of malloc because we want to zero out the memory. 108 | char *input = calloc(1, sizeof(char) * string_length + 1); 109 | // we tend to reuse the same memory a lot (this could be optimized out) 110 | m_fread(input, sizeof(char), string_length, file); 111 | 112 | int newsize = strcspn(input, "\n"); 113 | input[newsize] = '\0'; 114 | s->len = newsize; // should this be new or old size? 115 | s->string = input; 116 | } 117 | 118 | static int read_next(FILE *file) { 119 | fread(&results.nworkers, sizeof(uint32_t), 1, file); 120 | results.outputs = malloc(sizeof(worker_output) * results.nworkers); 121 | m_fread(&results.input, sizeof(results.input), 1, file); 122 | 123 | for (int i = 0; i < results.nworkers; i++) { 124 | read_string(file, &results.outputs[i].workerso, 8); 125 | m_fread(&results.outputs[i].status, sizeof(uint32_t), 1, file); 126 | m_fread(&results.outputs[i].ndecoded, sizeof(uint16_t), 1, file); 127 | 128 | read_string(file, &results.outputs[i].result, 2); 129 | } 130 | 131 | return m_finished_parsing == 0; 132 | } 133 | 134 | static void free_cohort_results(cohort_results *result) { 135 | for (int i = 0; i < result->nworkers; i++) { 136 | free(result->outputs[i].workerso.string); 137 | free(result->outputs[i].result.string); 138 | } 139 | free(result->outputs); 140 | } 141 | 142 | void m_print_results_json(FILE *input_file, FILE *output_file) { 143 | m_finished_parsing = 0; 144 | int is_first = 1; 145 | 146 | fprintf(output_file, "["); 147 | while (read_next(input_file)) { 148 | if (!m_finished_parsing) { 149 | if (!is_first) { 150 | fprintf(output_file, ","); 151 | } 152 | m_cohort_print_json(output_file, &results); 153 | free_cohort_results(&results); 154 | fprintf(output_file, "\n"); 155 | is_first = 0; 156 | } 157 | } 158 | fprintf(output_file, "]"); 159 | } 160 | 161 | void m_print_results_jsonl(FILE *input_file, FILE *output_file) { 162 | m_finished_parsing = 0; 163 | 164 | while (read_next(input_file)) { 165 | m_cohort_print_json(output_file, &results); 166 | free_cohort_results(&results); 167 | fprintf(output_file, "\n"); 168 | } 169 | } 170 | 171 | int main(int argc, char **argv) { 172 | enum { JSON, JSONL } mode = JSONL; 173 | int opt; 174 | while ((opt = getopt(argc, argv, "hn")) != -1) { 175 | switch (opt) { 176 | case 'h': 177 | fprintf(stdout, 178 | "Convert mishegos output to JSON or JSONL\n" 179 | "OPTIONS: -n switches output from JSONL (default) to JSON\n" 180 | "Usage: %s [-n] [input]\n", 181 | argv[0]); 182 | return 0; 183 | case 'n': 184 | mode = JSON; 185 | break; 186 | default: 187 | fprintf(stderr, "Usage: %s [-n] [input]\n", argv[0]);
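/* A hypothetical end-to-end invocation, for context (paths are
 * illustrative, relative to the repository root):
 *
 *   ./src/mishegos/mishegos ./workers.spec > mishegos.out
 *   ./src/mish2jsonl/mish2jsonl mishegos.out > results.jsonl
 */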
188 | return 1; 189 | } 190 | } 191 | 192 | // Default to stdin. 193 | FILE *input; 194 | if (argc - optind != 1) { 195 | input = stdin; 196 | } else { 197 | input = fopen(argv[optind], "r"); 198 | if (input == NULL) { 199 | err(errno, "fopen"); 200 | } 201 | } 202 | 203 | if (mode == JSONL) { 204 | m_print_results_jsonl(input, stdout); 205 | } else { 206 | m_print_results_json(input, stdout); 207 | } 208 | 209 | fclose(input); 210 | return 0; 211 | } 212 | -------------------------------------------------------------------------------- /src/mishegos/Makefile: -------------------------------------------------------------------------------- 1 | SRCS = $(wildcard *.c) 2 | OBJS = $(SRCS:.c=.o) 3 | 4 | PROG = mishegos 5 | 6 | .PHONY: all 7 | all: $(PROG) 8 | 9 | $(PROG): $(OBJS) 10 | 11 | .PHONY: clean 12 | clean: 13 | rm -rf $(PROG) $(OBJS) 14 | -------------------------------------------------------------------------------- /src/mishegos/mishegos.c: -------------------------------------------------------------------------------- 1 | 2 | #include "mish_common.h" 3 | #include "mutator.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #define WITH_FUTEX 27 | 28 | typedef struct { 29 | _Atomic uint32_t val; 30 | #ifdef WITH_FUTEX 31 | _Atomic uint32_t waiters; 32 | #endif 33 | } mish_atomic_uint; 34 | 35 | static void mish_atomic_wait_for(mish_atomic_uint *var, uint32_t target) { 36 | uint32_t old; 37 | size_t cnt = 0; 38 | while ((old = atomic_load(&var->val)) != target) { 39 | #ifdef __x86_64__ 40 | __asm__ volatile("pause"); 41 | #endif 42 | (void)cnt; 43 | #ifdef WITH_FUTEX 44 | if (++cnt > 10000) { 45 | atomic_fetch_add_explicit(&var->waiters, 1, memory_order_relaxed); 46 | syscall(SYS_futex, &var->val, FUTEX_WAIT, old, NULL); 47 | atomic_fetch_sub_explicit(&var->waiters, 1, memory_order_relaxed); 48 | } 49 | #endif 50 | } 51 | } 52 | 53 | static uint32_t mish_atomic_fetch_add(mish_atomic_uint *var, uint32_t val) { 54 | return atomic_fetch_add(&var->val, val); 55 | } 56 | 57 | static uint32_t mish_atomic_load(mish_atomic_uint *var) { 58 | return atomic_load(&var->val); 59 | } 60 | 61 | static void mish_atomic_store(mish_atomic_uint *var, uint32_t val) { 62 | atomic_store(&var->val, val); 63 | } 64 | 65 | static void mish_atomic_notify(mish_atomic_uint *var) { 66 | #ifdef WITH_FUTEX 67 | if (atomic_load_explicit(&var->waiters, memory_order_relaxed)) 68 | syscall(SYS_futex, &var->val, FUTEX_WAKE, INT_MAX); 69 | #endif 70 | } 71 | 72 | #define MISHEGOS_NUM_SLOTS_PER_CHUNK 4096 73 | #define MISHEGOS_NUM_CHUNKS 16 74 | 75 | typedef struct { 76 | mish_atomic_uint generation; 77 | mish_atomic_uint remaining_workers; 78 | uint32_t input_count; 79 | input_slot inputs[MISHEGOS_NUM_SLOTS_PER_CHUNK]; 80 | } input_chunk; 81 | 82 | typedef struct { 83 | mish_atomic_uint remaining; 84 | output_slot outputs[MISHEGOS_NUM_SLOTS_PER_CHUNK]; 85 | } output_chunk; 86 | 87 | struct worker_config { 88 | size_t soname_len; 89 | const char *soname; 90 | int workerno; 91 | input_chunk *input_chunks; 92 | output_chunk *output_chunks; 93 | size_t start_gen; 94 | size_t start_idx; 95 | bool sigchld; 96 | pthread_t thread; 97 | pid_t pid; 98 | }; 99 | 100 | static struct worker_config workers[MISHEGOS_MAX_NWORKERS]; 101 | 102 | static void *alloc_shared(size_t size) { 103 | void *res 
= mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON | MAP_POPULATE, -1, 0); 104 | if (res == MAP_FAILED) { 105 | perror("mmap"); 106 | exit(1); 107 | } 108 | return res; 109 | } 110 | 111 | static void *worker(void *wc_vp) { 112 | const struct worker_config *wc = wc_vp; 113 | void *so = dlopen(wc->soname, RTLD_LAZY); 114 | if (!so) { 115 | perror(wc->soname); 116 | return NULL; 117 | } 118 | 119 | void (*worker_ctor)() = (void (*)())dlsym(so, "worker_ctor"); 120 | void (*worker_dtor)() = (void (*)())dlsym(so, "worker_dtor"); 121 | typedef void (*try_decode_t)(output_slot * result, uint8_t * raw_insn, uint8_t length); 122 | try_decode_t try_decode = (try_decode_t)dlsym(so, "try_decode"); 123 | char *worker_name = *((char **)dlsym(so, "worker_name")); 124 | 125 | if (worker_ctor != NULL) { 126 | worker_ctor(); 127 | } 128 | 129 | uint32_t gen = wc->start_gen; 130 | size_t idx = wc->start_idx; 131 | 132 | input_chunk *input_chunks = wc->input_chunks; 133 | output_chunk *output_chunks = wc->output_chunks; 134 | while (1) { 135 | mish_atomic_wait_for(&input_chunks[idx].generation, gen); 136 | 137 | /* Track remaining slots; if we crash, we know where we are. If we start 138 | * with a non-zero remaining count, we continue where we left, but skip the 139 | * slot that caused us to crash. */ 140 | size_t old_remaining = mish_atomic_load(&output_chunks[idx].remaining); 141 | size_t start = old_remaining == 0 ? 0 : input_chunks[idx].input_count - old_remaining + 1; 142 | mish_atomic_store(&output_chunks[idx].remaining, input_chunks[idx].input_count - start); 143 | for (size_t i = start; i < input_chunks[idx].input_count; i++) { 144 | output_chunks[idx].outputs[i].len = 0; 145 | output_chunks[idx].outputs[i].ndecoded = 0; 146 | try_decode(&output_chunks[idx].outputs[i], input_chunks[idx].inputs[i].raw_insn, 147 | input_chunks[idx].inputs[i].len); 148 | /* Note: this is no atomic subtraction. It atomic, however, to ensure that 149 | * the decode result is written to memory before we decrement the counter */ 150 | mish_atomic_store(&output_chunks[idx].remaining, input_chunks[idx].input_count - i - 1); 151 | } 152 | 153 | if (mish_atomic_fetch_add(&input_chunks[idx].remaining_workers, -1) == 1) 154 | mish_atomic_notify(&input_chunks[idx].remaining_workers); 155 | 156 | /* Not getting a full chunk indicates that we are exiting. */ 157 | if (input_chunks[idx].input_count != MISHEGOS_NUM_SLOTS_PER_CHUNK) 158 | break; 159 | 160 | idx++; 161 | if (idx == MISHEGOS_NUM_CHUNKS) { 162 | idx = 0; 163 | gen++; 164 | } 165 | } 166 | 167 | if (worker_dtor != NULL) { 168 | worker_dtor(); 169 | } 170 | dlclose(so); 171 | 172 | return NULL; 173 | } 174 | 175 | /* By default, filter all inputs which all decoders identify as invalid. 
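* (So with the defaults below, a sample is kept as soon as at least one
* decoder reports success; see the -s handling in main for how negative
* bounds are remapped relative to nworkers.)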
*/ 176 | static int filter_min_success = 1; 177 | static int filter_max_success = MISHEGOS_MAX_NWORKERS; 178 | static bool filter_ndecoded_same = false; 179 | 180 | static void process(size_t slot, size_t idx, input_chunk *input_chunks, int nworkers, 181 | struct worker_config *workers) { 182 | int num_success = 0; 183 | bool ndecoded_same = true; 184 | int last_ndecoded = -1; 185 | for (int j = 0; j < nworkers; j++) { 186 | output_slot *output = &workers[j].output_chunks[slot].outputs[idx]; 187 | num_success += output->status == S_SUCCESS; 188 | if (output->status == S_SUCCESS) { 189 | if (last_ndecoded == -1) 190 | last_ndecoded = output->ndecoded; 191 | else if (last_ndecoded != output->ndecoded) 192 | ndecoded_same = false; 193 | } 194 | } 195 | if (num_success >= filter_min_success && num_success <= filter_max_success) 196 | goto keep; 197 | if (filter_ndecoded_same && !ndecoded_same) 198 | goto keep; 199 | return; 200 | 201 | keep:; 202 | fwrite(&nworkers, sizeof(nworkers), 1, stdout); 203 | 204 | input_slot *input = &input_chunks[slot].inputs[idx]; 205 | fwrite(input, sizeof(*input), 1, stdout); 206 | for (int j = 0; j < nworkers; j++) { 207 | fwrite(&workers[j].soname_len, sizeof(workers[j].soname_len), 1, stdout); 208 | fwrite(workers[j].soname, 1, workers[j].soname_len, stdout); 209 | 210 | output_slot *output = &workers[j].output_chunks[slot].outputs[idx]; 211 | static_assert(offsetof(output_slot, result) == sizeof(output_slot) - MISHEGOS_DEC_MAXLEN, 212 | "expect result buffer to be at end of slot"); 213 | fwrite(output, sizeof(*output) - MISHEGOS_DEC_MAXLEN + output->len, 1, stdout); 214 | } 215 | } 216 | 217 | static int worker_for_pid(pid_t pid) { 218 | for (int i = 0; i < MISHEGOS_MAX_NWORKERS; i++) { 219 | if (workers[i].pid == pid) { 220 | return i; 221 | } 222 | } 223 | return -1; 224 | } 225 | 226 | static bool thread_mode = false; 227 | 228 | static void worker_start(struct worker_config *wc) { 229 | if (thread_mode) { 230 | pthread_create(&wc->thread, NULL, worker, wc); 231 | } else { 232 | /* pipe to notify child that we are ready. */ 233 | int pipe_fds[2]; 234 | char tmp = 0; 235 | if (pipe(pipe_fds) < 0) { 236 | perror("pipe"); 237 | exit(1); 238 | } 239 | 240 | pid_t child = fork(); 241 | if (child < 0) { 242 | perror("fork"); 243 | exit(1); 244 | } else if (child == 0) { 245 | prctl(PR_SET_PDEATHSIG, SIGHUP); 246 | close(pipe_fds[1]); 247 | if (read(pipe_fds[0], &tmp, 1) != 1) { 248 | /* parent died without us being killed by SIGHUP -- so exit. */ 249 | exit(1); 250 | } 251 | close(pipe_fds[0]); 252 | worker(wc); 253 | exit(0); 254 | } 255 | wc->pid = child; 256 | close(pipe_fds[0]); 257 | write(pipe_fds[1], &tmp, 1); 258 | close(pipe_fds[1]); 259 | } 260 | } 261 | 262 | static void sigchld_handler(int sig) { 263 | (void)sig; 264 | 265 | /* Multiple children might have died at the same time, but we get only one signal. */ 266 | int wstatus; 267 | pid_t wpid; 268 | while ((wpid = waitpid(-1, &wstatus, WNOHANG)) > 0) { 269 | int workerno = worker_for_pid(wpid); 270 | assert(workerno >= 0); 271 | if (workerno < 0) { 272 | /* worker died before we even had the chance to store its pid. */ 273 | abort(); 274 | } 275 | input_chunk *ic = workers[workerno].input_chunks; 276 | output_chunk *oc = workers[workerno].output_chunks; 277 | for (size_t widx = 0; widx < MISHEGOS_NUM_CHUNKS; widx++) { 278 | uint32_t remaining = mish_atomic_load(&oc[widx].remaining); 279 | if (remaining == 0) 280 | continue; 281 | /* we found the position where the worker crashed. 
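* (ic[widx].input_count - remaining indexes the slot that was in flight:
* remaining counts down as slots complete, so the first unfinished slot
* is the one that killed the worker.)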
*/ 282 | oc[widx].outputs[ic[widx].input_count - remaining].status = S_CRASH; 283 | /* update generation and chunk index so that worker can restart. */ 284 | workers[workerno].start_gen = mish_atomic_load(&ic[widx].generation); 285 | workers[workerno].start_idx = widx; 286 | /* Mark worker as sigchld-received s.t. we can restart them. We obviously 287 | * can't do that in a signal handler. */ 288 | workers[workerno].sigchld = true; 289 | /* Reduce remaining_workers temporarily s.t. we always wake up. No need to 290 | * explicitly wake, however: the futex syscall will be restarted and 291 | * detect that the value changed */ 292 | mish_atomic_fetch_add(&ic[widx].remaining_workers, -1); 293 | break; 294 | } 295 | /* We might get here because the worker terminated ordinarily -- ignore. 296 | * There's also the case that the worker crashed outside decoding. This must 297 | * be a bug and therefore should never happen(TM). Ignore this case, too. */ 298 | } 299 | } 300 | 301 | int main(int argc, char **argv) { 302 | const char *mutator_name = NULL; 303 | 304 | int opt; 305 | while ((opt = getopt(argc, argv, "htm:s:n")) != -1) { 306 | switch (opt) { 307 | case 't': 308 | thread_mode = false; 309 | break; 310 | case 'm': 311 | mutator_name = optarg; 312 | break; 313 | case 's': { 314 | char *next; 315 | /* Both values are capped to nworkers below, s.t. -1 => nworkers - 1. */ 316 | filter_min_success = strtol(optarg, &next, 0); 317 | if (*next == ':') 318 | filter_max_success = strtol(next + 1, &next, 0); 319 | if (*next != '\0') 320 | errx(1, "-s needs format or :"); 321 | break; 322 | } 323 | case 'n': 324 | filter_ndecoded_same = true; 325 | break; 326 | case 'h': 327 | default: 328 | fprintf(stderr, "usage: %s [-t] [-m mutator] [-s min[:max]] [-n]\n", argv[0]); 329 | fprintf(stderr, " -t: use thread mode\n"); 330 | fprintf(stderr, " -m: specify mutator\n"); 331 | fprintf(stderr, " -s: keep samples where success count is in range; default is 1:-1\n"); 332 | fprintf(stderr, " (0 = all; 1 = #success >= 1; -1 = #success = nworkers - 1;\n"); 333 | fprintf(stderr, " 1:-2 = #success >= 1 && <= nworkers - 1;\n"); 334 | fprintf(stderr, " 1:0 = filter all (e.g., for use with -n); etc.)\n"); 335 | fprintf(stderr, " -n: keep samples where successful ndecoded differs\n"); 336 | return 1; 337 | } 338 | } 339 | 340 | if (optind + 1 != argc) { 341 | fprintf(stderr, "expected worker file as positional argument\n"); 342 | return 1; 343 | } 344 | 345 | if (!thread_mode) { 346 | struct sigaction sigchld_action = {0}; 347 | sigchld_action.sa_handler = sigchld_handler; 348 | sigchld_action.sa_flags = SA_NOCLDSTOP; 349 | if (sigaction(SIGCHLD, &sigchld_action, NULL)) { 350 | perror("sigaction"); 351 | return 1; 352 | } 353 | } 354 | 355 | mutator_t mutator = mutator_create(mutator_name); 356 | 357 | FILE *file = fopen(argv[optind], "r"); 358 | if (file == NULL) { 359 | perror(argv[optind]); 360 | return 1; 361 | } 362 | 363 | input_chunk *input_chunks = alloc_shared(sizeof(input_chunk) * MISHEGOS_NUM_CHUNKS); 364 | 365 | int nworkers = 0; 366 | uint64_t gen = 1; 367 | uint64_t idx = 0; 368 | 369 | while (nworkers < MISHEGOS_MAX_NWORKERS) { 370 | size_t size = 0; 371 | char *line = NULL; 372 | if (getline(&line, &size, file) < 0 || feof(file) != 0) { 373 | break; 374 | } 375 | if (line[0] == '#') { 376 | continue; 377 | } 378 | 379 | /* getline retains the newline if present, so chop it off. 
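* (strcspn(line, "\n") is the length of the prefix before the first
* newline, so writing '\0' there trims the trailing newline in place.)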
*/ 380 | line[strcspn(line, "\n")] = '\0'; 381 | if (access(line, R_OK) < 0) { 382 | perror(line); 383 | return 1; 384 | } 385 | 386 | workers[nworkers].soname_len = strlen(line); 387 | workers[nworkers].soname = line; 388 | workers[nworkers].workerno = nworkers; 389 | workers[nworkers].input_chunks = input_chunks; 390 | workers[nworkers].output_chunks = alloc_shared(sizeof(output_chunk) * MISHEGOS_NUM_CHUNKS); 391 | workers[nworkers].start_gen = gen; 392 | workers[nworkers].start_idx = idx; 393 | worker_start(&workers[nworkers]); 394 | nworkers++; 395 | } 396 | 397 | if (filter_min_success < 0) { 398 | filter_min_success += nworkers + 1; 399 | } 400 | if (filter_max_success < 0) { 401 | filter_max_success += nworkers + 1; 402 | } 403 | fprintf(stderr, "filter min=%d max=%d\n", filter_min_success, filter_max_success); 404 | 405 | uint64_t total = 0; 406 | uint64_t exit_idx = MISHEGOS_NUM_CHUNKS; 407 | while (true) { 408 | mish_atomic_wait_for(&input_chunks[idx].remaining_workers, 0); 409 | 410 | if (!thread_mode) { 411 | bool worker_restarted = false; 412 | for (int i = 0; i < nworkers; i++) { 413 | if (workers[i].sigchld) { 414 | /* undo hack to forcefully wake us up. */ 415 | mish_atomic_fetch_add(&input_chunks[workers[i].start_idx].remaining_workers, 1); 416 | workers[i].sigchld = false; 417 | worker_start(&workers[i]); 418 | worker_restarted = true; 419 | } 420 | } 421 | if (worker_restarted) { 422 | /* if we restarted a worker for current idx, wait for it again. */ 423 | continue; 424 | } 425 | } 426 | 427 | if (gen > 1) { 428 | for (size_t i = 0; i < input_chunks[idx].input_count; i++) { 429 | process(idx, i, input_chunks, nworkers, workers); 430 | } 431 | } 432 | 433 | if (idx == exit_idx) { 434 | break; 435 | } 436 | 437 | // Not yet exiting, so fill another chunk. 438 | if (exit_idx == MISHEGOS_NUM_CHUNKS) { 439 | size_t count = 0; 440 | for (size_t i = 0; i < MISHEGOS_NUM_SLOTS_PER_CHUNK; i++) { 441 | bool filled = mutator(&input_chunks[idx].inputs[i]); 442 | if (filled) { 443 | count++; 444 | } else { // no more mutations 445 | exit_idx = idx; 446 | break; 447 | } 448 | } 449 | 450 | input_chunks[idx].input_count = count; 451 | mish_atomic_store(&input_chunks[idx].remaining_workers, nworkers); 452 | mish_atomic_store(&input_chunks[idx].generation, gen); 453 | mish_atomic_notify(&input_chunks[idx].generation); 454 | } 455 | 456 | idx++; 457 | if (idx == MISHEGOS_NUM_CHUNKS) { 458 | idx = 0; 459 | gen++; 460 | } 461 | } 462 | } 463 | -------------------------------------------------------------------------------- /src/mishegos/mutator.c: -------------------------------------------------------------------------------- 1 | #include "mutator.h" 2 | 3 | #include "mish_common.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | /* An x86 instruction's opcode is no longer than 3 bytes. 11 | */ 12 | typedef struct __attribute__((packed)) { 13 | uint8_t len; 14 | uint8_t op[3]; 15 | } opcode; 16 | static_assert(sizeof(opcode) == 4, "opcode should be 4 bytes"); 17 | 18 | /* An x86 instruction is no longer than 15 bytes, 19 | * but the longest (potentially) structurally valid x86 instruction 20 | * is 26 bytes: 21 | * 4 byte legacy prefix 22 | * 1 byte prefix 23 | * 3 byte opcode 24 | * 1 byte ModR/M 25 | * 1 byte SIB 26 | * 8 byte displacement 27 | * 8 byte immediate 28 | * 29 | * We want to be able to "slide" around inside of a structurally valid 30 | * instruction in order to find errors, so we give ourselves enough space 31 | * here. 
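* (Worked sum, matching the list above: 4 + 1 + 3 + 1 + 1 + 8 + 8 = 26,
* hence the 26-byte insn buffer below.)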
32 | */ 33 | typedef struct { 34 | uint8_t off; 35 | uint8_t len; 36 | uint8_t insn[26]; 37 | } insn_candidate; 38 | 39 | static uint64_t rng_state[4]; 40 | static insn_candidate insn_cand; 41 | 42 | /* An x86 instruction can have up to 4 legacy prefixes, 43 | * in any order, with no more than 1 prefix from each group. 44 | */ 45 | static uint8_t legacy_prefixes[] = { 46 | // Prefix group 1. 47 | 0xf0, // repeat/lock 48 | 0xf3, // rep, repe 49 | 0xf2, // repne 50 | // Prefix group 2. 51 | 0x2e, // segment override, cs 52 | 0x36, // segment override, ss 53 | 0x3e, // segment override, ds 54 | 0x26, // segment override, es 55 | 0x64, // segment override, fs 56 | 0x65, // segment override, gs 57 | // Prefix group 3. 58 | 0x66, // operand size override 59 | // Prefix group 4. 60 | 0x67, // address size override 61 | }; 62 | 63 | /* REX prefixes apply in long (64-bit) mode, and are made up 64 | * of a fixed 4-bit pattern + extension bits for operand size, 65 | * ModR/M and SIB. 66 | * 67 | * Each instruction should only have one REX prefix. 68 | */ 69 | static uint8_t rex_prefixes[] = { 70 | 0b01000000, // ---- 71 | 0b01000001, // ---B 72 | 0b01000010, // --X- 73 | 0b01000011, // --BX 74 | 0b01000100, // -R-- 75 | 0b01000101, // -R-B 76 | 0b01000110, // -RX- 77 | 0b01000111, // -RXB 78 | 0b01001000, // W--- 79 | 0b01001001, // W--B 80 | 0b01001010, // W-X- 81 | 0b01001011, // W-XB 82 | 0b01001100, // WR-- 83 | 0b01001101, // WR-B 84 | 0b01001110, // WRX- 85 | 0b01001111, // WRXB 86 | }; 87 | 88 | #if defined __GLIBC__ && defined __linux__ 89 | 90 | #include 91 | static int mish_getrandom(void *buf, size_t buflen, unsigned int flags) { 92 | return getrandom(buf, buflen, flags); 93 | } 94 | 95 | #elif defined __APPLE__ && defined __MACH__ 96 | 97 | #include 98 | static int mish_getrandom(void *buf, size_t buflen, unsigned int flags) { 99 | return getentropy(buf, buflen); 100 | } 101 | 102 | #else 103 | #error "we only support linux + glibc at the moment; help us out!" 104 | #endif 105 | 106 | #ifndef NO_XOROSHIRO_RNG 107 | static inline uint64_t xoroshiro256_rotl(const uint64_t x, int k) { 108 | return (x << k) | (x >> (64 - k)); 109 | } 110 | 111 | uint64_t xoroshiro256_next(void) { 112 | const uint64_t result_starstar = xoroshiro256_rotl(rng_state[1] * 5, 7) * 9; 113 | 114 | const uint64_t t = rng_state[1] << 17; 115 | 116 | rng_state[2] ^= rng_state[0]; 117 | rng_state[3] ^= rng_state[1]; 118 | rng_state[1] ^= rng_state[2]; 119 | rng_state[0] ^= rng_state[3]; 120 | 121 | rng_state[2] ^= t; 122 | 123 | rng_state[3] = xoroshiro256_rotl(rng_state[3], 45); 124 | 125 | return result_starstar; 126 | } 127 | 128 | static inline uint64_t rand_long() { 129 | return xoroshiro256_next(); 130 | } 131 | 132 | static inline uint8_t rand_byte() { 133 | return (uint8_t)rand_long(); 134 | } 135 | #else 136 | static uint64_t rand_long() { 137 | uint64_t it; 138 | mish_getrandom(&it, sizeof(it), 0); 139 | return it; 140 | } 141 | 142 | static uint8_t rand_byte() { 143 | uint8_t it; 144 | mish_getrandom(&it, sizeof(it), 0); 145 | return it; 146 | } 147 | #endif 148 | 149 | /* Creates a random (potentially invalid) opcode. 150 | * Opcodes are 1-3 bytes long, and come in three formats: 151 | * 1. Single byte (raw opcode) 152 | * 2. Two bytes (escape byte, opcode) 153 | * 3. 
Three bytes (escape byte 1, escape byte 2, opcode) 154 | */ 155 | static void rand_opcode(opcode *opc) { 156 | switch (rand_byte() % 4) { 157 | case 0: { 158 | opc->len = 1; 159 | opc->op[0] = rand_byte(); 160 | break; 161 | } 162 | case 1: { 163 | opc->len = 2; 164 | opc->op[0] = 0x0f; 165 | opc->op[1] = rand_byte(); 166 | break; 167 | } 168 | case 2: { 169 | opc->len = 3; 170 | opc->op[0] = 0x0f; 171 | opc->op[1] = 0x38; 172 | opc->op[2] = rand_byte(); 173 | break; 174 | } 175 | case 3: { 176 | opc->len = 3; 177 | opc->op[0] = 0x0f; 178 | opc->op[1] = 0x3a; 179 | opc->op[2] = rand_byte(); 180 | break; 181 | } 182 | } 183 | } 184 | 185 | static void build_sliding_candidate() { 186 | memset(&insn_cand, 0, sizeof(insn_candidate)); 187 | 188 | /* 4 random legacy prefixes. 189 | * 190 | * Observe that we don't attempt to enforce the "1 prefix from each group" rule. 191 | */ 192 | for (int i = 0; i < 4; ++i) { 193 | insn_cand.insn[i] = legacy_prefixes[rand_byte() % sizeof(legacy_prefixes)]; 194 | } 195 | insn_cand.len += 4; 196 | 197 | /* REX prefix choices: 198 | * 0. Random prefix from rex_prefixes table 199 | * 1. Completely randomized prefix 200 | * 3. No REX prefix 201 | */ 202 | switch (rand_byte() % 3) { 203 | case 0: { 204 | insn_cand.insn[insn_cand.len] = rex_prefixes[rand_byte() % sizeof(rex_prefixes)]; 205 | insn_cand.len++; 206 | break; 207 | } 208 | case 1: { 209 | insn_cand.insn[insn_cand.len] = rand_byte(); 210 | insn_cand.len++; 211 | break; 212 | } 213 | case 2: { 214 | break; 215 | } 216 | } 217 | 218 | /* Opcode, up to 3 bytes. 219 | */ 220 | opcode opc; 221 | rand_opcode(&opc); 222 | memcpy(insn_cand.insn + insn_cand.len, opc.op, opc.len); 223 | insn_cand.len += opc.len; 224 | 225 | /* ModR/M and SIB. For now, just two random bytes. 226 | */ 227 | insn_cand.insn[insn_cand.len++] = rand_byte(); 228 | insn_cand.insn[insn_cand.len++] = rand_byte(); 229 | 230 | /* Displacement. Either none, or 1, 2, 4, or 8 bytes. 231 | */ 232 | if (rand_byte() % 2 == 0) { 233 | uint8_t displen = 1 << (rand_byte() % 4); 234 | uint64_t disp = rand_long(); 235 | memcpy(insn_cand.insn + insn_cand.len, &disp, displen); 236 | insn_cand.len += displen; 237 | } 238 | 239 | /* Immediate. Either none, or 1, 2, 4, or 8 bytes. 240 | */ 241 | if (rand_byte() % 2 == 0) { 242 | uint8_t immlen = 1 << (rand_byte() % 4); 243 | uint64_t imm = rand_long(); 244 | memcpy(insn_cand.insn + insn_cand.len, &imm, immlen); 245 | insn_cand.len += immlen; 246 | } 247 | } 248 | 249 | /* Havoc: generate a random instruction candidate. 250 | */ 251 | static bool havoc_candidate(input_slot *slot) { 252 | slot->len = (rand_byte() % MISHEGOS_INSN_MAXLEN) + 1; 253 | uint64_t lower = rand_long(); 254 | uint64_t upper = rand_long(); 255 | memcpy(slot->raw_insn, &lower, 8); 256 | memcpy(slot->raw_insn + 8, &upper, 7); 257 | 258 | return true; 259 | } 260 | 261 | /* Sliding: generate an instruction candidate with the 262 | * "sliding" approach. 263 | */ 264 | static bool sliding_candidate(input_slot *slot) { 265 | /* An offset of zero into our sliding candidate indicates that we've slid 266 | * all the way through and need to build a new candidate. 267 | */ 268 | if (insn_cand.off == 0) { 269 | build_sliding_candidate(); 270 | } 271 | 272 | /* If our sliding candidate is less than the maximum instruction size, 273 | * then we have nothing to slide. Just copy it try a new candidate on the next 274 | * call. 275 | * 276 | * Otherwise, take the maximum instruction size from our sliding 277 | * candidate, plus the current offset. 
This gives us a bunch of 278 | * high quality instruction "windows". 279 | */ 280 | if (insn_cand.len <= MISHEGOS_INSN_MAXLEN) { 281 | memcpy(slot->raw_insn, insn_cand.insn, insn_cand.len); 282 | slot->len = insn_cand.len; 283 | insn_cand.off = 0; // Shouldn't be necessary, but just to be explicit. 284 | } else { 285 | memcpy(slot->raw_insn, insn_cand.insn + insn_cand.off, MISHEGOS_INSN_MAXLEN); 286 | slot->len = MISHEGOS_INSN_MAXLEN; 287 | insn_cand.off = (insn_cand.off + 1) % (insn_cand.len - MISHEGOS_INSN_MAXLEN + 1); 288 | } 289 | 290 | return true; 291 | } 292 | 293 | /* Structured: generate an instruction candidate with the 294 | * "structured" approach. 295 | */ 296 | static bool structured_candidate(input_slot *slot) { 297 | /* We mirror build_sliding_candidate here, but with the constraint that 298 | * we never overapproximate: we constrain ourselves to trying 299 | * to build something that looks like an instruction of no more 300 | * than 15 bytes. 301 | */ 302 | 303 | uint8_t len = 0; 304 | 305 | /* Up to 4 legacy prefixes. Like sliding, we don't try to enforce group rules. 306 | * Unlike sliding, we allow for the possibility of no legacy prefixes. 307 | * Running max: 4. 308 | */ 309 | uint8_t prefix_count = (rand_byte() % 5); 310 | for (int i = 0; i < prefix_count; ++i) { 311 | slot->raw_insn[i] = legacy_prefixes[rand_byte() % sizeof(legacy_prefixes)]; 312 | } 313 | len = prefix_count; 314 | 315 | /* One or none REX prefixes. 316 | * Always choose a valid REX prefix if we're inserting one. 317 | * Running max: 5. 318 | */ 319 | if (rand_byte() % 2) { 320 | slot->raw_insn[len] = rex_prefixes[rand_byte() % sizeof(rex_prefixes)]; 321 | len++; 322 | } 323 | 324 | /* Random (but structured) opcode. Same as sliding. 325 | * Running max: 8 326 | */ 327 | opcode opc; 328 | rand_opcode(&opc); 329 | memcpy(slot->raw_insn + len, opc.op, opc.len); 330 | len += opc.len; 331 | 332 | /* One or none ModR/M bytes, and one or none SIB bytes. 333 | * Both of these are just 8-bit LUTs, so they can be fully random. 334 | * Running max: 10. 335 | */ 336 | if (rand_byte() % 2) { 337 | slot->raw_insn[len] = rand_byte(); 338 | len++; 339 | } 340 | 341 | if (rand_byte() % 2) { 342 | slot->raw_insn[len] = rand_byte(); 343 | len++; 344 | } 345 | 346 | /* Finally, we have up to 5 bytes to play with for the immediate and 347 | * displacement. Fill some amount of that (maybe not all) with randomness. 348 | */ 349 | uint64_t tail = rand_long(); 350 | uint8_t tail_size = rand_byte() % 6; 351 | memcpy(slot->raw_insn + len, &tail, tail_size); 352 | len += tail_size; 353 | 354 | slot->len = len; 355 | 356 | return true; 357 | } 358 | 359 | /* Dummy: Generates a single NOP for debugging purposes. 360 | */ 361 | static bool dummy_candidate(input_slot *slot) { 362 | slot->raw_insn[0] = 0x90; 363 | slot->len = 1; 364 | 365 | /* NOTE(ww): We only ever want to fill one input slot with our dummy candidate, 366 | * since other parts of mishegos disambiguate worker outputs by keying on the input. 367 | */ 368 | return false; 369 | } 370 | 371 | static void hex2bytes(uint8_t *outbuf, const char *const input, size_t input_len) { 372 | for (size_t i = 0, j = 0; j < input_len / 2; i += 2, ++j) { 373 | outbuf[j] = (input[i] % 32 + 9) % 25 * 16 + (input[i + 1] % 32 + 9) % 25; 374 | } 375 | } 376 | 377 | /* Manual: reads instruction candidates from stdin, one per line. 378 | * Candidates are expected to be in hex format, with no 0x or \x prefix. 
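*
* For instance (illustrative input, not a required test vector): feeding the
* line "4889c8" yields the three candidate bytes 48 89 c8, which the workers
* should render as mov rax, rcx.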
379 | */ 380 | static bool manual_candidate(input_slot *slot) { 381 | char *line = NULL; 382 | size_t size; 383 | if (getline(&line, &size, stdin) < 0) { 384 | /* Input exhausted. 385 | */ 386 | return false; 387 | } 388 | 389 | line[strcspn(line, "\n")] = '\0'; 390 | size_t linelen = strlen(line); 391 | if (linelen == 0 || linelen > MISHEGOS_INSN_MAXLEN * 2) { 392 | return false; 393 | } 394 | 395 | hex2bytes(slot->raw_insn, line, linelen); 396 | slot->len = linelen / 2; 397 | 398 | return true; 399 | } 400 | 401 | mutator_t mutator_create(const char *name) { 402 | mish_getrandom(rng_state, sizeof(rng_state), 0); 403 | 404 | if (name == NULL) // default is sliding candidate 405 | return sliding_candidate; 406 | if (!strcmp(name, "dummy")) 407 | return dummy_candidate; 408 | else if (!strcmp(name, "sliding")) 409 | return sliding_candidate; 410 | else if (!strcmp(name, "structured")) 411 | return structured_candidate; 412 | else if (!strcmp(name, "havoc")) 413 | return havoc_candidate; 414 | else if (!strcmp(name, "manual")) 415 | return manual_candidate; 416 | errx(1, "invalid mutator: %s", name); 417 | } 418 | -------------------------------------------------------------------------------- /src/mishegos/mutator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mish_common.h" 4 | 5 | #include 6 | 7 | /* Generate a single fuzzing candidate and populate the given input slot with it. 8 | * Returns false if the configured mutation mode has been exhausted. 9 | */ 10 | typedef bool (*mutator_t)(input_slot *); 11 | 12 | mutator_t mutator_create(const char *name); 13 | -------------------------------------------------------------------------------- /src/mishmat/mishmat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # mishmat: Generate a matrix visualization of mishegos results in HTML 5 | 6 | require "erb" 7 | require "optparse" 8 | require "json" 9 | 10 | COLOR_TABLE = Hash.new("#808080").update( 11 | "success" => "#00FF00", 12 | "failure" => "#FF0000", 13 | "none" => "#0000FF", 14 | "partial" => "#808080", 15 | "unknown" => "#4B0082" 16 | ) 17 | 18 | HEADER = ERB.new <<~HTML 19 | 20 | 21 | 22 | 23 | 24 | 25 | 44 | 45 | 46 | 47 |

<h1>mishmat (mishegos)</h1> 48 | 49 | <h2>legend</h2> 50 | <table> 51 | <% COLOR_TABLE.each do |color, hex| %> 52 | <tr> 53 | <td style="background-color: <%= hex %>"> 54 | <%= color %> 55 | </td></tr> 56 | <% end %> 57 | </table> 58 | 59 | <table> 60 | 61 | 62 | HTML 63 | 64 | HEADER_ROWS = ERB.new <<~HTML 65 | <tr> 66 | <th> 67 | worker 68 | / 69 | input 70 | </th> 71 | 72 | <% row[:outputs].each do |col| %> 73 | <th> 74 | <%= col[:worker_so] %> 75 | </th> 76 | <% end %> 77 | </tr> 78 | HTML 79 | 80 | ENTRY_ROW = ERB.new <<~HTML 81 | <tr> 82 | <td> 83 | <%= row[:input] %> 84 | </td> 85 | <% row[:outputs].each do |col| %> 86 | <td style="background-color: <%= COLOR_TABLE[col[:status][:name]] %>"> 87 | <%= col[:result] %> (<%= col[:ndecoded] %> / <%= col[:len] %>) 88 | </td> 89 | <% end %> 90 | </tr> 91 | HTML 92 | 93 | FOOTER = <<~HTML 94 | </table></body></html>
95 | 96 | 97 | HTML 98 | 99 | opts = { 100 | limit: Float::INFINITY, 101 | } 102 | 103 | def write_header!(row) 104 | STDOUT.puts HEADER.result(binding) 105 | STDOUT.puts HEADER_ROWS.result(binding) 106 | end 107 | 108 | def write_footer! 109 | STDOUT.puts FOOTER 110 | end 111 | 112 | def write_row!(row) 113 | STDOUT.puts ENTRY_ROW.result(binding) 114 | end 115 | 116 | OptionParser.new do |o| 117 | o.banner = "Usage: mishmat [options]" 118 | 119 | o.on "-l", "--limit LIMIT", Integer, "Entry cap" do |limit| 120 | opts[:limit] = limit 121 | end 122 | end.parse! 123 | 124 | # Special-case the first row: We need to grab the figure out the appropriate number of 125 | # columns and their headers. 126 | row = JSON.parse STDIN.gets, symbolize_names: true 127 | write_header! row 128 | write_row! row 129 | 130 | STDIN.each_line.with_index do |line, i| 131 | break if i >= opts[:limit] 132 | 133 | write_row! JSON.parse(line, symbolize_names: true) 134 | end 135 | 136 | write_footer! 137 | -------------------------------------------------------------------------------- /src/worker/Makefile: -------------------------------------------------------------------------------- 1 | WORKERS = bfd \ 2 | capstone \ 3 | dynamorio \ 4 | fadec \ 5 | xed \ 6 | zydis \ 7 | bddisasm \ 8 | iced \ 9 | yaxpeax-x86 \ 10 | ghidra \ 11 | llvm 12 | 13 | .PHONY: all 14 | all: $(WORKERS) 15 | 16 | .PHONY: $(WORKERS) 17 | $(WORKERS): 18 | $(MAKE) \ 19 | CFLAGS="$(CFLAGS) -fPIC" \ 20 | LDFLAGS="-shared -Wl,-z,defs" \ 21 | -C $@/ 22 | 23 | .PHONY: clean 24 | clean: 25 | for dir in $(WORKERS); do \ 26 | $(MAKE) -C $$dir/ clean; \ 27 | done 28 | -------------------------------------------------------------------------------- /src/worker/bddisasm/Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -Ibddisasm/inc 2 | override LDFLAGS := $(LDFLAGS) -Lbddisasm/build 3 | override LDLIBS := $(LDLIBS) -lbddisasm 4 | 5 | .PHONY: all 6 | all: bddisasm.so 7 | 8 | bddisasm/build/bddisasm.a: 9 | cmake -B bddisasm/build -S bddisasm -DCMAKE_BUILD_TYPE=Release -DBDD_INCLUDE_TOOL=OFF -DBDD_INCLUDE_ISAGENERATOR=OFF 10 | cmake --build bddisasm/build --target bddisasm --parallel 11 | 12 | bddisasm.so: bddisasm/build/bddisasm.a bddisasm.o 13 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) bddisasm.o $(LDLIBS) -o $@ 14 | 15 | .PHONY: clean 16 | clean: 17 | rm -rf bddisasm/build 18 | rm -rf *.o *.so 19 | -------------------------------------------------------------------------------- /src/worker/bddisasm/bddisasm.c: -------------------------------------------------------------------------------- 1 | #include "bddisasm/inc/bddisasm.h" 2 | #include "../worker.h" 3 | 4 | char *worker_name = "bddisasm"; 5 | 6 | static const char *bddisasm_strerror(NDSTATUS ndstatus) { 7 | switch (ndstatus) { 8 | case ND_STATUS_BUFFER_TOO_SMALL: { 9 | return "The provided input buffer is too small."; 10 | } 11 | case ND_STATUS_INVALID_ENCODING: { 12 | return "Invalid encoding/instruction."; 13 | } 14 | case ND_STATUS_INSTRUCTION_TOO_LONG: { 15 | return "Instruction exceeds the maximum 15 bytes."; 16 | } 17 | case ND_STATUS_INVALID_PREFIX_SEQUENCE: { 18 | return "Invalid prefix sequence is present."; 19 | } 20 | case ND_STATUS_INVALID_REGISTER_IN_INSTRUCTION: { 21 | return "The instruction uses an invalid register."; 22 | } 23 | case ND_STATUS_XOP_WITH_PREFIX: { 24 | return "XOP is present, but also a legacy prefix."; 25 | } 26 | case ND_STATUS_VEX_WITH_PREFIX: { 27 | return "VEX is present, but also a legacy 
prefix."; 28 | } 29 | case ND_STATUS_EVEX_WITH_PREFIX: { 30 | return "EVEX is present, but also a legacy prefix."; 31 | } 32 | case ND_STATUS_INVALID_ENCODING_IN_MODE: { 33 | return "Invalid encoding/instruction."; 34 | } 35 | case ND_STATUS_BAD_LOCK_PREFIX: { 36 | return "Invalid usage of LOCK."; 37 | } 38 | case ND_STATUS_CS_LOAD: { 39 | return "An attempt to load the CS register."; 40 | } 41 | case ND_STATUS_66_NOT_ACCEPTED: { 42 | return "0x66 prefix is not accepted."; 43 | } 44 | case ND_STATUS_16_BIT_ADDRESSING_NOT_SUPPORTED: { 45 | return "16 bit addressing mode not supported."; 46 | } 47 | case ND_STATUS_RIP_REL_ADDRESSING_NOT_SUPPORTED: { 48 | return "RIP-relative addressing not supported."; 49 | } 50 | case ND_STATUS_VSIB_WITHOUT_SIB: { 51 | return "Instruction uses VSIB, but SIB is not present."; 52 | } 53 | case ND_STATUS_INVALID_VSIB_REGS: { 54 | return "VSIB addressing, same vector reg used more than once."; 55 | } 56 | case ND_STATUS_VEX_VVVV_MUST_BE_ZERO: { 57 | return "VEX.VVVV field must be zero."; 58 | } 59 | case ND_STATUS_MASK_NOT_SUPPORTED: { 60 | return "Masking is not supported."; 61 | } 62 | case ND_STATUS_MASK_REQUIRED: { 63 | return "Masking is mandatory."; 64 | } 65 | case ND_STATUS_ER_SAE_NOT_SUPPORTED: { 66 | return "Embedded rounding/SAE not supported."; 67 | } 68 | case ND_STATUS_ZEROING_NOT_SUPPORTED: { 69 | return "Zeroing not supported."; 70 | } 71 | case ND_STATUS_ZEROING_ON_MEMORY: { 72 | return "Zeroing on memory."; 73 | } 74 | case ND_STATUS_ZEROING_NO_MASK: { 75 | return "Zeroing without masking."; 76 | } 77 | case ND_STATUS_BROADCAST_NOT_SUPPORTED: { 78 | return "Broadcast not supported."; 79 | } 80 | case ND_STATUS_INVALID_PARAMETER: { 81 | return "An invalid parameter was provided."; 82 | } 83 | case ND_STATUS_INVALID_INSTRUX: { 84 | return "The INSTRUX contains unexpected values."; 85 | } 86 | case ND_STATUS_BUFFER_OVERFLOW: { 87 | return "Not enough space is available to format textual disasm."; 88 | } 89 | case ND_STATUS_INTERNAL_ERROR: { 90 | return "Internal error occurred."; 91 | } 92 | default: { 93 | return "unknown"; 94 | } 95 | } 96 | } 97 | 98 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 99 | _unused(bddisasm_strerror); 100 | 101 | INSTRUX instruction; 102 | NDSTATUS ndstatus; 103 | 104 | ndstatus = NdDecodeEx(&instruction, raw_insn, length, ND_CODE_64, ND_DATA_64); 105 | if (!ND_SUCCESS(ndstatus)) { 106 | DLOG("bddisasm decoding failed: %s", bddisasm_strerror(ndstatus)); 107 | result->status = S_FAILURE; 108 | return; 109 | } 110 | 111 | ndstatus = NdToText(&instruction, 0, sizeof(result->result), result->result); 112 | if (!ND_SUCCESS(ndstatus)) { 113 | DLOG("bddisasm formatting failed: %s", bddisasm_strerror(ndstatus)); 114 | result->status = S_FAILURE; 115 | return; 116 | } 117 | 118 | result->status = S_SUCCESS; 119 | result->len = strlen(result->result); 120 | result->ndecoded = instruction.Length; 121 | } 122 | -------------------------------------------------------------------------------- /src/worker/bfd/Makefile: -------------------------------------------------------------------------------- 1 | UNAME := $(shell uname) 2 | 3 | ifeq ($(UNAME), Darwin) 4 | # I can't even begin to describe how annoying this is: 5 | # 1. libbfd's headers error out if some defines provided by a config.h are 6 | # missing. But config.h is also missing, because libbfd is considered 7 | # an "internal" library by the GNU binutils maintainers. 8 | # So we stub those in. 9 | # 2. 
By default, binutils isn't packaged with libiberty on macOS. 10 | # It has to be built manually with `--enable-install-libiberty`, which 11 | # then needs to be manually linked in. The code below assumes the manual 12 | # build was done with Homebrew. 13 | SNEAKY_MAKE_BFD_INCLUDES_WORK_DEFINES := -DPACKAGE=nice-try-bfd-maintainers -DPACKAGE_VERSION=1 14 | override CPPFLAGS := $(CPPFLAGS) \ 15 | $(SNEAKY_MAKE_BFD_INCLUDES_WORK_DEFINES) \ 16 | -I/usr/local/opt/binutils/include 17 | override LDFLAGS := $(LDFLAGS) -L/usr/local/opt/binutils/lib -liberty -lz 18 | endif 19 | 20 | override LDLIBS := $(LDLIBS) -lbfd -lopcodes 21 | 22 | .PHONY: all 23 | all: bfd.so 24 | 25 | bfd.so: bfd.o 26 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ 27 | 28 | .PHONY: clean 29 | clean: 30 | rm -rf *.o *.so 31 | -------------------------------------------------------------------------------- /src/worker/bfd/bfd.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../worker.h" 5 | 6 | /* BFD (libopcodes) adapter for mishegos. 7 | * 8 | * Some notes: 9 | * 1. libopcodes is (almost) completely undocumented. As such, a lot of the calls 10 | * here are educated guesses + me reading the header file + reading the objdump 11 | * source. 12 | * 2. This code is terrible. My original idea was to use a memfd to slurp the data 13 | * coming from the fprintf callback, but didn't work for reasons. 14 | * So, I switched it to a big old buffer + (v)snprintf, and that 15 | * seems to mostly work. libopcodes isn't nice enough to print newlines 16 | * after each instruction for us, so I do that manually. 17 | */ 18 | 19 | static char disasm_buf[MISHEGOS_DEC_MAXLEN]; 20 | static size_t disasm_off; 21 | 22 | static disassembler_ftype disasm; 23 | static struct disassemble_info disasm_info; 24 | 25 | char *worker_name = "bfd"; 26 | 27 | static int dis_fprintf(void *_stream, const char *fmt, ...) { 28 | assert(disasm_off <= MISHEGOS_DEC_MAXLEN && "disassembly buffer overrun?"); 29 | 30 | size_t remaining_size = MISHEGOS_DEC_MAXLEN - disasm_off; 31 | assert(remaining_size > 0); 32 | 33 | va_list arg; 34 | va_start(arg, fmt); 35 | size_t bytes_written = vsnprintf(disasm_buf + disasm_off, remaining_size, fmt, arg); 36 | disasm_off += bytes_written; 37 | va_end(arg); 38 | return 0; 39 | } 40 | 41 | static void init_dis() { 42 | disasm = disassembler(bfd_arch_i386, false, bfd_mach_x86_64, NULL); 43 | if (disasm == NULL) { 44 | errx(1, "disassembler creation failed"); 45 | } 46 | } 47 | 48 | void worker_ctor() { 49 | init_dis(); 50 | } 51 | 52 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 53 | /* dis_fprintf doesn't actually use the stream argument, it just takes 54 | * disasm_buf from the module scope. 55 | * 56 | * I'm pretty sure most of this setup could go in init_dis, but it's here 57 | * because I ran into problems with the original memfd implementation. 58 | * Worth re-trying later. 
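*
* (If that re-try happens, one plausible -- untested -- shape is to hoist
* the init_disassemble_info/disassemble_init_for_target calls into init_dis
* and only repoint disasm_info.buffer and disasm_info.buffer_length here
* per call.)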
59 | */ 60 | init_disassemble_info(&disasm_info, NULL, dis_fprintf); 61 | disasm_info.disassembler_options = "intel-mnemonic"; 62 | disasm_info.arch = bfd_arch_i386; 63 | disasm_info.mach = bfd_mach_x86_64; 64 | disasm_info.read_memory_func = buffer_read_memory; 65 | disasm_info.buffer = raw_insn; 66 | disasm_info.buffer_vma = 0; 67 | disasm_info.buffer_length = length; 68 | disassemble_init_for_target(&disasm_info); 69 | 70 | memset(disasm_buf, 0, MISHEGOS_DEC_MAXLEN); 71 | 72 | disasm_off = 0; 73 | int pc = disasm(0, &disasm_info); /* disassembler_ftype returns int: negative on error, so don't widen it into a size_t */ 74 | 75 | /* Make sure each instruction is on its own line in the disassembly buffer. 76 | */ 77 | size_t nl = snprintf(disasm_buf + disasm_off, MISHEGOS_DEC_MAXLEN - disasm_off, "\n"); 78 | assert(nl == 1 && "should have written exactly one byte"); 79 | _unused(nl); 80 | disasm_off++; 81 | 82 | if (pc <= 0 || strstr(disasm_buf, "(bad)") != NULL) { 83 | result->status = S_FAILURE; 84 | } else { 85 | result->status = S_SUCCESS; 86 | } 87 | 88 | memcpy(result->result, disasm_buf, disasm_off); 89 | result->len = disasm_off; 90 | result->ndecoded = pc > 0 ? pc : 0; 91 | } 92 | -------------------------------------------------------------------------------- /src/worker/capstone/Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -Icapstone/include 2 | override LDFLAGS := $(LDFLAGS) -L./capstone 3 | override LDLIBS := $(LDLIBS) -lcapstone 4 | 5 | .PHONY: all 6 | all: capstone.so 7 | 8 | # This is some stupidity thanks to Capstone's misbehaving Makefile. 9 | MAKEOVERRIDES := $(filter-out CFLAGS=%,$(MAKEOVERRIDES)) 10 | MAKEOVERRIDES := $(filter-out CPPFLAGS=%,$(MAKEOVERRIDES)) 11 | MAKEOVERRIDES := $(filter-out LDFLAGS=%,$(MAKEOVERRIDES)) 12 | MAKEOVERRIDES := $(filter-out LDLIBS=%,$(MAKEOVERRIDES)) 13 | capstone/libcapstone.so.5: 14 | $(MAKE) -C capstone/ \ 15 | CAPSTONE_ARCHS="x86" CAPSTONE_X86_ATT_DISABLE=yes CAPSTONE_BUILD_CORE_ONLY=yes V=1 16 | 17 | capstone.so: capstone/libcapstone.so.5 capstone.o 18 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) capstone.o $(LDLIBS) -o $@ 19 | 20 | .PHONY: clean 21 | clean: 22 | rm -rf *.o *.so 23 | $(MAKE) -C capstone/ clean 24 | -------------------------------------------------------------------------------- /src/worker/capstone/capstone.c: -------------------------------------------------------------------------------- 1 | #include <capstone/capstone.h> 2 | 3 | #include "../worker.h" 4 | 5 | static csh cs_hnd; 6 | 7 | char *worker_name = "capstone"; 8 | 9 | void worker_ctor() { 10 | if (cs_open(CS_ARCH_X86, CS_MODE_64, &cs_hnd) != CS_ERR_OK) { 11 | errx(1, "cs_open"); 12 | } 13 | } 14 | 15 | void worker_dtor() { 16 | cs_close(&cs_hnd); 17 | } 18 | 19 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 20 | cs_insn *insn; 21 | size_t count = cs_disasm(cs_hnd, raw_insn, length, 0, 1, &insn); 22 | if (count > 0) { 23 | result->status = S_SUCCESS; 24 | result->len = 25 | snprintf(result->result, MISHEGOS_DEC_MAXLEN, "%s %s\n", insn[0].mnemonic, insn[0].op_str); 26 | result->ndecoded = insn[0].size; 27 | 28 | cs_free(insn, count); 29 | } else { 30 | result->status = S_FAILURE; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/worker/dynamorio/.gitignore: -------------------------------------------------------------------------------- 1 | obj/ 2 | -------------------------------------------------------------------------------- /src/worker/dynamorio/Makefile: 
-------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -DLINUX -DX86_64 -DDR_FAST_IR -Iobj/include 2 | override LDFLAGS := $(LDFLAGS) -Lobj/lib64 3 | override LDLIBS := $(LDLIBS) -ldrdecode -ldrlibc 4 | 5 | DYNAMORIO_BUILD_OPTS := -DBUILD_DRSTATS=NO \ 6 | -DBUILD_SAMPLES=NO \ 7 | -DBUILD_EXT=NO \ 8 | -DBUILD_CLIENTS=NO 9 | 10 | .PHONY: all 11 | all: dynamorio.so 12 | 13 | dynamorio.so: dynamorio.o 14 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) dynamorio.o $(LDLIBS) -o $@ 15 | 16 | obj/lib64/libdrdecode.a: 17 | mkdir -p obj && \ 18 | cd obj && \ 19 | env -u CFLAGS -u CXXFLAGS \ 20 | cmake $(DYNAMORIO_BUILD_OPTS) ../dynamorio && \ 21 | cmake --build . -- -j4 22 | 23 | dynamorio.o: dynamorio.c obj/lib64/libdrdecode.a 24 | 25 | .PHONY: clean 26 | clean: 27 | rm -rf *.so 28 | rm -rf *.o 29 | rm -rf obj 30 | -------------------------------------------------------------------------------- /src/worker/dynamorio/dynamorio.c: -------------------------------------------------------------------------------- 1 | #include <dr_api.h> 2 | 3 | #include "../worker.h" 4 | 5 | char *worker_name = "dynamorio"; 6 | 7 | void worker_ctor() { 8 | disassemble_set_syntax(DR_DISASM_INTEL); 9 | } 10 | 11 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 12 | instr_t instr; 13 | instr_init(GLOBAL_DCONTEXT, &instr); 14 | uint8_t *next_pc = decode(GLOBAL_DCONTEXT, raw_insn, &instr); 15 | if (next_pc == NULL) { 16 | DLOG("dr decode failed"); 17 | result->status = S_FAILURE; 18 | return; 19 | } 20 | 21 | size_t len = 22 | instr_disassemble_to_buffer(GLOBAL_DCONTEXT, &instr, result->result, MISHEGOS_DEC_MAXLEN); 23 | instr_free(GLOBAL_DCONTEXT, &instr); 24 | result->status = S_SUCCESS; 25 | result->len = len; 26 | result->ndecoded = next_pc - raw_insn; 27 | } 28 | -------------------------------------------------------------------------------- /src/worker/fadec/Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS := $(CPPFLAGS) -Ifadec -Ifadec/build 2 | override LDFLAGS := $(LDFLAGS) -Lfadec/build 3 | override LDLIBS := $(LDLIBS) -lfadec 4 | 5 | .PHONY: all 6 | all: fadec.so 7 | 8 | fadec/build/libfadec.a: 9 | mkdir -p fadec/build && cd fadec && \ 10 | env -u CPPFLAGS -u LDFLAGS -u LDLIBS meson build -Dbuildtype=release -Darchmode=only64 && \ 11 | ninja -C build -v 12 | 13 | fadec.so: fadec.c fadec/build/libfadec.a 14 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@ 15 | 16 | .PHONY: clean 17 | clean: 18 | rm -rf fadec/build 19 | rm -rf *.o *.so 20 | 21 | -------------------------------------------------------------------------------- /src/worker/fadec/fadec.c: -------------------------------------------------------------------------------- 1 | #include <fadec.h> 2 | 3 | #include "../worker.h" 4 | 5 | char *worker_name = "fadec"; 6 | 7 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 8 | FdInstr inst; 9 | int r = fd_decode(raw_insn, length, 64, 0, &inst); 10 | if (r > 0) { 11 | result->status = S_SUCCESS; 12 | fd_format(&inst, result->result, MISHEGOS_DEC_MAXLEN); 13 | result->len = strlen(result->result); 14 | result->ndecoded = FD_SIZE(&inst); 15 | } else { 16 | result->status = r == FD_ERR_PARTIAL ? 
S_PARTIAL : S_FAILURE; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/worker/ghidra/.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /ghidra.so 3 | -------------------------------------------------------------------------------- /src/worker/ghidra/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | project(mishegos_ghidra) 4 | 5 | # Pull in the CMake sleigh build support 6 | set(sleigh_BUILD_SLEIGHSPECS ON CACHE BOOL "" FORCE) 7 | add_subdirectory(sleigh-cmake EXCLUDE_FROM_ALL) 8 | 9 | add_library(mishegos_ghidra SHARED ghidra.cc sleighMishegos.cc) 10 | set_target_properties(mishegos_ghidra 11 | PROPERTIES 12 | OUTPUT_NAME ghidra 13 | PREFIX "" 14 | ) 15 | target_compile_features(mishegos_ghidra PUBLIC cxx_std_11) 16 | 17 | target_link_libraries(mishegos_ghidra PRIVATE sleigh::sla sleigh::decomp) 18 | 19 | # TODO: Not sure how to get this into the project for only linking against the 20 | # mishegos shared library 21 | get_filename_component(mishegos_include_dir "../../include" 22 | ABSOLUTE BASE_DIR ${PROJECT_SOURCE_DIR} 23 | ) 24 | target_include_directories(mishegos_ghidra 25 | PRIVATE "$<BUILD_INTERFACE:${mishegos_include_dir}>" 26 | ) 27 | 28 | add_dependencies(mishegos_ghidra 29 | sleigh_spec_x86-64 30 | ) 31 | -------------------------------------------------------------------------------- /src/worker/ghidra/Makefile: -------------------------------------------------------------------------------- 1 | override LDFLAGS := 2 | override CXXFLAGS := 3 | 4 | .PHONY: all 5 | all: ghidra.so 6 | 7 | ghidra.so: 8 | cmake -B build -S . \ 9 | -DCMAKE_BUILD_TYPE=Release \ 10 | -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ 11 | -Dsleigh_RELEASE_TYPE=HEAD \ 12 | "-DFETCHCONTENT_SOURCE_DIR_GHIDRASOURCE=./ghidra" && \ 13 | cmake --build build --verbose && \ 14 | cp build/ghidra.$(SO_SUFFIX) ./ghidra.so 15 | 16 | # Uncomment to build with address sanitizer. Remember to remove 'build' 17 | # directory if switching between the two 18 | #.PHONY: ghidra.so 19 | #.ONESHELL: 20 | #ghidra.so: 21 | # cmake -B build -S . 
\ 22 | # -DCMAKE_BUILD_TYPE=Debug \ 23 | # -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ 24 | # -Dsleigh_RELEASE_TYPE=HEAD \ 25 | # "-DCMAKE_C_FLAGS=-Wall -Wpedantic -Wextra -fsanitize=address" \ 26 | # "-DCMAKE_CXX_FLAGS=-Wall -Wpedantic -Wextra -Wconversion -Wsign-conversion -Wcast-qual -Wshadow -Wformat=2 -Wundef -fsanitize=address" \ 27 | # "-DCMAKE_MODULE_LINKER_FLAGS=-fsanitize=address" \ 28 | # "-DFETCHCONTENT_SOURCE_DIR_GHIDRASOURCE=./ghidra" 29 | # cmake --build build -j --verbose 30 | # cp build/ghidra.so ./ghidra.so 31 | 32 | .PHONY: clean 33 | clean: 34 | rm -rf build 35 | rm -f ./ghidra.so 36 | -------------------------------------------------------------------------------- /src/worker/ghidra/ghidra.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is based loosely on Ghidra's sleighexample.cc file for 3 | * initializing and loading bytes for sleigh to disassemble 4 | * https://github.com/NationalSecurityAgency/ghidra/blob/47f76c78d6b7d5c56a9256b0666620863805ff30/Ghidra/Features/Decompiler/src/decompile/cpp/sleighexample.cc 5 | */ 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "sleighMishegos.hh" 16 | #include "../worker.h" 17 | #include "mish_common.h" 18 | 19 | using namespace ghidra; 20 | 21 | extern "C" { 22 | const char *worker_name = (const char *)"ghidra"; 23 | 24 | void worker_ctor(); 25 | 26 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length); 27 | } 28 | 29 | // This is a tiny LoadImage class which feeds the executable bytes to the translator 30 | class MyLoadImage : public LoadImage { 31 | uintb baseaddr; 32 | int4 length; 33 | uint1 *data; 34 | 35 | public: 36 | // "nofile" doesn't have any special meaning. Just doing what was done in 37 | // sleighExample.cc 38 | MyLoadImage(uintb ad, uint1 *ptr, int4 sz) 39 | : LoadImage("nofile"), baseaddr{ad}, length{sz}, data{ptr} { 40 | } 41 | virtual void loadFill(uint1 *ptr, int4 size, const Address &addr) override; 42 | string getArchType(void) const override { 43 | return "x86:LE:64:default"; 44 | } 45 | virtual void adjustVma(long) override { 46 | } 47 | virtual void setData(uint1 *ptr, int4 sz) { 48 | this->data = ptr; 49 | this->length = sz; 50 | } 51 | }; 52 | 53 | // This is the only important method for the LoadImage. It returns bytes from the static array 54 | // depending on the address range requested 55 | void MyLoadImage::loadFill(uint1 *ptr, int4 size, const Address &addr) { 56 | uintb start = addr.getOffset(); 57 | uintb max = baseaddr + (length - 1); 58 | for (int4 i = 0; i < size; ++i) { // For every byte requested 59 | uintb curoff = start + i; // Calculate offset of byte 60 | if ((curoff < baseaddr) || (curoff > max)) { // If byte does not fall in window 61 | ptr[i] = 0; // return 0 62 | continue; 63 | } 64 | uintb diff = curoff - baseaddr; 65 | ptr[i] = data[(int4)diff]; // Otherwise return data from our window 66 | } 67 | } 68 | 69 | // Here is a simple class for emitting assembly. In this case, we send the strings straight 70 | // to the result.
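// (printAssembly() in sleighMishegos.cc invokes dump() exactly once per decoded
// instruction, so a single snprintf into result->result is all we need here.)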
71 | class AssemblyMishegos : public AssemblyEmit { 72 | decode_result *result; 73 | 74 | public: 75 | AssemblyMishegos(decode_result *dr) : result(dr){}; 76 | virtual void dump(const Address &, const string &mnem, const string &body) { 77 | result->status = S_SUCCESS; 78 | result->len = 79 | snprintf(result->result, MISHEGOS_DEC_MAXLEN, "%s %s\n", mnem.c_str(), body.c_str()); 80 | } 81 | }; 82 | 83 | static const uintb START_ADDRESS = 0x0; 84 | 85 | // Storing data files 86 | DocumentStorage &g_docstorage() { 87 | static DocumentStorage docstorage; 88 | return docstorage; 89 | } 90 | 91 | // Context for disassembly 92 | ContextInternal &g_context() { 93 | static ContextInternal context; 94 | return context; 95 | } 96 | 97 | // Loader for reading instruction bytes 98 | MyLoadImage &g_loader() { 99 | static MyLoadImage loader(START_ADDRESS, nullptr, 0); 100 | return loader; 101 | } 102 | 103 | // Translator for doing disassembly 104 | SleighMishegos &g_trans() { 105 | static SleighMishegos trans(&g_loader(), &g_context()); 106 | return trans; 107 | } 108 | 109 | void worker_ctor() { 110 | AttributeId::initialize(); 111 | ElementId::initialize(); 112 | 113 | SleighMishegos &trans = g_trans(); 114 | 115 | // Set up the assembler/pcode-translator 116 | string sleighfilename = "src/worker/ghidra/build/sleigh-cmake/specfiles/Ghidra/Processors/x86/" 117 | "data/languages/x86-64.sla"; 118 | // Need this for correctly setting up the 64 bit x86 mode 119 | string pspecfilename = "src/worker/ghidra/build/sleigh-cmake/specfiles/Ghidra/Processors/x86/" 120 | "data/languages/x86-64.pspec"; 121 | 122 | // Read sleigh and spec file into DOM 123 | DocumentStorage &docstorage = g_docstorage(); 124 | Element *sleighroot = docstorage.openDocument(sleighfilename)->getRoot(); 125 | docstorage.registerTag(sleighroot); 126 | Element *specroot = docstorage.openDocument(pspecfilename)->getRoot(); 127 | docstorage.registerTag(specroot); 128 | 129 | trans.initialize(docstorage); // Initialize the translator 130 | 131 | // Now that context symbol names are loaded by the translator 132 | // we can set the default context 133 | // This imitates what is done in 134 | // void Architecture::parseProcessorConfig(DocumentStorage &store) 135 | const Element *el = docstorage.getTag("processor_spec"); 136 | if (el == (const Element *)0) 137 | throw LowlevelError("No processor configuration tag found"); 138 | XmlDecode decoder(&trans, el); 139 | 140 | uint4 elemId = decoder.openElement(ELEM_PROCESSOR_SPEC); 141 | for (;;) { 142 | uint4 subId = decoder.peekElement(); 143 | if (subId == 0) 144 | break; 145 | else if (subId == ELEM_CONTEXT_DATA) { 146 | g_context().decodeFromSpec(decoder); 147 | break; 148 | } else { 149 | decoder.openElement(); 150 | decoder.closeElementSkipping(subId); 151 | } 152 | } 153 | decoder.closeElement(elemId); 154 | 155 | // Single instruction disasm. Prevent instructions from messing up future 156 | // instruction disassembly 157 | trans.allowContextSet(false); 158 | } 159 | 160 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 161 | MyLoadImage &loader = g_loader(); 162 | const SleighMishegos &trans = g_trans(); 163 | 164 | loader.setData(raw_insn, length); 165 | 166 | // Set up the disassembly dumper 167 | AssemblyMishegos assememit(result); 168 | 169 | // Starting disassembly address 170 | Address addr(trans.getDefaultCodeSpace(), START_ADDRESS); 171 | 172 | try { 173 | result->ndecoded = trans.printAssembly(assememit, addr); 174 | } catch (...) 
{ 175 | // Uncomment for debugging exception info 176 | // std::exception_ptr p = std::current_exception(); 177 | // std::cout << (p ? p.__cxa_exception_type()->name() : "null") << std::endl; 178 | 179 | result->status = S_FAILURE; 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/worker/ghidra/sleighMishegos.cc: -------------------------------------------------------------------------------- 1 | /* ### 2 | * IP: GHIDRA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * This file was copied from upstream 17 | * https://github.com/NationalSecurityAgency/ghidra/blob/2536099c0eb2683ee0e416a127f8a8795f8de853/Ghidra/Features/Decompiler/src/decompile/cpp/sleigh.cc 18 | * 19 | * Modified by Eric Kilmer at Trail of Bits 2022 20 | * Modified to better support mishegos single-shot disassembly by not using the 21 | * disassembly cache. This allows us to get new disassembly results at the same 22 | * address without having to reinitialize everything. 23 | * 24 | * This file has been modified in a way to minimize the diff from upstream. 25 | * There is dead code and other code artifacts that probably wouldn't be 26 | * written in the same way had this functionality been written fresh. 27 | */ 28 | #include "sleighMishegos.hh" 29 | #include 30 | 31 | namespace ghidra { 32 | 33 | PcodeCacher::PcodeCacher(void) 34 | 35 | { 36 | // We aim to allocate this array only once 37 | uint4 maxsize = 600; 38 | poolstart = new VarnodeData[ maxsize ]; 39 | endpool = poolstart + maxsize; 40 | curpool = poolstart; 41 | } 42 | 43 | PcodeCacher::~PcodeCacher(void) 44 | 45 | { 46 | delete [] poolstart; 47 | } 48 | 49 | /// Expand the VarnodeData pool so that \e size more elements fit, and return 50 | /// a pointer to first available element. 51 | /// \param size is the number of elements to expand the pool by 52 | /// \return the first available VarnodeData 53 | VarnodeData *PcodeCacher::expandPool(uint4 size) 54 | 55 | { 56 | uint4 curmax = endpool - poolstart; 57 | uint4 cursize = curpool - poolstart; 58 | if (cursize + size <= curmax) 59 | return curpool; // No expansion necessary 60 | uint4 increase = (cursize + size) - curmax; 61 | if (increase < 100) // Increase by at least 100 62 | increase = 100; 63 | 64 | uint4 newsize = curmax + increase; 65 | 66 | VarnodeData *newpool = new VarnodeData[newsize]; 67 | for(uint4 i=0;i<cursize;++i) 68 | newpool[i] = poolstart[i]; // Copy old data into the new pool 69 | 70 | // Update references into the old pool to point at the new pool 71 | vector<PcodeData>::iterator piter; 72 | for(piter=issued.begin();piter!=issued.end();++piter) { 73 | VarnodeData *outvar = (*piter).outvar; 74 | if (outvar != (VarnodeData *)0) 75 | (*piter).outvar = newpool + (outvar - poolstart); 76 | VarnodeData *invar = (*piter).invar; 77 | if (invar != (VarnodeData *)0) 78 | (*piter).invar = newpool + (invar - poolstart); 79 | } 80 | 81 | // Update references to labels as well 82 | list<RelativeRecord>::iterator iter; 83 | for(iter=label_refs.begin();iter!=label_refs.end();++iter) { 84 | VarnodeData *ref = (*iter).dataptr; 85 | (*iter).dataptr = newpool + (ref - poolstart); 86 | } 87 | 88 | delete [] poolstart; // Free up old pool 89 | poolstart = newpool; 90 | curpool = newpool + (cursize + size); 91 | endpool = newpool + newsize; 92 | return newpool + cursize; 93 | } 94 | 95 | /// Store off a reference to the Varnode and the absolute index of the next 96 | /// instruction. The Varnode must be an operand of the current instruction.
97 | /// \param ptr is the Varnode reference 98 | void PcodeCacher::addLabelRef(VarnodeData *ptr) 99 | 100 | { 101 | label_refs.emplace_back(); 102 | label_refs.back().dataptr = ptr; 103 | label_refs.back().calling_index = issued.size(); 104 | } 105 | 106 | /// The label has an id that is referred to by Varnodes holding 107 | /// intra-instruction branch targets, prior to converting 108 | /// them to a \e relative \e branch offset. The label is associated with 109 | /// the absolute index of the next PcodeData object to be issued, 110 | /// facilitating this conversion. 111 | /// \param id is the given id of the label 112 | void PcodeCacher::addLabel(uint4 id) 113 | 114 | { 115 | while(labels.size() <= id) 116 | labels.push_back(0xbadbeef); 117 | labels[ id ] = issued.size(); 118 | } 119 | 120 | void PcodeCacher::clear(void) 121 | 122 | { 123 | curpool = poolstart; 124 | issued.clear(); 125 | label_refs.clear(); 126 | labels.clear(); 127 | } 128 | 129 | /// Assuming all the PcodeData has been generated for an 130 | /// instruction, go resolve any relative offsets and back 131 | /// patch their value(s) into the PcodeData 132 | void PcodeCacher::resolveRelatives(void) 133 | 134 | { 135 | list<RelativeRecord>::const_iterator iter; 136 | for(iter=label_refs.begin();iter!=label_refs.end();++iter) { 137 | VarnodeData *ptr = (*iter).dataptr; 138 | uint4 id = ptr->offset; 139 | if ((id >= labels.size())||(labels[id] == 0xbadbeef)) 140 | throw LowlevelError("Reference to non-existent sleigh label"); 141 | // Calculate the relative index given the two absolute indices 142 | uintb res = labels[id] - (*iter).calling_index; 143 | res &= calc_mask( ptr->size ); 144 | ptr->offset = res; 145 | } 146 | } 147 | 148 | /// Each p-code operation is presented to the emitter via its dump() method. 149 | /// \param addr is the Address associated with the p-code operation 150 | /// \param emt is the emitter 151 | void PcodeCacher::emit(const Address &addr,PcodeEmit *emt) const 152 | 153 | { 154 | vector<PcodeData>::const_iterator iter; 155 | 156 | for(iter=issued.begin();iter!=issued.end();++iter) 157 | emt->dump(addr,(*iter).opc,(*iter).outvar,(*iter).invar,(*iter).isize); 158 | } 159 | 160 | /// \brief Generate a concrete VarnodeData object from the given template (VarnodeTpl) 161 | /// 162 | /// \param vntpl is the template to reference 163 | /// \param vn is the object to fill in with concrete values 164 | void SleighBuilder::generateLocation(const VarnodeTpl *vntpl,VarnodeData &vn) 165 | 166 | { 167 | vn.space = vntpl->getSpace().fixSpace(*walker); 168 | vn.size = vntpl->getSize().fix(*walker); 169 | if (vn.space == const_space) 170 | vn.offset = vntpl->getOffset().fix(*walker) & calc_mask(vn.size); 171 | else if (vn.space == uniq_space) { 172 | vn.offset = vntpl->getOffset().fix(*walker); 173 | vn.offset |= uniqueoffset; 174 | } 175 | else 176 | vn.offset = vn.space->wrapOffset(vntpl->getOffset().fix(*walker)); 177 | } 178 | 179 | /// \brief Generate a pointer VarnodeData from a dynamic template (VarnodeTpl) 180 | /// 181 | /// The symbol represents a value referenced through a dynamic pointer. 182 | /// This method generates the varnode representing the pointer itself and also 183 | /// returns the address space in anticipation of generating the LOAD or STORE 184 | /// that actually manipulates the value.
185 | /// \param vntpl is the dynamic template to reference 186 | /// \param vn is the object to fill with concrete values 187 | /// \return the address space being pointed to 188 | AddrSpace *SleighBuilder::generatePointer(const VarnodeTpl *vntpl,VarnodeData &vn) 189 | 190 | { 191 | const FixedHandle &hand(walker->getFixedHandle(vntpl->getOffset().getHandleIndex())); 192 | vn.space = hand.offset_space; 193 | vn.size = hand.offset_size; 194 | if (vn.space == const_space) 195 | vn.offset = hand.offset_offset & calc_mask(vn.size); 196 | else if (vn.space == uniq_space) 197 | vn.offset = hand.offset_offset | uniqueoffset; 198 | else 199 | vn.offset = vn.space->wrapOffset(hand.offset_offset); 200 | return hand.space; 201 | } 202 | 203 | void SleighBuilder::generatePointerAdd(PcodeData *op,const VarnodeTpl *vntpl) 204 | 205 | { 206 | uintb offsetPlus = vntpl->getOffset().getReal() & 0xffff; 207 | if (offsetPlus == 0) { 208 | return; 209 | } 210 | PcodeData *nextop = cache->allocateInstruction(); 211 | nextop->opc = op->opc; 212 | nextop->invar = op->invar; 213 | nextop->isize = op->isize; 214 | nextop->outvar = op->outvar; 215 | op->isize = 2; 216 | op->opc = CPUI_INT_ADD; 217 | VarnodeData *newparams = op->invar = cache->allocateVarnodes(2); 218 | newparams[0] = nextop->invar[1]; 219 | newparams[1].space = const_space; // Add in V_OFFSET_PLUS 220 | newparams[1].offset = offsetPlus; 221 | newparams[1].size = newparams[0].size; 222 | op->outvar = nextop->invar + 1; // Output of ADD is input to original op 223 | op->outvar->space = uniq_space; // Result of INT_ADD in special runtime temp 224 | op->outvar->offset = uniq_space->getTrans()->getUniqueStart(Translate::RUNTIME_BITRANGE_EA); 225 | } 226 | 227 | void SleighBuilder::dump(OpTpl *op) 228 | 229 | { // Dump one op through the low-level dump interface, 230 | // filling in dynamic loads and stores if necessary 231 | PcodeData *thisop; 232 | VarnodeData *invars; 233 | VarnodeData *loadvars; 234 | VarnodeData *storevars; 235 | VarnodeTpl *vn,*outvn; 236 | int4 isize = op->numInput(); 237 | // First build all the inputs 238 | invars = cache->allocateVarnodes(isize); 239 | for(int4 i=0;i<isize;++i) { 240 | vn = op->getIn(i); 241 | if (vn->isDynamic(*walker)) { 242 | generateLocation(vn,invars[i]); // Input of -op- is really temporary storage 243 | PcodeData *load_op = cache->allocateInstruction(); 244 | load_op->opc = CPUI_LOAD; 245 | load_op->outvar = invars + i; 246 | load_op->isize = 2; 247 | loadvars = load_op->invar = cache->allocateVarnodes(2); 248 | AddrSpace *spc = generatePointer(vn,loadvars[1]); 249 | loadvars[0].space = const_space; 250 | loadvars[0].offset = (uintb)(uintp)spc; 251 | loadvars[0].size = sizeof(spc); 252 | if (vn->getOffset().getSelect() == ConstTpl::v_offset_plus) 253 | generatePointerAdd(load_op, vn); 254 | } 255 | else 256 | generateLocation(vn,invars[i]); 257 | } 258 | if ((isize>0)&&(op->getIn(0)->isRelative())) { 259 | invars->offset += getLabelBase(); 260 | cache->addLabelRef(invars); 261 | } 262 | thisop = cache->allocateInstruction(); 263 | thisop->opc = op->getOpcode(); 264 | thisop->invar = invars; 265 | thisop->isize = isize; 266 | outvn = op->getOut(); 267 | if (outvn != (VarnodeTpl *)0) { 268 | if (outvn->isDynamic(*walker)) { 269 | storevars = cache->allocateVarnodes(3); 270 | generateLocation(outvn,storevars[2]); // Output of -op- is really temporary storage 271 | thisop->outvar = storevars+2; 272 | PcodeData *store_op = cache->allocateInstruction(); 273 | store_op->opc = CPUI_STORE; 274 | store_op->isize = 3; 275 | // store_op->outvar = 
(VarnodeData *)0; 276 | store_op->invar = storevars; 277 | AddrSpace *spc = generatePointer(outvn,storevars[1]); // pointer 278 | storevars[0].space = const_space; 279 | storevars[0].offset = (uintb)(uintp)spc; // space in which to store 280 | storevars[0].size = sizeof(spc); 281 | if (outvn->getOffset().getSelect() == ConstTpl::v_offset_plus) 282 | generatePointerAdd(store_op,outvn); 283 | } 284 | else { 285 | thisop->outvar = cache->allocateVarnodes(1); 286 | generateLocation(outvn,*thisop->outvar); 287 | } 288 | } 289 | } 290 | 291 | /// \brief Build a named p-code section of a constructor that contains only implied BUILD directives 292 | /// 293 | /// If a named section of a constructor is empty, we still need to walk 294 | /// through any subtables that might contain p-code in their named sections. 295 | /// This method treats each subtable operand as an implied \e build directive, 296 | /// in the otherwise empty section. 297 | /// \param ct is the Constructor currently being built 298 | /// \param secnum is the particular \e named section number to build 299 | void SleighBuilder::buildEmpty(Constructor *ct,int4 secnum) 300 | 301 | { 302 | int4 numops = ct->getNumOperands(); 303 | 304 | for(int4 i=0;i<numops;++i) { 305 | SubtableSymbol *sym = (SubtableSymbol *)ct->getOperand(i)->getDefiningSymbol(); 306 | if (sym == (SubtableSymbol *)0) continue; 307 | if (sym->getType() != SleighSymbol::subtable_symbol) continue; 308 | 309 | walker->pushOperand(i); 310 | ConstructTpl *construct = walker->getConstructor()->getNamedTempl(secnum); 311 | if (construct == (ConstructTpl *)0) 312 | buildEmpty(walker->getConstructor(),secnum); 313 | else 314 | build(construct,secnum); 315 | walker->popOperand(); 316 | } 317 | } 318 | 319 | /// Bits used to make temporary registers unique across multiple instructions 320 | /// are generated based on the given address.
321 | /// \param addr is the given Address 322 | void SleighBuilder::setUniqueOffset(const Address &addr) 323 | 324 | { 325 | uniqueoffset = (addr.getOffset() & uniquemask)<<4; 326 | } 327 | 328 | /// \brief Constructor 329 | /// 330 | /// \param w is the parsed instruction 331 | /// \param dcache is a cache of nearby instruction parses 332 | /// \param pc will hold the PcodeData and VarnodeData objects produced by \b this builder 333 | /// \param cspc is the constant address space 334 | /// \param uspc is the unique address space 335 | /// \param umask is the mask to use to find unique bits within an Address 336 | SleighBuilder::SleighBuilder(ParserWalker *w,DisassemblyCache *dcache,PcodeCacher *pc,AddrSpace *cspc, 337 | AddrSpace *uspc,uint4 umask) 338 | : PcodeBuilder(0) 339 | { 340 | walker = w; 341 | discache = dcache; 342 | cache = pc; 343 | const_space = cspc; 344 | uniq_space = uspc; 345 | uniquemask = umask; 346 | uniqueoffset = (walker->getAddr().getOffset() & uniquemask)<<4; 347 | } 348 | 349 | void SleighBuilder::appendBuild(OpTpl *bld,int4 secnum) 350 | 351 | { 352 | // Append p-code for a particular build statement 353 | int4 index = bld->getIn(0)->getOffset().getReal(); // Recover operand index from build statement 354 | // Check if operand is a subtable 355 | SubtableSymbol *sym = (SubtableSymbol *)walker->getConstructor()->getOperand(index)->getDefiningSymbol(); 356 | if ((sym==(SubtableSymbol *)0)||(sym->getType() != SleighSymbol::subtable_symbol)) return; 357 | 358 | walker->pushOperand(index); 359 | Constructor *ct = walker->getConstructor(); 360 | if (secnum >=0) { 361 | ConstructTpl *construct = ct->getNamedTempl(secnum); 362 | if (construct == (ConstructTpl *)0) 363 | buildEmpty(ct,secnum); 364 | else 365 | build(construct,secnum); 366 | } 367 | else { 368 | ConstructTpl *construct = ct->getTempl(); 369 | build(construct,-1); 370 | } 371 | walker->popOperand(); 372 | } 373 | 374 | void SleighBuilder::delaySlot(OpTpl *op) 375 | 376 | { 377 | // Append pcode for an entire instruction (delay slot) 378 | // in the middle of the current instruction 379 | ParserWalker *tmp = walker; 380 | uintb olduniqueoffset = uniqueoffset; 381 | 382 | Address baseaddr = tmp->getAddr(); 383 | int4 fallOffset = tmp->getLength(); 384 | int4 delaySlotByteCnt = tmp->getParserContext()->getDelaySlot(); 385 | int4 bytecount = 0; 386 | do { 387 | Address newaddr = baseaddr + fallOffset; 388 | setUniqueOffset(newaddr); 389 | const ParserContext *pos = discache->getParserContext(newaddr); 390 | if (pos->getParserState() != ParserContext::pcode) 391 | throw LowlevelError("Could not obtain cached delay slot instruction"); 392 | int4 len = pos->getLength(); 393 | 394 | ParserWalker newwalker( pos ); 395 | walker = &newwalker; 396 | walker->baseState(); 397 | build(walker->getConstructor()->getTempl(),-1); // Build the whole delay slot 398 | fallOffset += len; 399 | bytecount += len; 400 | } while(bytecount < delaySlotByteCnt); 401 | walker = tmp; // Restore original context 402 | uniqueoffset = olduniqueoffset; 403 | } 404 | 405 | void SleighBuilder::setLabel(OpTpl *op) 406 | 407 | { 408 | cache->addLabel( op->getIn(0)->getOffset().getReal()+getLabelBase() ); 409 | } 410 | 411 | void SleighBuilder::appendCrossBuild(OpTpl *bld,int4 secnum) 412 | 413 | { 414 | // Weave in the p-code section from an instruction at another address 415 | // bld-param(0) contains the address of the instruction 416 | // bld-param(1) contains the section number 417 | if (secnum>=0) 418 | throw LowlevelError("CROSSBUILD 
directive within a named section"); 419 | secnum = bld->getIn(1)->getOffset().getReal(); 420 | VarnodeTpl *vn = bld->getIn(0); 421 | AddrSpace *spc = vn->getSpace().fixSpace(*walker); 422 | uintb addr = spc->wrapOffset( vn->getOffset().fix(*walker) ); 423 | 424 | ParserWalker *tmp = walker; 425 | uintb olduniqueoffset = uniqueoffset; 426 | 427 | Address newaddr(spc,addr); 428 | setUniqueOffset(newaddr); 429 | const ParserContext *pos = discache->getParserContext( newaddr ); 430 | if (pos->getParserState() != ParserContext::pcode) 431 | throw LowlevelError("Could not obtain cached crossbuild instruction"); 432 | 433 | ParserWalker newwalker( pos, tmp->getParserContext() ); 434 | walker = &newwalker; 435 | 436 | walker->baseState(); 437 | Constructor *ct = walker->getConstructor(); 438 | ConstructTpl *construct = ct->getNamedTempl(secnum); 439 | if (construct == (ConstructTpl *)0) 440 | buildEmpty(ct,secnum); 441 | else 442 | build(construct,secnum); 443 | walker = tmp; 444 | uniqueoffset = olduniqueoffset; 445 | } 446 | 447 | /// \param min is the minimum number of allocations before a reuse is expected 448 | /// \param hashsize is the number of elements in the hash-table 449 | void DisassemblyCache::initialize(int4 min,int4 hashsize) 450 | 451 | { 452 | minimumreuse = min; 453 | mask = hashsize-1; 454 | uintb masktest = coveringmask((uintb)mask); 455 | if (masktest != (uintb)mask) // -hashsize- must be a power of 2 456 | throw LowlevelError("Bad windowsize for disassembly cache"); 457 | list = new ParserContext *[minimumreuse]; 458 | nextfree = 0; 459 | hashtable = new ParserContext *[hashsize]; 460 | for(int4 i=0;i<minimumreuse;++i) { 461 | ParserContext *pos = new ParserContext(contextcache,translate); 462 | pos->initialize(75,20,constspace); 463 | list[i] = pos; 464 | } 465 | ParserContext *pos = list[0]; 466 | for(int4 i=0;i<hashsize;++i) // Initialize the hashtable so every slot points at a valid ParserContext 467 | hashtable[i] = pos; 468 | } 469 | 470 | /// Free all the ParserContext objects along with the hash-table 471 | void DisassemblyCache::free(void) 472 | 473 | { 474 | for(int4 i=0;i<minimumreuse;++i) 475 | delete list[i]; 476 | delete [] list; 477 | delete [] hashtable; 478 | } 479 | 480 | /// \param trans is the Translate object instantiating this cache 481 | /// \param ccache is the ContextCache front-end for the ContextDatabase 482 | /// \param cspace is the constant address space 483 | /// \param cachesize is the number of ParserContext objects to allocate 484 | /// \param windowsize is the size of the ParserContext hash-table 485 | DisassemblyCache::DisassemblyCache(Translate *trans,ContextCache *ccache,AddrSpace *cspace,int4 cachesize,int4 windowsize) 486 | 487 | { 488 | translate = trans; 489 | contextcache = ccache; 490 | constspace = cspace; 491 | initialize(cachesize,windowsize); // Set default settings for the cache 492 | } 493 | 494 | /// Return a (possibly cached) ParserContext for the given address. If the 495 | /// address hits in the hash-table, the existing (already parsed) context is 496 | /// returned; otherwise the least recently used slot in the circular list is 497 | /// recycled and reset for the new address. 498 | /// \param addr is the given Address 499 | /// \return the ParserContext for that address 500 | ParserContext *DisassemblyCache::getParserContext(const Address &addr) 501 | 502 | { 503 | int4 hashindex = ((int4)addr.getOffset()) & mask; 504 | ParserContext *res = hashtable[ hashindex ]; 505 | 506 | if (res->getAddr() == addr) 507 | return res; 508 | res = list[ nextfree ]; 509 | nextfree += 1; // Advance the circular index 510 | if (nextfree >= minimumreuse) 511 | nextfree = 0; 512 | res->setAddr(addr); 513 | res->setParserState(ParserContext::uninitialized); // Need to start over with parsing 514 | hashtable[ hashindex ] = res; // Stick it into the hashtable 515 | return res; 516 | } 517 | 518 | /// \param ld is the LoadImage to draw program bytes from 519 | /// \param c_db is the context database 520 | SleighMishegos::SleighMishegos(LoadImage *ld,ContextDatabase *c_db) 521 | : SleighBase() 522 | 523 | { 524 | loader = ld; 525 | context_db = c_db; 526 | cache = new ContextCache(c_db); 527 | discache = (DisassemblyCache *)0; 528 | pos = (ParserContext *)0; 529 | } 530 | 531 | void SleighMishegos::clearForDelete(void) 532 | 533 | { 534 | delete cache; 535 | if (discache != (DisassemblyCache *)0) 536 | delete discache; 537 | if (pos != (ParserContext *)0) 538 | delete pos; 539 | } 540 | 541 | SleighMishegos::~SleighMishegos(void) 542 | 543 | { 544 | clearForDelete(); 545 | } 546 | 547 | /// Completely clear everything except the base and reconstruct 548 | /// with a new LoadImage and ContextDatabase 549 | /// \param ld is the new LoadImage 550 | /// \param c_db is the new ContextDatabase 551 | void SleighMishegos::reset(LoadImage *ld,ContextDatabase *c_db) 552 | 553 | { 554 | clearForDelete(); 555 | pcode_cache.clear(); 556 | loader = ld; 557 | context_db = c_db; 558 | cache = new ContextCache(c_db); 559 | discache = (DisassemblyCache *)0; 560 | pos = (ParserContext *)0; 561 | } 562 | 563 | /// The .sla file from the document store is loaded and cache objects are prepared 564 | /// \param store is the document store containing the main \<sleigh> tag.
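/// (Unlike upstream Sleigh::initialize, this variant builds no DisassemblyCache;
/// it allocates the single reusable ParserContext \e pos that obtainContext()
/// hands back for every request.)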
565 | void SleighMishegos::initialize(DocumentStorage &store) 566 | 567 | { 568 | if (!isInitialized()) { // Initialize the base if not already 569 | const Element *el = store.getTag("sleigh"); 570 | if (el == (const Element *)0) 571 | throw LowlevelError("Could not find sleigh tag"); 572 | restoreXml(el); 573 | } 574 | else 575 | reregisterContext(); 576 | uint4 parser_cachesize = 2; 577 | uint4 parser_windowsize = 32; 578 | if ((maxdelayslotbytes > 1)||(unique_allocatemask != 0)) { 579 | parser_cachesize = 8; 580 | parser_windowsize = 256; 581 | } 582 | pos = new ParserContext(cache,this); 583 | // Values taken from (now removed) DisassemblyCache::initialize. No 584 | // explanation for magic values 585 | pos->initialize(75, 20, getConstantSpace()); 586 | } 587 | 588 | /// \brief Obtain a parse tree for the instruction at the given address 589 | /// 590 | /// The tree may be cached from a previous access. If the address 591 | /// has not been parsed, disassembly is performed, and a new parse tree 592 | /// is prepared. Depending on the desired \e state, the parse tree 593 | /// can be prepared either for disassembly or for p-code generation. 594 | /// \param addr is the given address of the instruction 595 | /// \param state is the desired parse state. 596 | /// \return the parse tree object (ParseContext) 597 | ParserContext *SleighMishegos::obtainContext(const Address &addr,int4 state) const 598 | 599 | { 600 | pos->setAddr(addr); 601 | pos->setParserState(ParserContext::uninitialized); 602 | int4 curstate = pos->getParserState(); 603 | if (curstate >= state) 604 | return pos; 605 | if (curstate == ParserContext::uninitialized) { 606 | resolve(*pos); 607 | if (state == ParserContext::disassembly) 608 | return pos; 609 | } 610 | // If we reach here, state must be ParserContext::pcode 611 | resolveHandles(*pos); 612 | return pos; 613 | } 614 | 615 | /// Resolve \e all the constructors involved in the instruction at the indicated address 616 | /// \param pos is the parse object that will hold the resulting tree 617 | void SleighMishegos::resolve(ParserContext &pos) const 618 | 619 | { 620 | loader->loadFill(pos.getBuffer(),16,pos.getAddr()); 621 | ParserWalkerChange walker(&pos); 622 | pos.deallocateState(walker); // Clear the previous resolve and initialize the walker 623 | Constructor *ct,*subct; 624 | uint4 off; 625 | int4 oper,numoper; 626 | 627 | pos.setDelaySlot(0); 628 | walker.setOffset(0); // Initial offset 629 | pos.clearCommits(); // Clear any old context commits 630 | pos.loadContext(); // Get context for current address 631 | ct = root->resolve(walker); // Base constructor 632 | walker.setConstructor(ct); 633 | ct->applyContext(walker); 634 | while(walker.isState()) { 635 | ct = walker.getConstructor(); 636 | oper = walker.getOperand(); 637 | numoper = ct->getNumOperands(); 638 | while(oper < numoper) { 639 | OperandSymbol *sym = ct->getOperand(oper); 640 | off = walker.getOffset(sym->getOffsetBase()) + sym->getRelativeOffset(); 641 | pos.allocateOperand(oper,walker); // Descend into new operand and reserve space 642 | walker.setOffset(off); 643 | TripleSymbol *tsym = sym->getDefiningSymbol(); 644 | if (tsym != (TripleSymbol *)0) { 645 | subct = tsym->resolve(walker); 646 | if (subct != (Constructor *)0) { 647 | walker.setConstructor(subct); 648 | subct->applyContext(walker); 649 | break; 650 | } 651 | } 652 | walker.setCurrentLength(sym->getMinimumLength()); 653 | walker.popOperand(); 654 | oper += 1; 655 | } 656 | if (oper >= numoper) { // Finished processing constructor 
657 | walker.calcCurrentLength(ct->getMinimumLength(),numoper); 658 | walker.popOperand(); 659 | // Check for use of delayslot 660 | ConstructTpl *templ = ct->getTempl(); 661 | if ((templ != (ConstructTpl *)0)&&(templ->delaySlot() > 0)) 662 | pos.setDelaySlot(templ->delaySlot()); 663 | } 664 | } 665 | pos.setNaddr(pos.getAddr()+pos.getLength()); // Update Naddr to pointer after instruction 666 | pos.setParserState(ParserContext::disassembly); 667 | } 668 | 669 | /// Resolve handle templates for the given parse tree, assuming Constructors 670 | /// are already resolved. 671 | /// \param pos is the given parse tree 672 | void SleighMishegos::resolveHandles(ParserContext &pos) const 673 | 674 | { 675 | TripleSymbol *triple; 676 | Constructor *ct; 677 | int4 oper,numoper; 678 | 679 | ParserWalker walker(&pos); 680 | walker.baseState(); 681 | while(walker.isState()) { 682 | ct = walker.getConstructor(); 683 | oper = walker.getOperand(); 684 | numoper = ct->getNumOperands(); 685 | while(oper < numoper) { 686 | OperandSymbol *sym = ct->getOperand(oper); 687 | walker.pushOperand(oper); // Descend into node 688 | triple = sym->getDefiningSymbol(); 689 | if (triple != (TripleSymbol *)0) { 690 | if (triple->getType() == SleighSymbol::subtable_symbol) 691 | break; 692 | else // Some other kind of symbol as an operand 693 | triple->getFixedHandle(walker.getParentHandle(),walker); 694 | } 695 | else { // Must be an expression 696 | PatternExpression *patexp = sym->getDefiningExpression(); 697 | intb res = patexp->getValue(walker); 698 | FixedHandle &hand(walker.getParentHandle()); 699 | hand.space = pos.getConstSpace(); // Result of expression is a constant 700 | hand.offset_space = (AddrSpace *)0; 701 | hand.offset_offset = (uintb)res; 702 | hand.size = 0; // This size should not get used 703 | } 704 | walker.popOperand(); 705 | oper += 1; 706 | } 707 | if (oper >= numoper) { // Finished processing constructor 708 | ConstructTpl *templ = ct->getTempl(); 709 | if (templ != (ConstructTpl *)0) { 710 | HandleTpl *res = templ->getResult(); 711 | if (res != (HandleTpl *)0) // Pop up handle to containing operand 712 | res->fix(walker.getParentHandle(),walker); 713 | // If we need an indicator that the constructor exports nothing try 714 | // else 715 | // walker.getParentHandle().setInvalid(); 716 | } 717 | walker.popOperand(); 718 | } 719 | } 720 | pos.setParserState(ParserContext::pcode); 721 | } 722 | 723 | int4 SleighMishegos::instructionLength(const Address &baseaddr) const 724 | 725 | { 726 | ParserContext *pos = obtainContext(baseaddr,ParserContext::disassembly); 727 | return pos->getLength(); 728 | } 729 | 730 | int4 SleighMishegos::printAssembly(AssemblyEmit &emit,const Address &baseaddr) const 731 | 732 | { 733 | int4 sz; 734 | 735 | ParserContext *pos = obtainContext(baseaddr,ParserContext::disassembly); 736 | ParserWalker walker(pos); 737 | walker.baseState(); 738 | 739 | Constructor *ct = walker.getConstructor(); 740 | ostringstream mons; 741 | ct->printMnemonic(mons,walker); 742 | ostringstream body; 743 | ct->printBody(body,walker); 744 | emit.dump(baseaddr,mons.str(),body.str()); 745 | sz = pos->getLength(); 746 | return sz; 747 | } 748 | 749 | int4 SleighMishegos::oneInstruction(PcodeEmit &emit,const Address &baseaddr) const 750 | 751 | { 752 | throw UnimplError("Unimplemented oneInstruction", 0); 753 | int4 fallOffset; 754 | if (alignment != 1) { 755 | if ((baseaddr.getOffset() % alignment)!=0) { 756 | ostringstream s; 757 | s << "Instruction address not aligned: " << baseaddr; 758 | throw 
UnimplError(s.str(),0); 759 | } 760 | } 761 | 762 | ParserContext *pos = obtainContext(baseaddr,ParserContext::pcode); 763 | pos->applyCommits(); 764 | fallOffset = pos->getLength(); 765 | 766 | if (pos->getDelaySlot()>0) { 767 | int4 bytecount = 0; 768 | do { 769 | // Do not pass pos->getNaddr() to obtainContext, as pos may have been previously cached and had naddr adjusted 770 | ParserContext *delaypos = obtainContext(pos->getAddr() + fallOffset,ParserContext::pcode); 771 | delaypos->applyCommits(); 772 | int4 len = delaypos->getLength(); 773 | fallOffset += len; 774 | bytecount += len; 775 | } while(bytecount < pos->getDelaySlot()); 776 | pos->setNaddr(pos->getAddr()+fallOffset); 777 | } 778 | ParserWalker walker(pos); 779 | walker.baseState(); 780 | pcode_cache.clear(); 781 | SleighBuilder builder(&walker,discache,&pcode_cache,getConstantSpace(),getUniqueSpace(),unique_allocatemask); 782 | try { 783 | builder.build(walker.getConstructor()->getTempl(),-1); 784 | pcode_cache.resolveRelatives(); 785 | pcode_cache.emit(baseaddr,&emit); 786 | } catch(UnimplError &err) { 787 | ostringstream s; 788 | s << "Instruction not implemented in pcode:\n "; 789 | ParserWalker *cur = builder.getCurrentWalker(); 790 | cur->baseState(); 791 | Constructor *ct = cur->getConstructor(); 792 | cur->getAddr().printRaw(s); 793 | s << ": "; 794 | ct->printMnemonic(s,*cur); 795 | s << " "; 796 | ct->printBody(s,*cur); 797 | err.explain = s.str(); 798 | err.instruction_length = fallOffset; 799 | throw err; 800 | } 801 | return fallOffset; 802 | } 803 | 804 | void SleighMishegos::registerContext(const string &name,int4 sbit,int4 ebit) 805 | 806 | { 807 | context_db->registerVariable(name,sbit,ebit); 808 | } 809 | 810 | void SleighMishegos::setContextDefault(const string &name,uintm val) 811 | 812 | { 813 | context_db->setVariableDefault(name,val); 814 | } 815 | 816 | void SleighMishegos::allowContextSet(bool val) const 817 | 818 | { 819 | cache->allowSet(val); 820 | } 821 | 822 | } // End namespace ghidra 823 | -------------------------------------------------------------------------------- /src/worker/ghidra/sleighMishegos.hh: -------------------------------------------------------------------------------- 1 | /* ### 2 | * IP: GHIDRA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | * This file was copied from upstream 17 | * https://github.com/NationalSecurityAgency/ghidra/blob/2536099c0eb2683ee0e416a127f8a8795f8de853/Ghidra/Features/Decompiler/src/decompile/cpp/sleigh.hh 18 | * 19 | * Modified by Eric Kilmer at Trail of Bits 2022 20 | * Modified to better support mishegos single-shot disassembly by not using the 21 | * disassembly cache. This allows us to get new disassembly results at the same 22 | * address without having to reinitialize everything. 23 | * 24 | * This file has been modified in a way to minimize the diff from upstream. 
25 | * There is dead code and other code artifacts that probably wouldn't be 26 | * written in the same way had this functionality been written fresh. 27 | */ 28 | /// \file sleigh.hh 29 | /// \brief Classes and utilities for the main SLEIGH engine 30 | 31 | #ifndef __SLEIGHMISHEGOS__ 32 | #define __SLEIGHMISHEGOS__ 33 | 34 | #include 35 | 36 | namespace ghidra { 37 | 38 | class LoadImage; 39 | 40 | /// \brief Class for describing a relative p-code branch destination 41 | /// 42 | /// An intra-instruction p-code branch takes a \e relative operand. 43 | /// The actual value produced during p-code generation is calculated at 44 | /// the last second using \b this. It stores the index of the BRANCH 45 | /// instruction and a reference to its destination operand. This initially 46 | /// holds a reference to a destination \e label symbol, but is later updated 47 | /// with the final relative value. 48 | struct RelativeRecord { 49 | VarnodeData *dataptr; ///< Varnode indicating relative offset 50 | uintb calling_index; ///< Index of instruction containing relative offset 51 | }; 52 | 53 | /// \brief Data for building one p-code instruction 54 | /// 55 | /// Raw data used by the emitter to produce a single PcodeOp 56 | struct PcodeData { 57 | OpCode opc; ///< The op code 58 | VarnodeData *outvar; ///< Output Varnode data (or null) 59 | VarnodeData *invar; ///< Array of input Varnode data 60 | int4 isize; ///< Number of input Varnodes 61 | }; 62 | 63 | /// \brief Class for caching a chunk of p-code, prior to emitting 64 | /// 65 | /// The engine accumulates PcodeData and VarnodeData objects for 66 | /// a single instruction. Once the full instruction is constructed, 67 | /// the objects are passed to the emitter (PcodeEmit) via the emit() method. 68 | /// The class acts as a pool of memory for PcodeData and VarnodeData objects 69 | /// that can be reused repeatedly to emit multiple instructions. 
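/// A typical cycle, as driven by SleighMishegos::oneInstruction() in
/// sleighMishegos.cc, is: clear(), then allocateInstruction() and
/// allocateVarnodes() while walking the parse tree, then resolveRelatives(),
/// and finally emit().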
70 | class PcodeCacher { 71 | VarnodeData *poolstart; ///< Start of the pool of VarnodeData objects 72 | VarnodeData *curpool; ///< First unused VarnodeData 73 | VarnodeData *endpool; ///< End of the pool of VarnodeData objects 74 | vector<PcodeData> issued; ///< P-code ops issued for the current instruction 75 | list<RelativeRecord> label_refs; ///< References to labels 76 | vector<uint4> labels; ///< Locations of labels 77 | VarnodeData *expandPool(uint4 size); ///< Expand the memory pool 78 | public: 79 | PcodeCacher(void); ///< Constructor 80 | ~PcodeCacher(void); ///< Destructor 81 | 82 | /// \brief Allocate data objects for a new set of Varnodes 83 | /// 84 | /// \param size is the number of objects to allocate 85 | /// \return a pointer to the array of available VarnodeData objects 86 | VarnodeData *allocateVarnodes(uint4 size) { 87 | VarnodeData *newptr = curpool + size; 88 | if (newptr <= endpool) { 89 | VarnodeData *res = curpool; 90 | curpool = newptr; 91 | return res; 92 | } 93 | return expandPool(size); 94 | } 95 | 96 | /// \brief Allocate a data object for a new p-code operation 97 | /// 98 | /// \return the new PcodeData object 99 | PcodeData *allocateInstruction(void) { 100 | issued.emplace_back(); 101 | PcodeData *res = &issued.back(); 102 | res->outvar = (VarnodeData *)0; 103 | res->invar = (VarnodeData *)0; 104 | return res; 105 | } 106 | void addLabelRef(VarnodeData *ptr); ///< Denote a Varnode holding a \e relative \e branch offset 107 | void addLabel(uint4 id); ///< Attach a label to the \e next p-code instruction 108 | void clear(void); ///< Reset the cache so that all objects are unallocated 109 | void resolveRelatives(void); ///< Rewrite branch target Varnodes as \e relative offsets 110 | void emit(const Address &addr,PcodeEmit *emt) const; ///< Pass the cached p-code data to the emitter 111 | }; 112 | 113 | /// \brief A container for disassembly context used by the SLEIGH engine 114 | /// 115 | /// This acts as a factory for the ParserContext objects which are used to disassemble 116 | /// a single instruction. These all share a ContextCache which is a front end for 117 | /// accessing the ContextDatabase and resolving context variables from the SLEIGH spec. 118 | /// ParserContext objects are stored in a hash-table keyed by the address of the instruction.
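/// (In this mishegos variant the class is effectively dead code, retained to
/// minimize the upstream diff: SleighMishegos never constructs a
/// DisassemblyCache, and its obtainContext() reuses a single ParserContext
/// instead.)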
119 | class DisassemblyCache { 120 | Translate *translate; ///< The Translate object that owns this cache 121 | ContextCache *contextcache; ///< Cached values from the ContextDatabase 122 | AddrSpace *constspace; ///< The constant address space 123 | int4 minimumreuse; ///< Can call getParserContext this many times, before a ParserContext is reused 124 | uint4 mask; ///< Size of the hashtable in form 2^n-1 125 | ParserContext **list; ///< (circular) array of currently cached ParserContext objects 126 | int4 nextfree; ///< Current end/beginning of circular list 127 | ParserContext **hashtable; ///< Hashtable for looking up ParserContext via Address 128 | void initialize(int4 min,int4 hashsize); ///< Initialize the hash-table of ParserContexts 129 | void free(void); ///< Free the hash-table of ParserContexts 130 | public: 131 | DisassemblyCache(Translate *trans,ContextCache *ccache,AddrSpace *cspace,int4 cachesize,int4 windowsize); ///< Constructor 132 | ~DisassemblyCache(void) { free(); } ///< Destructor 133 | ParserContext *getParserContext(const Address &addr); ///< Get the parser for a particular Address 134 | }; 135 | 136 | /// \brief Build p-code from a pre-parsed instruction 137 | /// 138 | /// Through the build() method, \b this walks the parse tree and prepares data 139 | /// for final emission as p-code. (The final emitting is done separately through the 140 | /// PcodeCacher.emit() method). Generally, only p-code for one instruction is prepared. 141 | /// But, through the \b delay-slot mechanism, build() may recursively visit 142 | /// additional instructions. 143 | class SleighBuilder : public PcodeBuilder { 144 | virtual void dump( OpTpl *op ); 145 | AddrSpace *const_space; ///< The constant address space 146 | AddrSpace *uniq_space; ///< The unique address space 147 | uintb uniquemask; ///< Mask of address bits to use to uniquify temporary registers 148 | uintb uniqueoffset; ///< Uniquifier bits for \b this instruction 149 | DisassemblyCache *discache; ///< Cache of disassembled instructions 150 | PcodeCacher *cache; ///< Cache accumulating p-code data for the instruction 151 | void buildEmpty(Constructor *ct,int4 secnum); 152 | void generateLocation(const VarnodeTpl *vntpl,VarnodeData &vn); 153 | AddrSpace *generatePointer(const VarnodeTpl *vntpl,VarnodeData &vn); 154 | void generatePointerAdd(PcodeData *op,const VarnodeTpl *vntpl); 155 | void setUniqueOffset(const Address &addr); ///< Set uniquifying bits for the current instruction 156 | public: 157 | SleighBuilder(ParserWalker *w,DisassemblyCache *dcache,PcodeCacher *pc,AddrSpace *cspc,AddrSpace *uspc,uint4 umask); 158 | virtual void appendBuild(OpTpl *bld,int4 secnum); 159 | virtual void delaySlot(OpTpl *op); 160 | virtual void setLabel(OpTpl *op); 161 | virtual void appendCrossBuild(OpTpl *bld,int4 secnum); 162 | }; 163 | 164 | /// \brief A full SLEIGH engine 165 | /// 166 | /// It's provided with a LoadImage of the bytes to be disassembled and 167 | /// a ContextDatabase. 168 | /// 169 | /// Assembly is produced via the printAssembly() method, provided with an 170 | /// AssemblyEmit object and an Address. 171 | /// 172 | /// P-code is produced via the oneInstruction() method, provided with a PcodeEmit 173 | /// object and an Address.
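/// (Here only the printAssembly() path is actually exercised by the worker;
/// oneInstruction() is stubbed to throw UnimplError immediately — see
/// sleighMishegos.cc.)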
174 | class SleighMishegos : public SleighBase { 175 | LoadImage *loader; ///< The mapped bytes in the program 176 | ContextDatabase *context_db; ///< Database of context values steering disassembly 177 | ContextCache *cache; ///< Cache of recently used context values 178 | mutable DisassemblyCache *discache; ///< Cache of recently parsed instructions 179 | mutable PcodeCacher pcode_cache; ///< Cache of p-code data just prior to emitting 180 | ParserContext *pos; 181 | void clearForDelete(void); ///< Delete the context and disassembly caches 182 | protected: 183 | ParserContext *obtainContext(const Address &addr,int4 state) const; 184 | void resolve(ParserContext &pos) const; ///< Generate a parse tree suitable for disassembly 185 | void resolveHandles(ParserContext &pos) const; ///< Prepare the parse tree for p-code generation 186 | public: 187 | SleighMishegos(LoadImage *ld,ContextDatabase *c_db); ///< Constructor 188 | virtual ~SleighMishegos(void); ///< Destructor 189 | void reset(LoadImage *ld,ContextDatabase *c_db); ///< Reset the engine for a new program 190 | virtual void initialize(DocumentStorage &store); 191 | virtual void registerContext(const string &name,int4 sbit,int4 ebit); 192 | virtual void setContextDefault(const string &nm,uintm val); 193 | virtual void allowContextSet(bool val) const; 194 | virtual int4 instructionLength(const Address &baseaddr) const; 195 | virtual int4 oneInstruction(PcodeEmit &emit,const Address &baseaddr) const; 196 | virtual int4 printAssembly(AssemblyEmit &emit,const Address &baseaddr) const; 197 | }; 198 | 199 | } // End namespace ghidra 200 | 201 | /** \page sleigh SLEIGH 202 | 203 | \section sleightoc Table of Contents 204 | 205 | - \ref sleighoverview 206 | - \ref sleighbuild 207 | - \ref sleighuse 208 | - \subpage sleighAPIbasic 209 | - \subpage sleighAPIemulate 210 | 211 | \b Key \b Classes 212 | - \ref Translate 213 | - \ref AssemblyEmit 214 | - \ref PcodeEmit 215 | - \ref LoadImage 216 | - \ref ContextDatabase 217 | 218 | \section sleighoverview Overview 219 | 220 | Welcome to \b SLEIGH, a machine language translation and 221 | disassembly engine. SLEIGH is both a processor 222 | specification language and the associated library and 223 | tools for using such a specification to generate assembly 224 | and to generate \b pcode, a reverse engineering Register 225 | Transfer Language (RTL), from binary machine instructions. 226 | 227 | SLEIGH was originally based on \b SLED, a 228 | \e Specification \e Language \e for \e Encoding \e and 229 | \e Decoding, designed by Norman Ramsey and Mary F. Fernandez, 230 | which performed disassembly (and assembly). SLEIGH 231 | extends SLED by providing semantic descriptions (via the 232 | RTL) of machine instructions and other practical enhancements 233 | for doing real world reverse engineering. 234 | 235 | SLEIGH is part of Project \b GHIDRA. It provides the core 236 | of the GHIDRA disassembler and the data-flow and 237 | decompilation analysis. However, SLEIGH can serve as a 238 | standalone library for use in other applications for 239 | providing a generic disassembly and RTL translation interface. 240 | 241 | \section sleighbuild Building SLEIGH 242 | 243 | There are a couple of \e make targets for building the SLEIGH 244 | library from source.
These are: 245 | 246 | \code 247 | make libsla.a # Build the main library 248 | 249 | make libsla_dbg.a # Build the library with debug symbols 250 | \endcode 251 | 252 | The source code file \e sleighexample.cc has a complete example 253 | of initializing the Translate engine and using it to generate 254 | assembly and pcode. The source has a hard-coded file name, 255 | \e x86testcode, as the example binary executable it attempts 256 | to decode, but this can easily be changed. It also needs 257 | a SLEIGH specification file (\e .sla) to be present. 258 | 259 | Building the example application can be done with something 260 | similar to the following makefile fragment. 261 | 262 | \code 263 | # The C++ compiler 264 | CXX=g++ 265 | 266 | # Debug flags 267 | DBG_CXXFLAGS=-g -Wall -Wno-sign-compare 268 | 269 | OPT_CXXFLAGS=-O2 -Wall -Wno-sign-compare 270 | 271 | # libraries 272 | INCLUDES=-I./src 273 | 274 | LNK=src/libsla_dbg.a 275 | 276 | sleighexample: sleighexample.o 277 | $(CXX) $(DBG_CXXFLAGS) -o sleighexample sleighexample.o $(LNK) 278 | 279 | clean: 280 | rm -rf *.o sleighexample 281 | \endcode 282 | 283 | \section sleighuse Using SLEIGH 284 | 285 | SLEIGH is a generic reverse engineering tool in the sense 286 | that the API is designed to be completely processor 287 | independent. In order to process binary executables for a 288 | specific processor, the library reads in a \e 289 | specification \e file, which describes how instructions 290 | are encoded and how they are interpreted by the processor. 291 | An application which needs to do disassembly or generate 292 | \b pcode can design to the SLEIGH API once, and then the 293 | application will automatically support any processor for 294 | which there is a specification. 295 | 296 | For working with a single processor, the SLEIGH library 297 | needs to load a single \e compiled form of the processor 298 | specification, which is traditionally given a ".sla" suffix. 299 | Most common processors already have a ".sla" file available. 300 | So to use SLEIGH with these processors, the library merely 301 | needs to be made aware of the desired file. This documentation 302 | covers the use of the SLEIGH API, assuming that this 303 | specification file is available. 304 | 305 | The ".sla" files themselves are created by running 306 | the \e compiler on a file written in the formal SLEIGH 307 | language. These files traditionally have the suffix ".slaspec". 308 | For those who want to design such a specification for a new 309 | processor, please refer to the document, "SLEIGH: A Language 310 | for Rapid Processor Specification." 311 | 312 | */ 313 | 314 | /** 315 | \page sleighAPIbasic The Basic SLEIGH Interface 316 | 317 | To use SLEIGH as a library within an application, there 318 | are basically five classes that you need to be aware of. 319 | 320 | - \ref sleightranslate 321 | - \ref sleighassememit 322 | - \ref sleighpcodeemit 323 | - \ref sleighloadimage 324 | - \ref sleighcontext 325 | 326 | \section sleightranslate Translate (or Sleigh) 327 | 328 | The core SLEIGH class is Sleigh, which is derived from the 329 | interface, Translate. In order to instantiate it in your code, 330 | you need a LoadImage object, and a ContextDatabase object. 331 | The load image is responsible for retrieving instruction 332 | bytes, based on address, from a binary executable. The context 333 | database provides the library extra mode information that may 334 | be necessary to do the disassembly or translation.
This can 335 | be used, for instance, to specify that an x86 binary is running 336 | in 32-bit mode, or to specify that an ARM processor is running 337 | in THUMB mode. Once these objects are built, the Sleigh 338 | object can be immediately instantiated. 339 | 340 | \code 341 | LoadImageBfd *loader; 342 | ContextDatabase *context; 343 | Translate *trans; 344 | 345 | // Set up the loadimage 346 | // Providing an executable name and architecture 347 | string loadimagename = "x86testcode"; 348 | string bfdtarget= "default"; 349 | 350 | loader = new LoadImageBfd(loadimagename,bfdtarget); 351 | loader->open(); // Load the executable from file 352 | 353 | context = new ContextInternal(); // Create a processor context 354 | 355 | trans = new Sleigh(loader,context); // Instantiate the translator 356 | \endcode 357 | 358 | Once the Sleigh object is in hand, the only required 359 | initialization step left is to inform it of the ".sla" file. 360 | The file is in XML format and needs to be read in using 361 | SLEIGH's built-in XML parser. The following code accomplishes 362 | this. 363 | 364 | \code 365 | string sleighfilename = "specfiles/x86.sla"; 366 | DocumentStorage docstorage; 367 | Element *sleighroot = docstorage.openDocument(sleighfilename)->getRoot(); 368 | docstorage.registerTag(sleighroot); 369 | trans->initialize(docstorage); // Initialize the translator 370 | \endcode 371 | 372 | \section sleighassememit AssemblyEmit 373 | 374 | In order to do disassembly, you need to derive a class from 375 | AssemblyEmit, and implement the method \e dump. The library 376 | will call this method exactly once, for each instruction 377 | disassembled. 378 | 379 | This routine simply needs to decide how (and where) to print 380 | the corresponding portion of the disassembly. For instance, 381 | 382 | \code 383 | class AssemblyRaw : public AssemblyEmit { 384 | public: 385 | virtual void dump(const Address &addr,const string &mnem,const string &body) { 386 | addr.printRaw(cout); 387 | cout << ": " << mnem << ' ' << body << endl; 388 | } 389 | }; 390 | \endcode 391 | 392 | This is a minimal implementation that simply dumps the 393 | disassembly straight to standard out. Once this object is 394 | instantiated, the Sleigh object can use it to write out 395 | assembly via the Translate::printAssembly() method. 396 | 397 | \code 398 | AssemblyEmit *assememit = new AssemblyRaw(); 399 | 400 | Address addr(trans->getDefaultCodeSpace(),0x80484c0); 401 | int4 length; // Length of instruction in bytes 402 | 403 | length = trans->printAssembly(*assememit,addr); 404 | addr = addr + length; // Advance to next instruction 405 | length = trans->printAssembly(*assememit,addr); 406 | addr = addr + length; 407 | length = trans->printAssembly(*assememit,addr); 408 | \endcode 409 | 410 | \section sleighpcodeemit PcodeEmit 411 | 412 | In order to generate a \b pcode translation of a machine 413 | instruction, you need to derive a class from PcodeEmit and 414 | implement the virtual method \e dump. This method will be 415 | invoked once for each \b pcode operation in the translation 416 | of a machine instruction. There will likely be multiple calls 417 | per instruction. Each call passes in a single \b pcode 418 | operation, complete with its possible varnode output, and 419 | all of its varnode inputs. Here is an example of a PcodeEmit 420 | object that simply prints out the \b pcode. 
421 | 422 | \code 423 | class PcodeRawOut : public PcodeEmit { 424 | public: 425 | virtual void dump(const Address &addr,OpCode opc,VarnodeData *outvar,VarnodeData *vars,int4 isize); 426 | }; 427 | 428 | static void print_vardata(ostream &s,VarnodeData &data) 429 | 430 | { 431 | s << '(' << data.space->getName() << ','; 432 | data.space->printOffset(s,data.offset); 433 | s << ',' << dec << data.size << ')'; 434 | } 435 | 436 | void PcodeRawOut::dump(const Address &addr,OpCode opc,VarnodeData *outvar,VarnodeData *vars,int4 isize) 437 | 438 | { 439 | if (outvar != (VarnodeData *)0) { // The output is optional 440 | print_vardata(cout,*outvar); 441 | cout << " = "; 442 | } 443 | cout << get_opname(opc); 444 | // Possibly check for a code reference or a space reference 445 | for(int4 i=0;i<isize;++i) { 446 | cout << ' '; 447 | print_vardata(cout,vars[i]); 448 | } 449 | cout << endl; 450 | } 451 | \endcode 452 | 453 | For each varnode, this routine prints a triple: the address 454 | space the varnode lives in, its offset within that space, 455 | and its size in bytes. The opcode itself is converted to 456 | a string using the built-in function get_opname(). 457 | 458 | Keep in mind that, unlike the AssemblyEmit callback, this 459 | dump method is usually invoked several times for a single 460 | machine instruction, once for each pcode operation in its 461 | translation. 462 | 463 | Once this object is instantiated, the Sleigh object can 464 | use it to generate pcode, one machine instruction at a 465 | time, using the Translate::oneInstruction() method. As 466 | with printAssembly(), the method returns the length of 467 | the decoded instruction in bytes, which can be used to 468 | advance the address to the next instruction. 469 | 470 | \code 471 | PcodeEmit *pcodeemit = new PcodeRawOut(); 472 | 473 | Address addr(trans->getDefaultCodeSpace(),0x80484c0); 474 | int4 length; // Length of instruction in bytes 475 | 476 | length = trans->oneInstruction(*pcodeemit,addr); 477 | addr = addr + length; // Advance to next instruction 478 | length = trans->oneInstruction(*pcodeemit,addr); 479 | addr = addr + length; 480 | length = trans->oneInstruction(*pcodeemit,addr); 481 | \endcode 482 | 483 | For an application to properly \e follow \e flow, while translating 484 | machine instructions into pcode, the emitted pcode must be 485 | inspected for the various branch operations. 486 | 487 | \section sleighloadimage LoadImage 488 | 489 | A LoadImage holds all the binary data from an executable file 490 | in a format similar to how it would exist when being executed 491 | by a real processor. The interface to this from SLEIGH is 492 | actually very simple, although it can hide a complicated 493 | structure. One method does most of the work, LoadImage::loadFill(). 494 | It takes a byte pointer, a size, and an Address. The method 495 | is expected to fill in the \e ptr array with \e size bytes 496 | taken from the load image, corresponding to the address \e addr. 497 | There are two more virtual methods that are required for a 498 | complete implementation of LoadImage, \e getArchType and 499 | \e adjustVma, but these do not need to be implemented fully. 500 | 501 | \code 502 | class MyLoadImage : public LoadImage { 503 | public: 504 | MyLoadImage(const string &nm) : LoadImage(nm) {} 505 | virtual void loadFill(uint1 *ptr,int4 size,const Address &addr); 506 | virtual string getArchType(void) const { return "mytype"; } 507 | virtual void adjustVma(long adjust) {} 508 | }; 509 | \endcode 510 | 511 | \section sleighcontext ContextDatabase 512 | 513 | The ContextDatabase needs to keep track of any possible 514 | context variable and its value, over different address ranges. 515 | In most cases, you probably don't need to override the class 516 | yourself, but can use the built-in class, ContextInternal. 517 | This provides the basic functionality required and will work 518 | for different architectures. What you may need to do is 519 | set values for certain variables, depending on the processor 520 | and the environment it is running in. For instance, for 521 | the x86 platform, you need to set the \e addrsize and \e opsize 522 | bits, to indicate the processor would be running in 32-bit 523 | mode. The context variables specific to a particular processor 524 | are established by the SLEIGH spec. So the variables can 525 | only be set \e after the spec has been loaded. 526 | 527 | \code 528 | ...
529 | context = new ContextInternal(); 530 | trans = new Sleigh(loader,context); 531 | DocumentStorage docstorage; 532 | Element *root = docstorage.openDocument("specfiles/x86.sla")->getRoot(); 533 | docstorage.registerTag(root); 534 | trans->initialize(docstorage); 535 | 536 | context->setVariableDefault("addrsize",1); // Address size is 32-bits 537 | context->setVariableDefault("opsize",1); // Operand size is 32-bits 538 | \endcode 539 | 540 | 541 | */ 542 | #endif 543 | -------------------------------------------------------------------------------- /src/worker/iced/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "iced" 3 | version = "1.0.0" 4 | authors = ["mishegos"] 5 | edition = "2018" 6 | 7 | [lib] 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies.iced-x86] 11 | default-features = false 12 | features = ["std", "decoder", "intel"] 13 | path = "./iced/src/rust/iced-x86" 14 | 15 | [build-dependencies] 16 | bindgen = "*" 17 | 18 | [profile.release] 19 | codegen-units = 1 20 | lto = true 21 | opt-level = 3 22 | -------------------------------------------------------------------------------- /src/worker/iced/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: iced.so 3 | 4 | iced.so: target/release/libiced.$(SO_SUFFIX) 5 | cp target/release/libiced.$(SO_SUFFIX) $@ 6 | 7 | target/release/libiced.$(SO_SUFFIX): 8 | cargo test --release 9 | cargo build --release 10 | 11 | .PHONY: clean 12 | clean: 13 | cargo clean --release 14 | rm -f *.so 15 | -------------------------------------------------------------------------------- /src/worker/iced/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | fn main() { 5 | println!("cargo:rerun-if-changed=build.rs"); 6 | println!("cargo:rerun-if-changed=wrapper.h"); 7 | 8 | let clang_args = env::var("RUST_BINDGEN_CLANG_ARGS").unwrap(); 9 | let bindings = bindgen::Builder::default() 10 | .header("wrapper.h") 11 | .clang_args(clang_args.split_ascii_whitespace()) 12 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 13 | .generate() 14 | .expect("Unable to generate bindings"); 15 | 16 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 17 | bindings 18 | .write_to_file(out_path.join("bindings.rs")) 19 | .expect("Couldn't write bindings!"); 20 | } 21 | -------------------------------------------------------------------------------- /src/worker/iced/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod mishegos; 2 | 3 | use iced_x86::*; 4 | use mishegos::{ 5 | decode_result, decode_status_S_FAILURE, decode_status_S_PARTIAL, decode_status_S_SUCCESS, 6 | }; 7 | 8 | // This is pretty ugly and assumes sizeof(char) == 1 9 | #[no_mangle] 10 | pub static mut worker_name: *const std::os::raw::c_char = 11 | WORKER_NAME.as_ptr() as *const std::os::raw::c_char; 12 | static WORKER_NAME: &str = "iced\0"; 13 | 14 | #[allow(clippy::missing_safety_doc)] 15 | #[no_mangle] 16 | pub unsafe extern "C" fn try_decode(result: *mut decode_result, raw_insn: *const u8, length: u8) { 17 | assert!(!result.is_null()); 18 | assert!(!raw_insn.is_null()); 19 | let data = std::slice::from_raw_parts(raw_insn, length as usize); 20 | let result = &mut *result; 21 | let error = match try_decode_safe(64, 0, data) { 22 | Err(error) => error, 23 | Ok((instr_len, output)) => { 24 | result.ndecoded = instr_len as u16; 25 
| assert_eq!(std::mem::size_of::<std::os::raw::c_char>(), 1); 26 | if output.len() > result.result.len() { 27 | decode_status_S_FAILURE 28 | } else { 29 | std::ptr::copy( 30 | output.as_ptr(), 31 | result.result.as_mut_ptr() as *mut u8, 32 | output.len(), 33 | ); 34 | result.len = output.len() as u16; 35 | decode_status_S_SUCCESS 36 | } 37 | } 38 | }; 39 | result.status = error; 40 | } 41 | 42 | fn try_decode_safe(bitness: u32, ip: u64, data: &[u8]) -> Result<(usize, String), u32> { 43 | const DECODER_OPTIONS: u32 = DecoderOptions::NONE; 44 | 45 | let mut decoder = Decoder::new(bitness, data, DECODER_OPTIONS); 46 | decoder.set_ip(ip); 47 | let instr = decoder.decode(); 48 | 49 | if instr.is_invalid() { 50 | match decoder.last_error() { 51 | DecoderError::None => unreachable!(), 52 | DecoderError::NoMoreBytes => Err(decode_status_S_PARTIAL), 53 | _ => Err(decode_status_S_FAILURE), 54 | } 55 | } else { 56 | let mut formatter = IntelFormatter::new(); 57 | // Try to match default XED output 58 | formatter.options_mut().set_hex_suffix(""); 59 | formatter.options_mut().set_hex_prefix("0x"); 60 | formatter.options_mut().set_uppercase_hex(false); 61 | formatter 62 | .options_mut() 63 | .set_space_after_operand_separator(true); 64 | formatter 65 | .options_mut() 66 | .set_memory_size_options(MemorySizeOptions::Always); 67 | formatter.options_mut().set_always_show_scale(true); 68 | formatter.options_mut().set_rip_relative_addresses(true); 69 | formatter 70 | .options_mut() 71 | .set_small_hex_numbers_in_decimal(false); 72 | formatter.options_mut().set_cc_ge(CC_ge::nl); 73 | formatter.options_mut().set_cc_a(CC_a::nbe); 74 | formatter.options_mut().set_cc_e(CC_e::z); 75 | formatter.options_mut().set_cc_ne(CC_ne::nz); 76 | formatter.options_mut().set_cc_ae(CC_ae::nb); 77 | formatter.options_mut().set_cc_g(CC_g::nle); 78 | formatter.options_mut().set_show_branch_size(false); 79 | formatter.options_mut().set_branch_leading_zeroes(false); 80 | formatter.options_mut().set_use_pseudo_ops(false); 81 | 82 | let mut output = String::new(); 83 | formatter.format(&instr, &mut output); 84 | 85 | Ok((instr.len(), output)) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/worker/iced/src/mishegos.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | #![allow(dead_code)] 5 | // "warning: `extern` block uses type `u128`, which is not FFI-safe" 6 | // "note: 128-bit integers don't currently have a known stable ABI" 7 | #![allow(improper_ctypes)] 8 | #![allow(clippy::redundant_static_lifetimes)] 9 | 10 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 11 | -------------------------------------------------------------------------------- /src/worker/iced/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "../worker.h" 2 | -------------------------------------------------------------------------------- /src/worker/llvm/Makefile: -------------------------------------------------------------------------------- 1 | LLVM_CONFIG=llvm-config 2 | override CPPFLAGS := $(CPPFLAGS) $(shell $(LLVM_CONFIG) --cppflags) 3 | override LDFLAGS := $(LDFLAGS) $(shell $(LLVM_CONFIG) --ldflags) -Wl,-z,defs 4 | override LDLIBS := $(LDLIBS) $(shell $(LLVM_CONFIG) --libs) 5 | 6 | .PHONY: all 7 | all: llvm.so 8 | 9 | llvm.so: llvm.c 10 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -o $@ 11 | 12 | .PHONY: clean 13
| clean: 14 | rm -rf *.o *.so 15 | 16 | -------------------------------------------------------------------------------- /src/worker/llvm/llvm.c: -------------------------------------------------------------------------------- 1 | #include <llvm-c/Disassembler.h> 2 | #include <llvm-c/Target.h> 3 | 4 | #include "../worker.h" 5 | 6 | static LLVMDisasmContextRef dis; 7 | 8 | char *worker_name = "llvm"; 9 | 10 | void worker_ctor() { 11 | LLVMInitializeX86TargetInfo(); 12 | LLVMInitializeX86Target(); 13 | LLVMInitializeX86TargetMC(); 14 | LLVMInitializeX86Disassembler(); 15 | dis = LLVMCreateDisasm("x86_64-linux-gnu", NULL, 0, NULL, NULL); 16 | if (!dis) { 17 | errx(1, "LLVMCreateDisasm"); 18 | } 19 | // Hex immediates and Intel syntax 20 | // The first option doesn't seem to have an effect, though. 21 | if (!LLVMSetDisasmOptions(dis, LLVMDisassembler_Option_PrintImmHex | 22 | LLVMDisassembler_Option_AsmPrinterVariant)) 23 | errx(1, "LLVMSetDisasmOptions"); 24 | } 25 | 26 | void worker_dtor() { 27 | LLVMDisasmDispose(dis); 28 | } 29 | 30 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 31 | size_t len = LLVMDisasmInstruction(dis, raw_insn, length, 0, result->result, MISHEGOS_DEC_MAXLEN); 32 | if (len > 0) { 33 | result->status = S_SUCCESS; 34 | result->len = strlen(result->result); 35 | result->ndecoded = len; 36 | } else { 37 | result->status = S_FAILURE; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/worker/worker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mish_common.h" 4 | 5 | /* frequently needed by workers. */ 6 | #include <err.h> 7 | #include <string.h> 8 | 9 | /* This is fine for now. */ 10 | typedef output_slot decode_result; 11 | 12 | typedef void (*try_decode_t)(decode_result *result, uint8_t *raw_insn, uint8_t length); 13 | -------------------------------------------------------------------------------- /src/worker/xed/Makefile: -------------------------------------------------------------------------------- 1 | # NOTE(ww): I don't fully understand why I need the RPATH here 2 | # but not in the capstone build.
3 | override CFLAGS := $(CFLAGS) -Wl,-rpath,$(shell pwd)/xed/kits/xed-mishegos/lib 4 | override CPPFLAGS := $(CPPFLAGS) -Ixed/kits/xed-mishegos/include 5 | override LDFLAGS := $(LDFLAGS) -Lxed/kits/xed-mishegos/lib 6 | override LDLIBS := $(LDLIBS) -lxed 7 | 8 | .PHONY: all 9 | all: xed.so 10 | 11 | xed/kits/xed-mishegos/libxed.so: 12 | cd xed && \ 13 | python3 ./mfile.py install --shared --install-dir=kits/xed-mishegos -j 4 14 | 15 | xed.so: xed.o 16 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) xed.o $(LDLIBS) -o $@ 17 | 18 | xed.o: xed/kits/xed-mishegos/libxed.so xed.c 19 | 20 | .PHONY: clean 21 | clean: 22 | cd xed && python3 ./mfile.py clean && rm -rf kits 23 | rm -rf *.o *.so 24 | -------------------------------------------------------------------------------- /src/worker/xed/xed.c: -------------------------------------------------------------------------------- 1 | #include <xed/xed-interface.h> 2 | 3 | #include "../worker.h" 4 | 5 | char *worker_name = "xed"; 6 | 7 | void worker_ctor() { 8 | xed_tables_init(); 9 | } 10 | 11 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 12 | xed_decoded_inst_t xedd; 13 | xed_decoded_inst_zero(&xedd); 14 | xed_decoded_inst_set_mode(&xedd, XED_MACHINE_MODE_LONG_64, XED_ADDRESS_WIDTH_64b); 15 | xed_decoded_inst_set_input_chip(&xedd, XED_CHIP_ALL); 16 | 17 | xed_error_enum_t xed_error = xed_decode(&xedd, raw_insn, length); 18 | if (xed_error != XED_ERROR_NONE) { 19 | DLOG("xed_decode failed: %s", xed_error_enum_t2str(xed_error)); 20 | 21 | /* Special-case XED_ERROR_BUFFER_TOO_SHORT, since it's something 22 | * we have a status for beyond generic failure. 23 | */ 24 | if (xed_error == XED_ERROR_BUFFER_TOO_SHORT) { 25 | result->status = S_PARTIAL; 26 | } else { 27 | result->status = S_FAILURE; 28 | } 29 | return; 30 | } 31 | 32 | /* TODO(ww): Figure out whether xed_format_context decodes up to MISHEGOS_DEC_MAXLEN, 33 | * or saves space for the NULL terminator. It probably doesn't matter in either case 34 | * since nothing will be nearly that long, but it'd be good to know. 35 | */ 36 | if (!xed_format_context(XED_SYNTAX_INTEL, &xedd, result->result, MISHEGOS_DEC_MAXLEN, 0, 0, 0)) { 37 | DLOG("xed_format_context failed!"); 38 | /* TODO(ww): Maybe distinguish this formatting failure from the decoding 39 | * failure above.
40 | */ 41 | result->status = S_FAILURE; 42 | return; 43 | } 44 | 45 | result->status = S_SUCCESS; 46 | result->len = strlen(result->result); 47 | result->ndecoded = xed_decoded_inst_get_length(&xedd); 48 | } 49 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | libyaxpeax_x86_mishegos.so 3 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "yaxpeax-x86-mishegos" 3 | version = "0.1.0" 4 | authors = ["iximeow <me@iximeow.net>"] 5 | edition = "2018" 6 | 7 | [lib] 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies] 11 | yaxpeax-x86 = { version = "1.0.0" } 12 | yaxpeax-arch = { version = "0.2.0" } 13 | 14 | [build-dependencies] 15 | bindgen = "*" 16 | 17 | [profile.release] 18 | lto = true 19 | opt-level = 3 20 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: libyaxpeax_x86_mishegos.so 3 | 4 | libyaxpeax_x86_mishegos.so: target/release/libyaxpeax_x86_mishegos.$(SO_SUFFIX) 5 | cp target/release/libyaxpeax_x86_mishegos.$(SO_SUFFIX) $@ 6 | 7 | target/release/libyaxpeax_x86_mishegos.$(SO_SUFFIX): src/lib.rs Cargo.toml 8 | cargo build --release 9 | 10 | .PHONY: clean 11 | clean: 12 | cargo clean 13 | rm -f *.so 14 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | fn main() { 5 | println!("cargo:rerun-if-changed=build.rs"); 6 | println!("cargo:rerun-if-changed=wrapper.h"); 7 | 8 | let clang_args = env::var("RUST_BINDGEN_CLANG_ARGS").unwrap_or_else(|_| "-I../../include".to_string()); 9 | 10 | let bindings = PathBuf::from(env::var("OUT_DIR").unwrap()).join("mishegos.rs"); 11 | bindgen::Builder::default() 12 | .header("../worker.h") 13 | .clang_args(clang_args.split_ascii_whitespace()) 14 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 15 | .generate() 16 | .expect("failed to generate bindings") 17 | .write_to_file(bindings) 18 | .expect("failed to write bindings"); 19 | } 20 | -------------------------------------------------------------------------------- /src/worker/yaxpeax-x86/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::os::raw::c_char; 2 | use yaxpeax_x86::long_mode as amd64; 3 | use yaxpeax_arch::{AddressBase, Decoder, LengthedInstruction}; 4 | 5 | // shhh no warnings please 6 | #[allow(warnings)] 7 | mod mishegos { 8 | include!(concat!(env!("OUT_DIR"), "/mishegos.rs")); 9 | } 10 | 11 | use crate::mishegos::{decode_result, MISHEGOS_DEC_MAXLEN, decode_status_S_SUCCESS, decode_status_S_FAILURE, decode_status_S_PARTIAL}; 12 | 13 | #[no_mangle] 14 | pub static mut worker_name: *const c_char = b"yaxpeax-x86-mishegos\x00".as_ptr() as *const i8; 15 | 16 | static mut INSTR: Option<amd64::Instruction> = None; 17 | #[no_mangle] 18 | pub extern "C" fn try_decode(decode_result: *mut decode_result, bytes: *const u8, length: u8) { 19 | unsafe { 20 | if INSTR.is_none() { 21 | INSTR = Some(amd64::Instruction::default()); 22 | } 23 | } 24 | let decode_result = unsafe {
decode_result.as_mut().expect("decode_result is not null") }; 25 | let data = unsafe { 26 | std::slice::from_raw_parts(bytes.as_ref().expect("bytes is not null"), length as usize) 27 | }; 28 | let decoder = amd64::InstDecoder::default(); 29 | let mut reader = yaxpeax_arch::U8Reader::new(data); 30 | 31 | match decoder.decode_into(unsafe { INSTR.as_mut().unwrap() }, &mut reader) { 32 | Err(amd64::DecodeError::ExhaustedInput) => { 33 | decode_result.status = decode_status_S_PARTIAL; 34 | } 35 | Err(_error) => { 36 | decode_result.status = decode_status_S_FAILURE; 37 | } 38 | Ok(()) => { 39 | let instr = unsafe { INSTR.as_ref().unwrap() }; 40 | decode_result.ndecoded = 0u64.wrapping_offset(instr.len()) as u16; 41 | let text = instr.to_string(); 42 | assert!(text.len() < MISHEGOS_DEC_MAXLEN as usize); 43 | for (i, x) in text.as_bytes().iter().enumerate() { 44 | decode_result.result[i] = *x as i8; 45 | } 46 | decode_result.len = text.len() as u16; 47 | decode_result.status = decode_status_S_SUCCESS; 48 | } 49 | }; 50 | } 51 | -------------------------------------------------------------------------------- /src/worker/zydis/Makefile: -------------------------------------------------------------------------------- 1 | # Include stupidity. 2 | override CPPFLAGS := $(CPPFLAGS) \ 3 | -DZYDIS_STATIC_BUILD \ 4 | -Izydis/include \ 5 | -Izydis/dependencies/zycore/include \ 6 | -Izydis/build \ 7 | -Izydis/build/zycore 8 | 9 | .PHONY: all 10 | all: zydis.so 11 | 12 | zydis/build/libZydis.a: 13 | cd zydis && \ 14 | mkdir build && \ 15 | cd build && \ 16 | cmake -DZYDIS_BUILD_TOOLS=OFF -DZYDIS_BUILD_EXAMPLES=OFF -DZYDIS_STATIC_DEFINE=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo .. && \ 17 | cmake --build . -- -j4 18 | 19 | zydis.so: zydis.o 20 | $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) \ 21 | -Wl,--whole-archive zydis/build/libZydis.a -Wl,--no-whole-archive \ 22 | zydis.o $(LDLIBS) -o $@ 23 | 24 | zydis.o: zydis/build/libZydis.a zydis.c 25 | 26 | .PHONY: clean 27 | clean: 28 | rm -rf *.o *.so 29 | rm -rf zydis/build 30 | -------------------------------------------------------------------------------- /src/worker/zydis/zydis.c: -------------------------------------------------------------------------------- 1 | /* Dumbness. */ 2 | #define ZYDIS_STATIC_BUILD 3 | #include <Zydis/Zydis.h> 4 | 5 | #include "../worker.h" 6 | 7 | char *worker_name = "zydis"; 8 | 9 | static ZydisDecoder zdecoder; 10 | static ZydisFormatter zformatter; 11 | 12 | /* I couldn't find this defined anywhere in zycore/zydis.
13 | */ 14 | static const char *ZyanStatus_strerror(ZyanStatus zstatus) { 15 | switch (zstatus) { 16 | case ZYDIS_STATUS_NO_MORE_DATA: { 17 | return "no more data"; 18 | } 19 | case ZYDIS_STATUS_DECODING_ERROR: { 20 | return "general decoding error"; 21 | } 22 | case ZYDIS_STATUS_INSTRUCTION_TOO_LONG: { 23 | return "instruction too long"; 24 | } 25 | case ZYDIS_STATUS_BAD_REGISTER: { 26 | return "invalid register"; 27 | } 28 | case ZYDIS_STATUS_ILLEGAL_LOCK: { 29 | return "illegal lock prefix"; 30 | } 31 | case ZYDIS_STATUS_ILLEGAL_LEGACY_PFX: { 32 | return "illegal legacy prefix"; 33 | } 34 | case ZYDIS_STATUS_ILLEGAL_REX: { 35 | return "illegal REX prefix"; 36 | } 37 | case ZYDIS_STATUS_INVALID_MAP: { 38 | return "illegal opcode map value"; 39 | } 40 | case ZYDIS_STATUS_MALFORMED_EVEX: { 41 | return "illegal EVEX prefix"; 42 | } 43 | case ZYDIS_STATUS_MALFORMED_MVEX: { 44 | return "illegal MVEX prefix"; 45 | } 46 | case ZYDIS_STATUS_INVALID_MASK: { 47 | return "invalid write mask"; 48 | } 49 | default: { 50 | return "unknown"; 51 | } 52 | } 53 | } 54 | 55 | void worker_ctor() { 56 | ZydisDecoderInit(&zdecoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); 57 | ZydisFormatterInit(&zformatter, ZYDIS_FORMATTER_STYLE_INTEL); 58 | 59 | /* TODO(ww): Zydis has a bunch of formatter options; we probably 60 | * want to set some of them to make its output easier to normalize. 61 | */ 62 | } 63 | 64 | void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) { 65 | _unused(ZyanStatus_strerror); 66 | 67 | ZydisDecodedInstruction insn; 68 | ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE]; 69 | ZyanStatus zstatus = 70 | ZydisDecoderDecodeFull(&zdecoder, raw_insn, length, &insn, operands, 71 | ZYDIS_MAX_OPERAND_COUNT_VISIBLE, ZYDIS_DFLAG_VISIBLE_OPERANDS_ONLY); 72 | if (!ZYAN_SUCCESS(zstatus)) { 73 | DLOG("zydis decoding failed: %s", ZyanStatus_strerror(zstatus)); 74 | 75 | if (zstatus == ZYDIS_STATUS_NO_MORE_DATA) { 76 | result->status = S_PARTIAL; 77 | } else { 78 | result->status = S_FAILURE; 79 | } 80 | return; 81 | } 82 | 83 | zstatus = 84 | ZydisFormatterFormatInstruction(&zformatter, &insn, operands, insn.operand_count_visible, 85 | result->result, MISHEGOS_DEC_MAXLEN, 0); 86 | if (!ZYAN_SUCCESS(zstatus)) { 87 | DLOG("zydis formatting failed: %s", ZyanStatus_strerror(zstatus)); 88 | result->status = S_FAILURE; 89 | return; 90 | } 91 | 92 | result->status = S_SUCCESS; 93 | result->len = strlen(result->result); 94 | result->ndecoded = insn.length; 95 | } 96 | -------------------------------------------------------------------------------- /workers.spec: -------------------------------------------------------------------------------- 1 | ./src/worker/bfd/bfd.so 2 | ./src/worker/capstone/capstone.so 3 | ./src/worker/dynamorio/dynamorio.so 4 | ./src/worker/fadec/fadec.so 5 | ./src/worker/xed/xed.so 6 | ./src/worker/zydis/zydis.so 7 | ./src/worker/bddisasm/bddisasm.so 8 | ./src/worker/iced/iced.so 9 | ./src/worker/yaxpeax-x86/libyaxpeax_x86_mishegos.so 10 | ./src/worker/ghidra/ghidra.so 11 | ./src/worker/llvm/llvm.so 12 | --------------------------------------------------------------------------------
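A note on the worker interface that ties the listings above together: each entry in workers.spec is a shared object exporting a worker_name string and a try_decode function matching the try_decode_t typedef in src/worker/worker.h, with optional worker_ctor/worker_dtor hooks for decoder setup and teardown (llvm.c defines both, xed.c only a constructor, and the iced worker neither). The sketch below is a minimal, hypothetical worker illustrating that contract against the fields the real workers fill in (status, result, len, ndecoded); the "nop" name, the file path, and the use of snprintf are invented for illustration, and an actual worker would also need its own Makefile producing a .so plus an entry in workers.spec.

/* src/worker/nop/nop.c (hypothetical example, not part of the tree above) */
#include <stdio.h>

#include "../worker.h"

char *worker_name = "nop";

/* Optional: one-time decoder setup, cf. worker_ctor() in llvm.c and zydis.c. */
void worker_ctor() {}

/* Optional: teardown, cf. worker_dtor() in llvm.c. */
void worker_dtor() {}

void try_decode(decode_result *result, uint8_t *raw_insn, uint8_t length) {
  if (length == 0) {
    /* Ran out of input before a full instruction, analogous to the
     * XED_ERROR_BUFFER_TOO_SHORT and ZYDIS_STATUS_NO_MORE_DATA cases above.
     */
    result->status = S_PARTIAL;
    return;
  }

  if (raw_insn[0] == 0x90) {
    /* "Decode" the one-byte NOP, filling the same fields every worker
     * above fills on success: the disassembly text, its length, and the
     * number of input bytes consumed.
     */
    result->len = snprintf(result->result, MISHEGOS_DEC_MAXLEN, "nop");
    result->ndecoded = 1;
    result->status = S_SUCCESS;
    return;
  }

  /* Everything that isn't a one-byte NOP is rejected. */
  result->status = S_FAILURE;
}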