├── LICENSE ├── Makefile ├── README.md ├── bitspan.cpp ├── bitspan.h ├── combine.cpp ├── combine.h ├── examples ├── 8048.txt ├── ethereum.txt ├── sh2.sla └── sh2.txt ├── instruction.cpp ├── instruction.h ├── main.cpp ├── output.cpp ├── output.h ├── parser.cpp ├── parser.h ├── parser_sla.cpp ├── parser_sla.h ├── registers.h ├── slautil ├── slautil.cpp ├── slautil.h └── slaxml.cpp ├── thread_pool.cpp ├── thread_pool.h └── validator.cpp /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Oberoi Security Solutions 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-O3 -pipe -march=native -flto=auto -Wall -Wextra -Wunused -Wunused-but-set-parameter -Wunused-but-set-variable -Wunused-function -I $(GHIDRA_TRUNK)/Ghidra/Features/Decompiler/src/decompile/cpp/ 3 | DEPS = bitspan.h combine.h instruction.h output.h parser.h parser_sla.h registers.h thread_pool.h validator.h 4 | OBJ = main.o bitspan.o combine.o instruction.o output.o parser.o parser_sla.o thread_pool.o slautil/slautil.o slautil/slaxml.o 5 | LIBS=-lboost_system -lboost_filesystem -lboost_regex -lboost_program_options -lboost_thread -lboost_timer 6 | VALIDATOR-DEPS = loadimage.hh sleigh.hh 7 | VALIDATOR-OBJ = validator.o 8 | VALIDATOR-LIBS= -lboost_system -lboost_filesystem -lboost_program_options -L . 
$(GHIDRA_TRUNK)/Ghidra/Features/Decompiler/src/decompile/cpp/libsla.a 9 | 10 | 11 | all: generator generator-validator 12 | 13 | validator.o: validator.cpp $(VALIDATOR_DEPS) 14 | $(CXX) -c -o $@ $< $(CXXFLAGS) $(VALIDATOR-LIBS) 15 | 16 | 17 | %.o: %.cpp $(DEPS) 18 | $(CXX) -c -o $@ $< $(CXXFLAGS) $(LIBS) 19 | 20 | generator: $(OBJ) 21 | $(CXX) -o $@ $^ $(CXXFLAGS) $(LIBS) 22 | 23 | generator-validator: $(VALIDATOR-OBJ) 24 | $(CXX) -o $@ $^ $(CXXFLAGS) $(VALIDATOR-LIBS) 25 | 26 | .PHONY: clean 27 | clean: 28 | rm -f *.o slautil/*.o generator generator-validator 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Generator 2 | 3 | Ghidra Processor Module Generator (Generator) is a step towards automating the creation of [Ghidra](https://github.com/NationalSecurityAgency/ghidra) processor modules. Generator takes as input one or more text files containing all disassembled instructions for a given instruction set architecture (ISA) and outputs a processor module directory that can be loaded into Ghidra. Specifically Generator: 4 | 5 | * combines duplicate instructions 6 | * combines instructions which differ by an immediate value 7 | * combines instructions which differ by a register 8 | * creates a valid processor module directory that includes: 9 | * Module.manifest 10 | * .slaspec 11 | * .cspec 12 | * .ldefs 13 | * .pspec 14 | 15 | The outputted processor module will be able to disassemble for your given architecture. As all instructions will have an empty p-code definition, Ghidra's decompiler will obviously not work. Generator only supports 1-4 byte ISAs. Both fixed length and variable length ISAs are supported. 16 | 17 | ## Performance 18 | Generator's runtime is varies based on the size/number of instructions of the input ISA. Generator is multithreaded and by default will use all available cores. 
19 | 20 | |ISA Size|Time|Ram Usage|Notes| 21 | |---|---:|---:|---| 22 | |1 Byte|<1 sec|<1 GB|| 23 | |2 Byte|<2 sec|<1 GB|| 24 | |3 Byte|~60 sec|~40 GB|| 25 | |4 Byte|~4-5 hours|~40 GB|Requires 4-byte ISA instructions| 26 | 27 | Numbers are from an AMD Ryzen 9 7950X3D 16-Core Processor, 128 GB RAM, with NVMe SSD. 28 | 29 | ## Usage 30 | ### Overview 31 | The high-level steps for running Generator on 1-3 byte ISAs are to: 32 | 33 | 1) Create a newline delimited text file that contains a list of all valid hex opcode and instructions 34 | 2) Run Generator on the text file 35 | 3) Copy the created processor module directory to your Ghidra/Processors directory 36 | 4) Launch Ghidra. Your processor will show up in the list of supported processors 37 | 38 | See "Usage (1-3 Byte ISAs)" for detailed instructions. 4 byte ISAs require additional steps. See "Usage (4 Byte ISAs)". 39 | 40 | ### Generator Command-Line Arguments 41 | |Command|| 42 | |---|---| 43 | |-i [ --input-disassembly ] arg|Path to a newline delimited text file containing all opcodes and instructions for the processor module| 44 | |--input-disassembly-dir arg|Path to a directory with multiple newline delimited text files containing all opcodes and instructions for the processor module| 45 | |-s [ --input-sleigh ] arg|Path to a XML .sla file containing all opcodes and instructions for the processor module| 46 | |--input-sleigh-dir arg|Path to a directory with multiple XML .sla files containing all opcodes and instructions for the processor module| 47 | |-t [ --num-threads ] arg|Number of worker threads to use. Optional. Defaults to number of physical CPUs if not specified| 48 | |-n [ --processor-name ] arg|Name of the target processor. Defaults to "MyProc" if not specified| 49 | |-f [ --processor-family ] arg|Name of the target processor's family. Defaults to "MyProcFamily" if not specified| 50 | |-e [ --endian ] arg|Endianness of the processor. Must be either "little" or "big". 
Defaults to big if not specified| 51 | |-a [ --alignment ] arg|Instruction alignment of the processor. Defaults to 1 if not specified| 52 | |-b [ --bitness ] arg|Bitness of the processor. Defaults to 32 if not specified| 53 | |--print-registers-only|Only print parsed registers. Useful for debugging purposes. False by default| 54 | |--omit-opcodes|Don't print opcodes in the outputted.sla file. False by default| 55 | |--omit-example-instructions|Don't print example combined instructions in the outputted .sla file. False by default| 56 | |--skip-instruction-combining|Don't combine instructions. Useful for debugging purposes. False by default| 57 | |--additional-registers arg|List of additional registers. Use this option if --print-registers-only is missing registers for your instruction set| 58 | |-h [ --help ]|Help screen| 59 | 60 | ### Usage (1-3 Byte ISAs) 61 | 1) Create a newline delimited text file that contains a list of all valid hex opcodes + instructions. Example (SuperH SH-2): 62 | > 0x0002 stc sr,r0 63 | > 0x0003 bsrf r0 64 | > 0x0004 mov.b r0,@(r0,r0) 65 | > 0x0005 mov.w r0,@(r0,r0) 66 | > 0x0006 mov.l r0,@(r0,r0) 67 | > ... 68 | > ... 69 | > 0xEFFE mov #-0x2,r15 70 | > 0xEFFF mov #-0x1,r15 71 | 72 | Exclude any invalid instructions. The opcode must begin with 0x and must be byte aligned. 73 | 2) Run Generator with `generator --input-disassembly examples/sh-2.txt --print-registers-only` flag. This flag parses all the instructions and will print out only the registers. Verify the output is correct before proceeding. 
74 | 75 | Ex: 76 | 77 | > ./generator --input-disassembly examples/sh-2.txt --print-registers-only 78 | > Ghidra Processor Module Generator 79 | > [\*] Using 16 worker thread(s) 80 | > [\*] Initializing default Ghidra registers 81 | > [\*] Parsing instructions examples/sh2.txt 82 | > [\*] Updating bit length from 0 to 16 83 | > [\*] Parsed 53752 instructions 84 | > [\*] Found registers: gbr mach macl pc pr r0 r1 r10 r11 r12 r13 r14 r15 r2 r3 r4 r5 r6 r7 r8 r9 sr vbr 85 | > [\*] Found mnemonics: # ( ) + , - @ add addc addv and and.b bf bf/s bra braf bsr bsrf bt bt/s clrmac clrt cmp/eq cmp/ge cmp/gt cmp/hi cmp/hs cmp/pl cmp/pz cmp/str div0s div0u div1 dmuls.l dmulu.l dt exts.b exts.w extu.b extu.w jmp jsr ldc ldc.l lds lds.l mac.l mac.w mov mov.b mov.l mov.w mova movt mul.l muls.w mulu.w neg negc nop not or or.b rotcl rotcr rotl rotr rte rts sett shal shar shll shll16 shll2 shll8 shlr shlr16 shlr2 shlr8 sleep stc stc.l sts sts.l sub subc subv swap.b swap.w tas.b trapa tst tst.b xor xor.b xtrct 86 | > If there are any issues edit registers.h before proceeding. 87 | 88 | 3) Manually verify that the registers and mnemonics lists are correct. You can use the `--additional-registers` command line option to add missing registers. On some architectures you may need to remove registers from registers.h and re-compile. **If the registers/mnemonics are incorrect Generator will not work**. 89 | 4) Now you are ready to run Generator: `./generator --input-file instructions.txt --processor-name MyProcessor --processor-family ProcessorFamily --endian big --alignment 2`. If all goes well Generator should create a "MyProcessor" directory with all of the required files. 90 | 5) Verify that the created processor module directory is valid and compiles with Ghidra's SLEIGH compiler. The SLEIGH compiler script can be found in `ghidra/support/`. Run `sleigh -a `. There should be warnings about unimplemented p-code instructions but otherwise there should be no issues. 
If the compilation step fails, please submit an issue and upload your instructions.txt file and I will take a look at it. 91 | 92 | Ex: 93 | > /ghidra/support/sleigh -a MyProcFamily/ 94 | > Compiling MyProcFamily/data/languages/MyProc.slaspec: 95 | > WARN 187 NOP constructors found (SleighCompile) 96 | > WARN Use -n switch to list each individually (SleighCompile) 97 | > 98 | > 1 languages successfully compiled 99 | 100 | 6) Now that you've compiled your processor module, you can run `generator-validator` to disassemble your input file and diff the results. This will help you find which instructions require modifications. Run with: `./generator-validator --input-file examples/sh2.txt --sla-file MyProcFamily/data/languages/MyProc.sla --output-file output.txt`. Diff the input file and the output file to find issues. If you find issues, manually correct the .slaspec and recompile with Ghidra's sleigh compiler. 101 | 7) If the processor successfully compiled you should be able to copy your MyProcessor directory to `/Ghidra/Processors/` directory. When you restart Ghidra your new processor should be listed. Make sure you open your binary as "raw" and manually select your processor module. 102 | 103 | ### Usage (4 Byte ISAs) 104 | A 4-byte ISA is too large for Generator to store in memory. To work around this, we split the the input disassembly file into multiple input files and run Generator multiple. The steps involved look this: 105 | 106 | 1) Create 256 newline delimited text files that contain a list of all valid hex opcode and instructions. Each file should be 1/256th of the total instruction set or approximately 16 million lines each. 
107 | 2) Run Generator on the 256 input text files, creating 256 .slaspec files 108 | 3) Use Ghidra's SLEIGH compiler to compile the 256 .slaspec files into 256 .sla files 109 | 4) Re-run Generator, but with the 256 .sla files as input to combine them into a single .slaspec file 110 | 5) Copy the created processor module directory to your Ghidra/Processors directory 111 | 6) Launch Ghidra. Your processor will show up in the list of supported processors 112 | 113 | 1) Create 256 newline delimited texts that each contain 1/256th of the ISA. As before exclude any invalid instructions. Again the opcode must begin with 0x and must be byte aligned. 114 | 2) Run Generator with `generator --input-disassembly-dir examples/split --print-registers-only` flag. This will parse all the text files in the "examples/split" directory the instructions and will print out only the registers. Verify the output is correct before proceeding. Depending on how many files are present and the size of each file this can take a significant amount of time. In the examples/split directory there are two SH-2 files that will be combined. 
115 | 116 | Ex: 117 | 118 | > ./generator --input-disassembly-dir examples/split --print-registers-only 119 | > Ghidra Processor Module Generator 120 | > [\*] Using 16 worker thread(s) 121 | > [\*] Initializing default Ghidra registers 122 | > [\*] Parsing instructions examples/split/sh2_1.txt 123 | > [\*] Updating bit length from 0 to 16 124 | > [\*] Parsed 26872 instructions 125 | > [\*] Freeing parser data 126 | > [\*] Parsing instructions examples/split/sh2_2.txt 127 | > [\*] Parsed 26880 instructions 128 | > [\*] Freeing parser data 129 | > [\*] Found registers: gbr mach macl pc pr r0 r1 r10 r11 r12 r13 r14 r15 r2 r3 r4 r5 r6 r7 r8 r9 sr vbr 130 | > [\*] Found mnemonics: # ( ) + , - @ add addc addv and and.b bf bf/s bra braf bsr bsrf bt bt/s clrmac clrt cmp/eq cmp/ge cmp/gt cmp/hi cmp/hs cmp/pl cmp/pz cmp/str div0s div0u div1 dmuls.l dmulu.l dt exts.b exts.w extu.b extu.w jmp jsr ldc ldc.l lds lds.l mac.l mac.w mov mov.b mov.l mov.w mova movt mul.l muls.w mulu.w neg > negc nop not or or.b rotcl rotcr rotl rotr rte rts sett shal shar shll shll16 shll2 shll8 shlr shlr16 shlr2 shlr8 sleep stc stc.l sts sts.l sub subc subv swap.b > swap.w tas.b trapa tst tst.b xor xor.b xtrct 131 | > If there are any issues edit registers.h before proceeding. 132 | > [\*] Freeing parser data 133 | 134 | 3) Manually verify that the registers and mnemonics lists are correct. You can use the `--additional-registers` command line option to add missing registers. On some architectures you may need to remove registers from registers.h and re-compile. **If the registers/mnemonics are incorrect Generator will not work**. 135 | 4) Now you are ready to run Generator: `./generator --input-disassembly-dir examples/split --processor-name MyProc --processor-family MyProcFamily --endian big --alignment 2`. If all goes well Generator should create a "MyProcFamily" directory with a .slaspec file for each of the input disassembly text files. 
136 | 5) Verify that the created processor module directory is valid and compiles with Ghidra's SLEIGH compiler. The SLEIGH compiler script can be found in `ghidra/support/`. Run `sleigh -a -y `. **You must use the -y flag as it forces the SLEIGH compiler to output in the legacy XML format. This is required for the next step.** There should be warnings about unimplemented p-code instructions but otherwise there should be no issues. If the compilation step fails, please submit an issue and upload your instructions.txt file and I will take a look at it. When using examples/split it should successfully compile two languages, one for each input file. 137 | 138 | Ex: 139 | > /ghidra/support/sleigh -a MyProcFamily/ 140 | > Compiling MyProcFamily/data/languages/MyProc.slaspec: 141 | > WARN 104 NOP constructors found (SleighCompile) 142 | > WARN Use -n switch to list each individually (SleighCompile) 143 | > 144 | > WARN 30 NOP constructors found (SleighCompile) 145 | > WARN Use -n switch to list each individually (SleighCompile) 146 | > 147 | > 2 languages successfully compiled__ 148 | 149 | 6) Step 5 should create one .sla file for each input language. Copy those files .sla (not.slaspec) files into a seperate directory 150 | 7) Re-run Generator, but supplying the .sla directory as input: `./generator --input-sleigh-dir intermediate --processor-name SH2 --processor-family SuperH --endian big --alignment 2`. If all goes well Generator will parse and combine all of the .sla files into a single "SuperH" directory with all of the required files. 151 | 8) Verify that the created processor module directory is valid and compiles with Ghidra's SLEIGH compiler. The SLEIGH compiler script can be found in `ghidra/support/`. Run `sleigh -a `. There should be warnings about unimplemented p-code instructions but otherwise there should be no issues. If the compilation step fails, please submit an issue and upload your instructions.txt file and I will take a look at it. 
152 | 9) Now that you've compiled your processor module, you can run `generator-validator` to disassemble your input file and diff the results. This will help you find which instructions require modifications. Run with: `./generator-validator --input-file examples/sh2.txt --sla-file MyProcFamily/data/languages/MyProc.sla --output-file output.txt`. Diff the input file and the output file to find issues. If you find issues, manually correct the .slaspec and recompile with Ghidra's sleigh compiler. 153 | 10) If the processor successfully compiled you should be able to copy your MyProcessor directory to `/Ghidra/Processors/` directory. When you restart Ghidra your new processor should be listed. Make sure you open your binary as "raw" and manually select your processor module. 154 | 155 | ### Troubleshooting 156 | The most important step in troubleshooting is verifying that the output of --print-registers-only is correct. The registers list must only registers and no mnemonics or instruction components. The mnemonics list must not contain any registers. To remedy this you can manually add\remove registers from regsiter.h or use the --additional-registers command line option to add missing registers. 157 | 158 | There are also register names that are also mnemonics. For example: "b" can be the “b” register or a “branch“ instruction, "lsr" can be “line shift register” or “logical shift right“ instrution. It's important that Generator be told what's a register if it can't figure it out on it's own. 159 | 160 | While most instructions will parse with Generator, there are certain types of instructions that won't merge properly. For example in ARM: 161 | 162 | > 0x9eb4 push {r1, r2, r3, r4, r7} 163 | > 0x9fb4 push {r0, r1, r2, r3, r4, r7} 164 | > 0xa0b4 push {r5, r7} 165 | 166 | There is a variable list of registers depending on the instruction. Unfortunately this breaks Generator's merging algorithm and must be implemented by hand. 
I would recommend dropping such instructions from the input disassembly. 167 | 168 | ## Manual Next Steps (See Existing Processors for Examples): 169 | Now that you have verified Ghidra can load your processor module you can begin implementing p-code and other changes to get the decompiler to work. 170 | 171 | 1) Edit the .pspec, .cspec, .ldef files. 172 | 2) Edit the .slaspec file. Rename registers to make more sense. If an instruction uses an immediate and modifies it before displaying you will have to edit the instruction. 173 | 3) Implement p-code for all of the instructions in the .slaspec file to get decompiler support. 174 | 175 | ## Issues 176 | * Not all instruction sets are compatible 177 | * Display fields for immediates aren't handled. generator-validator will help show you these. 178 | * Doesn't work with PC relative addressing. Will require manual fix-ups. generator-validator will help show you these. 179 | * Won't work on instruction sets where bitfields are not contigious. Example if bits 0-2 and 4-6 are combined to compute an immediate value. 180 | * Not tested with floating point 181 | 182 | Please attach your input file when creating an issue. 183 | 184 | ## Future Work 185 | * Add support for specifying bit patterns as input 186 | 187 | ## Build 188 | `make generator` 189 | `make generator-validator GHIDRA_TRUNK=` (requires Ghidra's decompiler headers and libsla.a. GHIDRA_TRUNK points to a clone of Ghidra from trunk, not a release build of Ghidra) 190 | 191 | ### Build Dependencies 192 | libboost >= 1.76 is required 193 | libboost-dev 194 | libboost-filesystem-dev 195 | libboost-program-options-dev 196 | libboost-regex-dev 197 | libboost-system-dev 198 | libboost-thread-dev 199 | libboost-timer-dev 200 | libsla.a (only needed for generator-validator) 201 | 202 | #### Building libsla.a 203 | If you want to use generator-validator to validate your processor module against your input file, you will need to build Ghidra's libsla. 
204 | 1) Checkout Ghidra from trunk. A release build of Ghidra is not sufficient. `git clone https://github.com/NationalSecurityAgency/ghidra` This path will be your GHIDRA_TRUNK directory. 205 | 2) CD to the decompiler source directory: `cd ~/ghidra/Ghidra/Features/Decompiler/src/decompile/cpp` 206 | 3) Compile: `make libsla.a` 207 | 208 | ## License 209 | Licensed under the Apache 2.0 license. See LICENSE. 210 | -------------------------------------------------------------------------------- /bitspan.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: bitspan.cpp 3 | // 4 | // Calculate the longest span of bits that can be combined in an instruction 5 | // opcode 6 | // 7 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 8 | // Licensed under the Apache 2.0 License. 9 | //----------------------------------------------------------------------------- 10 | #include "bitspan.h" 11 | 12 | // initialize bitspan 13 | void initBitSpan(BITSPAN& bitSpan) 14 | { 15 | bitSpan.length = 0; 16 | bitSpan.replacementChar = '\0'; 17 | bitSpan.differencePosition = -1; 18 | bitSpan.bitPos = 0; 19 | bitSpan.hasZero = false; 20 | } 21 | 22 | // increment the bit span size 23 | void incrementBitSpan(BITSPAN& bitSpan) 24 | { 25 | bitSpan.length++; 26 | } 27 | 28 | // update longest bitspan if curr is longer 29 | // We only want to combine the longest bitspans 30 | void updateLongestBitSpan(BITSPAN& curr, BITSPAN& longest) 31 | { 32 | if(curr.length <= longest.length) 33 | { 34 | return; 35 | } 36 | 37 | // we only care if the current bitspan has a 0 that we can move to 1 38 | if(curr.hasZero == false) 39 | { 40 | // longer string but no zero 41 | return; 42 | } 43 | 44 | if(curr.replacementChar == '\0') 45 | { 46 | // should never happen 47 | return; 48 | } 49 | 50 | // update longest bitspan 51 | longest = curr; 52 | return; 53 | } 54 | 55 
// Overwrites the character at index pos with replacementChar, then walks
// backwards over the count characters that precede pos and overwrites each
// one that is still a literal '0' or '1'. Characters in that preceding range
// that have already been combined (anything other than '0'/'1') are left
// untouched. The character at pos itself is always overwritten.
//
// bitString       - opcode bit string, modified in place
// pos             - index of the last character of the span
// count           - number of characters before pos that belong to the span
// replacementChar - character written over the span
//
// Out-of-range requests are clamped rather than indexing outside the string:
// nothing is written when pos is past the end, and count is limited to pos so
// the backwards walk stops at index 0 instead of wrapping around.
void replacesBitsFromSpan(std::string& bitString,
                          unsigned int pos,
                          unsigned int count,
                          char replacementChar)
{
    if(pos >= bitString.length())
    {
        // nothing at pos to replace
        return;
    }

    bitString[pos] = replacementChar;

    if(count > pos)
    {
        // only pos characters exist in front of pos
        count = pos;
    }

    for(unsigned int i = pos - count; i < pos; i++)
    {
        if(bitString[i] == '0' || bitString[i] == '1')
        {
            bitString[i] = replacementChar;
        }
    }
}
// --------------------------------------------------------------------------------
// /bitspan.h:
// --------------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// File: bitspan.h
//
// Calculate the longest span of bits that can be combined in an instruction
// opcode
//
// Copyright (c) Oberoi Security Solutions. All rights reserved.
// Licensed under the Apache 2.0 License.
//-----------------------------------------------------------------------------
#pragma once

#include <string>

// represents a span of bits that can be combined
// together in the opcode bitstring
typedef struct _BITSPAN
{
    unsigned int length;    // number of bits in bitspan
    char replacementChar;   // character the span's bits are rewritten to;
                            // '\0' means "not chosen yet"
    unsigned int bitPos;    // index into the opcode bit string associated
                            // with this span
    int differencePosition; // position of the differing instruction
                            // component, -1 when there is none
    bool hasZero;           // true once a literal '0' bit is part of the span
} BITSPAN, *PBITSPAN;

// resets every field of bitSpan to its empty/default value
void initBitSpan(BITSPAN& bitSpan);
// grows bitSpan by one bit
void incrementBitSpan(BITSPAN& bitSpan);
// copies curr over longest when curr is a longer, usable span
void updateLongestBitSpan(BITSPAN& curr, BITSPAN& longest);
// rewrites the span ending at pos inside bitString with replacementChar
void replacesBitsFromSpan(std::string& bitString,
                          unsigned int pos,
                          unsigned int count,
                          char replacementChar);
// --------------------------------------------------------------------------------
// /combine.cpp:
// --------------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// File:
combine.cpp 3 | // 4 | // Combining instructions 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //------------------------------------------------------------------------------ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "combine.h" 14 | #include "bitspan.h" 15 | #include "thread_pool.h" 16 | 17 | static bool compareInstructionCombine(const INSTRUCTION_COMBINE& a, 18 | const INSTRUCTION_COMBINE& b); 19 | static bool areInstructionsCombinable(Instruction& a, Instruction& b, 20 | char& replacementChar, 21 | int& differencePosition); 22 | static void combineInstructionsWorker(PARSED_DATA& parsedData, 23 | const string& curBitString, 24 | Instruction* instruction, 25 | set& combinedInstructions, 26 | unordered_map& visitedInstructions); 27 | 28 | // Set of instructions to combine. It is populated by the workers but only 29 | // inserted into the parserData.combinedInstructions by the parent thread 30 | static set g_TempCombinedInstructions(compareInstructionCombine); 31 | 32 | // synchronize access to g_TempCombinedInstructions set 33 | static boost::mutex g_TempCombinedInstructionsMutex; 34 | 35 | // Custom comparator for inserting INSTRUCTION_COMBINEs into the 36 | // g_TempCombinedInstructions set. 
37 | // We want: 38 | // - higher counts (meaning more bits in the bit span) 39 | // - otherwise sort by lower opcode string 40 | static bool compareInstructionCombine(const INSTRUCTION_COMBINE& a, 41 | const INSTRUCTION_COMBINE& b) 42 | { 43 | if(a.length != b.length) 44 | { 45 | // comparison flipped here because we actually want 46 | // higher counts first in our set 47 | return a.length > b.length; 48 | } 49 | 50 | if(a.opcodeA != b.opcodeA) 51 | { 52 | return a.opcodeA < b.opcodeA; 53 | } 54 | 55 | if(a.opcodeB != b.opcodeB) 56 | { 57 | return a.opcodeB < b.opcodeB; 58 | } 59 | 60 | return false; 61 | } 62 | 63 | // Returns true if instruction a and b are combinable 64 | static bool areInstructionsCombinable(Instruction& a, 65 | Instruction& b, 66 | char& replacementChar, 67 | int& differencePosition) 68 | { 69 | bool isEqual = false; 70 | 71 | if(a.getOpcode().length() != b.getOpcode().length()) 72 | { 73 | // safety check against variable length instructions 74 | // shouldn't ever hit 75 | cout << "Attempting to combine different instruction length sizes!!" 
<< endl; 76 | throw 1; 77 | return false; 78 | } 79 | 80 | for(unsigned int j = 0; j < COMBINE_MAX; j++) 81 | { 82 | switch(j) 83 | { 84 | case COMBINE_DUPLICATES: 85 | isEqual = a.areInstructionComponentsEqual(&b); 86 | if(isEqual) 87 | { 88 | replacementChar = '*'; 89 | return true; 90 | } 91 | break; 92 | case COMBINE_IMMEDIATES: 93 | isEqual = a.areInstructionComponentsEqualExceptImmediate(&b, &differencePosition); 94 | if(isEqual == false) 95 | { 96 | isEqual = a.areInstructionComponentsEqualExceptNegativeSign(&b, &differencePosition, TYPE_IMMEDIATE); 97 | } 98 | 99 | if(isEqual) 100 | { 101 | replacementChar = a.getComponentLetterFromPosition(TYPE_IMMEDIATE, differencePosition); 102 | return true; 103 | } 104 | break; 105 | case COMBINE_REGISTERS: 106 | isEqual = a.areInstructionComponentsEqualExceptRegister(&b, &differencePosition); 107 | if(isEqual) 108 | { 109 | replacementChar = a.getComponentLetterFromPosition(TYPE_REGISTER, differencePosition); 110 | return true; 111 | } 112 | break; 113 | default: 114 | // BUGBUG: handle errors gracefully 115 | cout << "[-] Invalid combine type specified!!" << endl; 116 | return false; 117 | } 118 | } 119 | 120 | return false; 121 | } 122 | 123 | // Iterates over all bits of the curBitString and attempts to see if 124 | // instruction can be merged with any other instruction one bit away. If a 125 | // match candidate is found, inserts it into g_TempCombinedInstructions. 
126 | // Attempts to find the longest bit span of combinable instructions 127 | static void combineInstructionsWorker(PARSED_DATA& parsedData, 128 | const string& curBitString, 129 | Instruction* instruction, 130 | set& combinedInstructions, 131 | unordered_map& visitedInstructions) 132 | { 133 | BITSPAN longestBitSpan = {0, 0, 0, 0, 0}; 134 | BITSPAN curBitSpan = {0, 0, 0, 0, 0}; 135 | string spanBitString; 136 | 137 | // loop through each bit of the current instruction 138 | for(unsigned int i = 0; i < curBitString.length(); i++) 139 | { 140 | map:: iterator zeroItr; 141 | map:: iterator oneItr; 142 | string zeroBitString; 143 | string oneBitString; 144 | bool isEqual = false; 145 | bool hasZero = false; 146 | char replacementChar = '\0'; 147 | int differencePosition = -1; 148 | 149 | if(curBitString[i] != '0' && curBitString[i] != '1') 150 | { 151 | // this bit has already been combined 152 | // check if it increases our span 153 | if(curBitString[i] == curBitSpan.replacementChar) 154 | { 155 | incrementBitSpan(curBitSpan); 156 | } 157 | else 158 | { 159 | // we are starting a new bit span 160 | updateLongestBitSpan(curBitSpan, longestBitSpan); 161 | initBitSpan(curBitSpan); 162 | curBitSpan.length = 1; 163 | curBitSpan.replacementChar = curBitString[i]; 164 | } 165 | 166 | // this is already a combined instruction, no need to do more work 167 | continue; 168 | } 169 | 170 | zeroBitString = curBitString; 171 | oneBitString = curBitString; 172 | 173 | // create two opcoded bit strings: 174 | // - replace all bits in the span with 0s 175 | // - replace all bitgs in the span with 1s 176 | // both new opcode bit strings must be presented and combinable 177 | // for us to increase our bitspan count 178 | replacesBitsFromSpan(zeroBitString, i, curBitSpan.length, '0'); 179 | replacesBitsFromSpan(oneBitString, i, curBitSpan.length, '1'); 180 | 181 | if(curBitString[i] == '0') 182 | { 183 | hasZero = true; 184 | } 185 | 186 | // current bit position is 0, increment it to 
a 1 and see if another 187 | // string is there 188 | zeroItr = parsedData.combinedInstructions.find(zeroBitString); 189 | if(zeroItr == parsedData.combinedInstructions.end()) 190 | { 191 | // didn't find an adjacent instruction 192 | if(curBitSpan.length > 0) 193 | { 194 | i -= 1; 195 | } 196 | 197 | updateLongestBitSpan(curBitSpan, longestBitSpan); 198 | initBitSpan(curBitSpan); 199 | continue; 200 | } 201 | 202 | oneItr = parsedData.combinedInstructions.find(oneBitString); 203 | if(oneItr == parsedData.combinedInstructions.end()) 204 | { 205 | // didn't find an adjacent instruction 206 | if(curBitSpan.length > 0) 207 | { 208 | i -= 1; 209 | } 210 | 211 | updateLongestBitSpan(curBitSpan, longestBitSpan); 212 | initBitSpan(curBitSpan); 213 | continue; 214 | } 215 | 216 | // 217 | // We have a candidate adjacent instruction, check if they are 218 | // combinable 219 | // 220 | isEqual = areInstructionsCombinable(*zeroItr->second, 221 | *oneItr->second, 222 | replacementChar, 223 | differencePosition); 224 | 225 | // TODO: review this logic 226 | if(!isEqual) 227 | { 228 | // no match 229 | updateLongestBitSpan(curBitSpan, longestBitSpan); 230 | initBitSpan(curBitSpan); 231 | continue; 232 | } 233 | 234 | // check if instructions are combinable but not the same replacement 235 | // char 236 | if(replacementChar != curBitSpan.replacementChar) 237 | { 238 | updateLongestBitSpan(curBitSpan, longestBitSpan); 239 | initBitSpan(curBitSpan); 240 | incrementBitSpan(curBitSpan); 241 | 242 | if(hasZero) 243 | { 244 | curBitSpan.hasZero = true; 245 | curBitSpan.bitPos = i; 246 | } 247 | curBitSpan.replacementChar = replacementChar; 248 | continue; 249 | } 250 | 251 | if(isEqual) 252 | { 253 | if(hasZero && curBitSpan.hasZero == false) 254 | { 255 | curBitSpan.hasZero = true; 256 | curBitSpan.bitPos = i; 257 | curBitSpan.replacementChar = replacementChar; 258 | } 259 | 260 | if(curBitSpan.differencePosition == -1) 261 | { 262 | curBitSpan.differencePosition = 
differencePosition; 263 | } 264 | } 265 | 266 | incrementBitSpan(curBitSpan); 267 | 268 | } // for(unsigned int i = 0; i < curBitString.length(); i++) 269 | 270 | updateLongestBitSpan(curBitSpan, longestBitSpan); 271 | 272 | // if longestBitSpan.count is non-zero that means: 273 | // - we found at least one bit span to combine 274 | // - this is the longest one 275 | if(longestBitSpan.length > 0) 276 | { 277 | // two instructions to delete 278 | // new instruction to insert 279 | unordered_map::iterator itr; 280 | INSTRUCTION_COMBINE newCombine; 281 | string tempBitString; 282 | 283 | tempBitString = curBitString; 284 | tempBitString[longestBitSpan.bitPos] = '1'; 285 | 286 | // check if we already have a better match 287 | itr = visitedInstructions.find(tempBitString); 288 | if(itr != visitedInstructions.end()) 289 | { 290 | // we have seen this address already, check if our current span is better or worse 291 | if(longestBitSpan.length > itr->second) 292 | { 293 | // this new span is better, insert it in 294 | visitedInstructions.insert({{tempBitString, longestBitSpan.length}}); 295 | } 296 | else 297 | { 298 | // this new span is worse, ignore it 299 | return; 300 | } 301 | } 302 | else 303 | { 304 | // we haven't seen this address yet, add it in 305 | visitedInstructions.insert({{tempBitString, longestBitSpan.length}}); 306 | } 307 | 308 | // instructions are equal, combine them 309 | newCombine.length = longestBitSpan.length; 310 | newCombine.instruction = new Instruction(); 311 | *newCombine.instruction = *instruction; 312 | 313 | newCombine.opcodeA = curBitString; 314 | newCombine.opcodeB = tempBitString; 315 | 316 | tempBitString[longestBitSpan.bitPos] = longestBitSpan.replacementChar; 317 | 318 | //cout << "MATCH " << longestBitSpan.count << " " << longestBitSpan.replacementChar << " " << newCombine.opcodeA << " " << newCombine.opcodeB << " " << tempBitString << endl; 319 | 320 | newCombine.instruction->setOpcodeBitString(tempBitString); 321 | 
newCombine.instruction->setCombined(true); 322 | newCombine.instruction->setNeedsFree(true); 323 | 324 | if(longestBitSpan.differencePosition != -1) 325 | { 326 | newCombine.instruction->setComponentPositionCombined(longestBitSpan.differencePosition); 327 | } 328 | 329 | // insert our newly created instruction into our temp set it will be 330 | // sorted by bit span count so we can ensure we merge only the optimal 331 | // instructions into parsedData.combinedInstructions 332 | combinedInstructions.insert(std::move(newCombine)); 333 | } 334 | 335 | return; 336 | } 337 | 338 | static int combineInstructionsThread(PARSED_DATA& parsedData, 339 | unsigned long long start, 340 | unsigned long long end) 341 | { 342 | map::iterator startItr = parsedData.combinedInstructions.begin(); 343 | map::iterator endItr = parsedData.combinedInstructions.begin(); 344 | set combinedInstructions(compareInstructionCombine); 345 | unordered_map visitedInstructions; 346 | 347 | if(start >= parsedData.combinedInstructions.size() || 348 | end >= parsedData.combinedInstructions.size()) 349 | { 350 | cout << "Bad sizes!!\n"; 351 | cout << start << " " << end << " " << parsedData.combinedInstructions.size() << endl; 352 | throw 1; 353 | } 354 | 355 | if(start > end) 356 | { 357 | cout << "Bad sizes 2 !!\n"; 358 | cout << start << " " << end << " " << parsedData.combinedInstructions.size() << endl; 359 | throw 2; 360 | } 361 | 362 | std::advance(startItr, start); 363 | endItr = startItr; 364 | std::advance(endItr, end - start + 1); 365 | 366 | for(; startItr != endItr; startItr++) 367 | { 368 | combineInstructionsWorker(parsedData, 369 | startItr->first, 370 | startItr->second, 371 | combinedInstructions, 372 | visitedInstructions); 373 | } 374 | 375 | g_TempCombinedInstructionsMutex.lock(); 376 | g_TempCombinedInstructions.merge(combinedInstructions); 377 | g_TempCombinedInstructionsMutex.unlock(); 378 | 379 | incrementWorkerCompletions(); 380 | return 0; 381 | } 382 | 383 | // Queue each 
instruction to the thread pool to be combined by worker threads 384 | static unsigned int combineInstructionsScheduler(PARSED_DATA& parsedData) 385 | { 386 | boost::asio::thread_pool threadPool(parsedData.numThreads); 387 | unsigned long long numInstructions = 0; 388 | unsigned long long portionSize = 0; 389 | unsigned long long start = 0; 390 | unsigned int submissions = 0; 391 | 392 | resetThreadPool(); 393 | 394 | // 395 | // split the instructions into 1/num threads pieces 396 | // 397 | numInstructions = parsedData.combinedInstructions.size(); 398 | portionSize = numInstructions/parsedData.numThreads; 399 | 400 | if(portionSize == 0) 401 | { 402 | // we can end up with a 0 portionSize if numThreads > numInstructions 403 | portionSize = 1; 404 | } 405 | 406 | for(unsigned int i = 0; i < parsedData.numThreads; i++) 407 | { 408 | unsigned long long end = 0; 409 | 410 | start = i * portionSize; 411 | 412 | if(i == parsedData.numThreads - 1) 413 | { 414 | // last thread, always set end to numInstructions 415 | end = numInstructions - 1; 416 | } 417 | else 418 | { 419 | end = start + portionSize - 1; 420 | } 421 | 422 | if(start >= numInstructions) 423 | { 424 | continue; 425 | } 426 | 427 | // queue a worker to work on 1/n of the disassembly 428 | boost::asio::post(threadPool, 429 | boost::bind(combineInstructionsThread, 430 | boost::ref(parsedData), 431 | start, 432 | end)); 433 | submissions++; 434 | } 435 | 436 | // wait for threads 437 | // TODO: improve poll logic 438 | while(1) 439 | { 440 | boost::this_thread::sleep(boost::posix_time::milliseconds(100)); 441 | unsigned int completedCount = getWorkerCompletions(); 442 | 443 | // check if we finished our submitted jobs 444 | if(completedCount >= submissions) 445 | { 446 | // finished 447 | break; 448 | } 449 | } 450 | 451 | threadPool.join(); 452 | 453 | // short-circuit exit if we didn't combine any instructions during this 454 | // loop 455 | if(g_TempCombinedInstructions.size() == 0) 456 | { 457 | //cout << 
" [*] No instructions combined during pass. Short-circuiting" << endl; 458 | return 0; 459 | } 460 | 461 | //cout << "g_TempCombinedInstructions: " << g_TempCombinedInstructions.size() << endl; 462 | 463 | g_TempCombinedInstructionsMutex.lock(); 464 | 465 | // Update parsedData.combinedInstructions with the newly created combined 466 | // instructions. Remove two instructions for every onec combined 467 | // instruction we add back in. 468 | for(set:: iterator currItr = g_TempCombinedInstructions.begin(); 469 | currItr != g_TempCombinedInstructions.end(); 470 | currItr++) 471 | { 472 | // Verify both opcodeA and opcodeB are present. It's possible we remove 473 | // one or both while combining another instruction 474 | auto tempItr = parsedData.combinedInstructions.find(currItr->opcodeA); 475 | if(tempItr == parsedData.combinedInstructions.end()) 476 | { 477 | delete currItr->instruction; 478 | continue; 479 | } 480 | 481 | auto tempItr2 = parsedData.combinedInstructions.find(currItr->opcodeB); 482 | if(tempItr2 == parsedData.combinedInstructions.end()) 483 | { 484 | delete currItr->instruction; 485 | continue; 486 | } 487 | 488 | // We only delete instructions that were previously combined as they 489 | // were allocated here. 490 | if(tempItr->second->getNeedsFree() == true) 491 | { 492 | delete tempItr->second; 493 | } 494 | parsedData.combinedInstructions.erase(tempItr); 495 | 496 | if(tempItr2->second->getNeedsFree() == true) 497 | { 498 | delete tempItr2->second; 499 | } 500 | parsedData.combinedInstructions.erase(tempItr2); 501 | 502 | // insert the new combined instruction 503 | parsedData.combinedInstructions.insert({{std::move(currItr->instruction->getOpcode()), 504 | currItr->instruction}}); 505 | } 506 | 507 | g_TempCombinedInstructions.clear(); 508 | g_TempCombinedInstructionsMutex.unlock(); 509 | 510 | return 1; 511 | } 512 | 513 | // Attempts to combine instructions into one. 
To combine two instructions into 514 | // one: 515 | // -- the opcodes must bit one bit apart 516 | // -- the instructions must be identical (COMBINE_DUPLICATE) 517 | // -- the instructions must be identical except for an immediate field 518 | // (COMBINE_IMMEDIATE) 519 | // -- the instructions must be identical except for a register field 520 | // (COMBINE_REGISTER) 521 | // 522 | // When we find two instructions to combine we must: 523 | // -- remove the first instruction from combinedInstructions set 524 | // -- remove the second instruction from the combinedInstruction set 525 | // -- change the shared bit to another character such as: 526 | // ---- '*' for duplicates 527 | // ---- lowercase letter for immediates 528 | // ---- uppercase letter for registers 529 | // -- create a new combined instruction and add it to the combinedInstructions 530 | // set 531 | // -- ensure that we have the best (AKA longest) combination possible 532 | // 533 | // Because we are inserting and deleting while iterating through the set we need be 534 | // careful with our iterators 535 | // 536 | void combineInstructions(PARSED_DATA& parsedData) 537 | { 538 | boost::timer::auto_cpu_timer t; 539 | unsigned int result = 0; 540 | 541 | // worst case we must run this algorithm once for every bit in the opcode 542 | // we have a short-circuit exit if execute a loop without combining any 543 | // instructions 544 | // TODO: is this still true with failures in combining?? 
545 | for(unsigned int k = 0; k < parsedData.maxOpcodeBits; k++) 546 | { 547 | cout << " [*] Pass: " << k << " Instructions: " << parsedData.combinedInstructions.size() << endl; 548 | 549 | result = combineInstructionsScheduler(parsedData); 550 | if(result == 0) 551 | { 552 | // no more to combine, return early 553 | return; 554 | } 555 | } 556 | 557 | return; 558 | } 559 | -------------------------------------------------------------------------------- /combine.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: combine.h 3 | // 4 | // Combining instructions 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | #pragma once 10 | 11 | #include "parser.h" 12 | 13 | // two instructions to combine to a single one 14 | typedef struct _INSTRUCTION_COMBINE 15 | { 16 | unsigned int length; // count of bits being combined 17 | Instruction* instruction; 18 | string opcodeA; 19 | string opcodeB; 20 | } INSTRUCTION_COMBINE, *PINSTRUCTION_COMBINE; 21 | 22 | void combineInstructions(PARSED_DATA& parsedData); 23 | -------------------------------------------------------------------------------- /examples/ethereum.txt: -------------------------------------------------------------------------------- 1 | 0x00 STOP 2 | 0x01 ADD 3 | 0x02 MUL 4 | 0x03 SUB 5 | 0x04 DIV 6 | 0x05 SDIV 7 | 0x06 MOD 8 | 0x07 SMOD 9 | 0x08 ADDMOD 10 | 0x09 MULMOD 11 | 0x0a EXP 12 | 0x0b SIGNEXTEND 13 | 0x10 LT 14 | 0x11 GT 15 | 0x12 SLT 16 | 0x13 SGT 17 | 0x14 EQ 18 | 0x15 ISZERO 19 | 0x16 AND 20 | 0x17 OR 21 | 0x18 XOR 22 | 0x19 NOT 23 | 0x1a BYTE 24 | 0x20 SHA3 25 | 0x30 ADDRESS 26 | 0x31 BALANCE 27 | 0x32 ORIGIN 28 | 0x33 CALLER 29 | 0x34 CALLVALUE 30 | 0x35 CALLDATALOAD 31 | 0x36 CALLDATASIZE 32 | 0x37 CALLDATACOPY 33 | 0x38 
CODESIZE 34 | 0x39 CODECOPY 35 | 0x3a GASPRICE 36 | 0x3b EXTCODESIZE 37 | 0x3c EXTCODECOPY 38 | 0x40 BLOCKHASH 39 | 0x41 COINBASE 40 | 0x42 TIMESTAMP 41 | 0x43 NUMBER 42 | 0x44 DIFFICULTY 43 | 0x45 GASLIMIT 44 | 0x50 POP 45 | 0x51 MLOAD 46 | 0x52 MSTORE 47 | 0x53 MSTORE8 48 | 0x54 SLOAD 49 | 0x55 SSTORE 50 | 0x56 JUMP 51 | 0x57 JUMPI 52 | 0x58 PC 53 | 0x59 MSIZE 54 | 0x5a GAS 55 | 0x5b JUMPDEST 56 | 0x80 DUP1 57 | 0x81 DUP2 58 | 0x82 DUP3 59 | 0x83 DUP4 60 | 0x84 DUP5 61 | 0x85 DUP6 62 | 0x86 DUP7 63 | 0x87 DUP8 64 | 0x88 DUP9 65 | 0x89 DUP10 66 | 0x8a DUP11 67 | 0x8b DUP12 68 | 0x8c DUP13 69 | 0x8d DUP14 70 | 0x8e DUP15 71 | 0x8f DUP16 72 | 0x90 SWAP1 73 | 0x91 SWAP2 74 | 0x92 SWAP3 75 | 0x93 SWAP4 76 | 0x94 SWAP5 77 | 0x95 SWAP6 78 | 0x96 SWAP7 79 | 0x97 SWAP8 80 | 0x98 SWAP9 81 | 0x99 SWAP10 82 | 0x9a SWAP11 83 | 0x9b SWAP12 84 | 0x9c SWAP13 85 | 0x9d SWAP14 86 | 0x9e SWAP15 87 | 0x9f SWAP16 88 | 0xa0 LOG0 89 | 0xa1 LOG1 90 | 0xa2 LOG2 91 | 0xa3 LOG3 92 | 0xa4 LOG4 93 | 0xf0 CREATE 94 | 0xf1 CALL 95 | 0xf2 CALLCODE 96 | 0xf3 RETURN 97 | 0xf4 DELEGATE_CALL 98 | 0xf5 CREATE2 99 | 0xfa STATICCALL 100 | 0xfd REVERT 101 | 0xff SUICIDE 102 | -------------------------------------------------------------------------------- /instruction.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: instruction.h 3 | // 4 | // Instruction class definition 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //----------------------------------------------------------------------------- 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include "slautil/slautil.h" 15 | using namespace std; 16 | 17 | enum InstructionComponentType 18 | { 19 | TYPE_INSTRUCTION = 0, 20 | TYPE_REGISTER, 21 | TYPE_IMMEDIATE, 22 | TYPE_SIGNED_IMMEDIATE, // TODO: not used 23 | TYPE_MAX, // Not a valid type, must be the last one 24 | }; 25 | 26 | class InstructionComponent 27 | { 28 | public: 29 | InstructionComponent(const InstructionComponentType newType, const string &newComponent); 30 | InstructionComponent(const InstructionComponentType newType, const string &newComponent, bool isCombined); 31 | 32 | 33 | // BUGBUG: should these really be public? I'm treating InstructionComponent more as struct than as a class 34 | InstructionComponentType type; 35 | string component; 36 | string combinedComponent; 37 | bool isCombined; 38 | }; 39 | 40 | class Instruction 41 | { 42 | public: 43 | // gets and sets the opcode bitstring 44 | string getOpcode(void); 45 | void setOpcode(const string &opcodeBitString); // opcode must be a hex string begining with 0x 46 | void setOpcodeBitString(const string &newOpcode); // opcode is a binary string without a prefix 47 | 48 | // gets and sets the combined flag 49 | // BUGBUG: do I really need this?? 
50 | bool getCombined(void); 51 | void setCombined(bool isCombined); 52 | int setComponentPositionCombined(const unsigned int componentPosition); 53 | 54 | bool getNeedsFree(void); 55 | void setNeedsFree(bool needsToBeFreed); 56 | 57 | // adds a new instruction component to the instruction 58 | void addComponent(const InstructionComponentType newType, const string &newComponent); 59 | void addComponent(const InstructionComponentType newType, const string &newComponent, bool isCombined); 60 | 61 | // helper functions for identifying combined bits by letters 62 | char getComponentLetterFromPosition(const InstructionComponentType type, const unsigned int componentPosition); 63 | unsigned int getComponentPositionFromLetter(const char componentLetter); 64 | 65 | // prints the instruction 66 | string printInstruction(set& tokenInstructions); 67 | string getInstructionOutputString(bool getCombined, bool escapeDuplicateRegisters); 68 | int getInstructionDuplicatedRegisters(bool getCombined, map& duplicatedRegisters); 69 | string getOpcodeOutputString(set& tokenInstructions); 70 | 71 | // basic checks that the instruction is sane 72 | bool validateInstruction(void); 73 | 74 | // tests to check if two instruction can be combined 75 | bool areInstructionComponentsEqual(Instruction* right); 76 | bool areInstructionComponentsEqualExceptImmediate(Instruction* right, int* differencePosition); 77 | bool areInstructionComponentsEqualExceptNegativeSign(Instruction* right, int* differencePosition, InstructionComponentType componentType); 78 | bool areInstructionComponentsEqualExceptRegister(Instruction* right, int* differencePosition); 79 | 80 | // for creating the .slaspec 81 | void separateOpcode(); 82 | int computeAttachVariables(map& allInstructions, map& attachVariables, vector& slas); 83 | int generateAttachedRegisters(string opcode, unsigned int regStart, unsigned int regEnd, map& allInstructions, vector& slas, string& foundRegisters); 84 | 85 | //private: 86 | string opcode; 
// entire opcode of instruction in binary 87 | vector splitOpcode; // opcode split into individual components 88 | vector components; // the instruction broken up as components 89 | bool combined; // is the instruction combined or not 90 | bool needsFree; 91 | }; 92 | 93 | // misc utility functions used by the Instruction class 94 | // BUGBUG: should this be a part of the class?? 95 | int convertHexNibbletoInteger(unsigned char x); 96 | bool isInstructionComponentFiller(string& str); 97 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: main.cpp 3 | // 4 | // Handles command line argument parsing and calling the parsing, combining, 5 | // and output routines. 6 | // 7 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 8 | // Licensed under the Apache 2.0 License. 
9 | //----------------------------------------------------------------------------- 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "combine.h" 17 | #include "parser.h" 18 | #include "parser_sla.h" 19 | #include "output.h" 20 | using namespace std; 21 | 22 | using namespace boost::filesystem; 23 | 24 | int generateFromSleigh(PARSED_DATA& parsedData, 25 | bool printRegistersOnly, 26 | bool skipInstructionCombining); 27 | int generateFromText(PARSED_DATA& parsedData, 28 | bool printRegistersOnly, 29 | bool skipInstructionCombining); 30 | int readFilenamesFromDirectory(PARSED_DATA& parsedData, 31 | const string& dirPath, 32 | const string& extension); 33 | 34 | int main(int argc, char *argv[]) 35 | { 36 | boost::program_options::options_description desc{"Ghidra Processor Module Generator"}; 37 | boost::program_options::variables_map args; 38 | vector additionalRegisters; // list of additional registers passed 39 | // in at the command line 40 | PARSED_DATA parsedData; 41 | bool skipInstructionCombining; // if set, skip attempting to combine 42 | // instructions. Useful for debugging 43 | bool printRegistersOnly; // if set parse the instruction set and only 44 | // display the registers. Useful for debugging purposes. 
45 | bool parseSleigh; // if set the input is .sla, not disassembly text 46 | string inputFilename; 47 | string inputDirectory; 48 | boost::timer::auto_cpu_timer t; 49 | int result = 0; 50 | 51 | parsedData.maxOpcodeBits = 0; 52 | skipInstructionCombining = false; 53 | printRegistersOnly = false; 54 | parseSleigh = false; 55 | 56 | cout << "Ghidra Processor Module Generator" << endl; 57 | 58 | // 59 | // command line arg parsing 60 | // 61 | 62 | try 63 | { 64 | desc.add_options() 65 | ("input-disassembly,i", boost::program_options::value(&inputFilename), "Path to a newline delimited text file containing all opcodes and instructions for the processor module.") 66 | ("input-disassembly-dir", boost::program_options::value(&inputDirectory), "Path to a directory with multiple newline delimited text files containing all opcodes and instructions for the processor module.") 67 | ("input-sleigh,s", boost::program_options::value(&inputFilename), "Path to a XML .sla file containing all opcodes and instructions for the processor module.") 68 | ("input-sleigh-dir", boost::program_options::value(&inputDirectory), "Path to a directory with multiple XML .sla files containing all opcodes and instructions for the processor module.") 69 | ("num-threads,t", boost::program_options::value(&parsedData.numThreads), "Number of worker threads to use. Optional. Defaults to number of physical CPUs if not specified") 70 | ("processor-name,n",boost::program_options::value(&parsedData.processorName)->default_value("MyProc"), "Name of the target processor. Defaults to \"MyProc\" if not specified") 71 | ("processor-family,f",boost::program_options::value(&parsedData.processorFamily)->default_value("MyProcFamily"), "Name of the target processor's family. Defaults to \"MyProcFamily\" if not specified") 72 | ("endian,e", boost::program_options::value(&parsedData.endianness)->default_value("big"), "Endianness of the processor. Must be either \"little\" or \"big\". 
Defaults to big if not specified") 73 | ("alignment,a", boost::program_options::value(&parsedData.alignment)->default_value(1), "Instruction alignment of the processor. Defaults to 1 if not specified") 74 | ("bitness,b", boost::program_options::value(&parsedData.bitness)->default_value(32), "Bitness of the processor. Defaults to 32 if not specified") 75 | ("print-registers-only", boost::program_options::bool_switch(&printRegistersOnly), "Only print parsed registers. Useful for debugging purposes. False by default") 76 | ("omit-opcodes", boost::program_options::bool_switch(&parsedData.omitOpcodes)->default_value(false), "Don't print opcodes in the outputted .sla file. False by default") 77 | ("omit-example-instructions", boost::program_options::bool_switch(&parsedData.omitExampleInstructions)->default_value(false), "Don't print example combined instructions in the outputted .sla file. False by default") 78 | ("skip-instruction-combining", boost::program_options::bool_switch(&skipInstructionCombining), "Don't combine instructions. Useful for debugging purposes. False by default") 79 | ("additional-registers,ar", boost::program_options::value>(&additionalRegisters)->multitoken(), "List of additional registers. 
Use this option if --print-registers-only is missing registers for your instruction set") 80 | ("help,h", "Help screen"); 81 | 82 | store(parse_command_line(argc, argv, desc), args); 83 | notify(args); 84 | 85 | if(args.count("help") || argc == 1) 86 | { 87 | cout << desc << endl; 88 | return 0; 89 | } 90 | 91 | if(parsedData.endianness != "big" && parsedData.endianness != "little") 92 | { 93 | cout << "Processor endianness must be either little or big" << endl; 94 | return -1; 95 | } 96 | 97 | // make sure exactly one input method is specified by the user 98 | int inputFlagCount = args.count("input-disassembly") + 99 | args.count("input-disassembly-dir") + 100 | args.count("input-sleigh") + 101 | args.count("input-sleigh-dir"); 102 | if(inputFlagCount != 1) 103 | { 104 | cout << "Specifiy exactly one of: --input-disassembly,--input-disassembly-dir, --input-sleigh, or --input-sleigh-dir" << endl; 105 | return -1; 106 | } 107 | 108 | if(args.count("input-disassembly") != 0 && 109 | args.count("input-disassembly-dir") != 0) 110 | { 111 | cout << "Specify either input disassembly file or dir, not both!!" 
<< endl; 112 | return -1; 113 | } 114 | 115 | if(args.count("input-disassembly") != 0) 116 | { 117 | parsedData.inputFilenames.push_back(inputFilename); 118 | } 119 | 120 | if(args.count("input-disassembly-dir") != 0) 121 | { 122 | result = readFilenamesFromDirectory(parsedData, 123 | inputDirectory, 124 | "*"); 125 | if(result != 0) 126 | { 127 | return result; 128 | } 129 | } 130 | 131 | if(args.count("input-sleigh") != 0) 132 | { 133 | parsedData.inputFilenames.push_back(inputFilename); 134 | parseSleigh = true; 135 | } 136 | 137 | if(args.count("input-sleigh-dir") != 0) 138 | { 139 | result = readFilenamesFromDirectory(parsedData, 140 | inputDirectory, 141 | ".sla"); 142 | if(result != 0) 143 | { 144 | cout << "Failed to find any .sla files" << endl; 145 | return result; 146 | } 147 | parseSleigh = true; 148 | } 149 | 150 | if(parsedData.inputFilenames.size() == 0) 151 | { 152 | cout << "Failed to find input files" << endl; 153 | return -1; 154 | } 155 | 156 | if(args.count("num-threads") == 0) 157 | { 158 | // user didn't specify number of threads 159 | // default to number of physical cpus 160 | parsedData.numThreads = boost::thread::physical_concurrency(); 161 | if(parsedData.numThreads == 0) 162 | { 163 | cout << "Unable to determine number of CPUs. Please specify thread count with --num-threads at the command line." 
<< endl; 164 | return -1; 165 | } 166 | } 167 | 168 | if(parsedData.numThreads == 0) 169 | { 170 | cout << "Invalid number of threads specified" << endl; 171 | return -1; 172 | } 173 | } 174 | catch (const boost::program_options::error &ex) 175 | { 176 | cout << "[-] Error parsing command line: " << ex.what() << endl; 177 | return -1; 178 | } 179 | 180 | cout << "[*] Using " << parsedData.numThreads << " worker thread(s)" << endl; 181 | 182 | // 183 | // initialize the default set of registers from Ghidra 184 | // 185 | cout << "[*] Initializing default Ghidra registers" << endl; 186 | result = initRegisters(); 187 | if(result != 0) 188 | { 189 | cout << "[-] Failed to initialize default Ghidra registers!!" << endl; 190 | goto ERROR_CLEANUP; 191 | } 192 | 193 | result = addRegisters(additionalRegisters); 194 | if(result != 0) 195 | { 196 | cout << "[-] Failed to add additional registers!!" << endl; 197 | goto ERROR_CLEANUP; 198 | } 199 | 200 | if(parseSleigh == false) 201 | { 202 | // user supplied one or more text files of disassembly 203 | result = generateFromText(parsedData, 204 | printRegistersOnly, 205 | skipInstructionCombining); 206 | if(!result) 207 | { 208 | return result; 209 | } 210 | } 211 | else 212 | { 213 | // user supplied one or more .sla files 214 | result = generateFromSleigh(parsedData, 215 | printRegistersOnly, 216 | skipInstructionCombining); 217 | if(!result) 218 | { 219 | return result; 220 | } 221 | } 222 | 223 | ERROR_CLEANUP: 224 | clearParserData(parsedData, false); 225 | return result; 226 | } 227 | 228 | // search directory for all files of extension type 229 | int readFilenamesFromDirectory(PARSED_DATA& parsedData, 230 | const string& dirPath, 231 | const string& extension) 232 | { 233 | if(!is_directory(dirPath)) 234 | { 235 | cout << "Invalid directory: " << dirPath << endl; 236 | return -1; 237 | } 238 | 239 | for(auto& dir_entry : boost::make_iterator_range(directory_iterator(dirPath), {})) 240 | { 241 | if(extension == "*" || 
extension == dir_entry.path().extension()) 242 | { 243 | parsedData.inputFilenames.push_back(dir_entry.path().string()); 244 | } 245 | } 246 | 247 | // make sure we have at least one input file 248 | if(parsedData.inputFilenames.size() == 0) 249 | { 250 | cout << "Failed to find any input files in: " << dirPath << endl; 251 | return -1; 252 | } 253 | 254 | // TODO: numeric sort vs alpha sort? 255 | sort(parsedData.inputFilenames.begin(), parsedData.inputFilenames.end()); 256 | 257 | return 0; 258 | } 259 | 260 | // Generate one or more .sla files from the supplied text disassembly files 261 | int generateFromText(PARSED_DATA& parsedData, 262 | bool printRegistersOnly, 263 | bool skipInstructionCombining) 264 | { 265 | int result = 0; 266 | 267 | for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++) 268 | { 269 | // 270 | // read the input file and parse the instructions into parsedData 271 | // 272 | cout << "[*] Parsing instructions " << parsedData.inputFilenames[i] << endl; 273 | 274 | result = parseInstructions(parsedData, i); 275 | if(result != 0) 276 | { 277 | cout << "[-] Failed to parse instructions" << endl; 278 | goto ERROR_CLEANUP; 279 | } 280 | cout << "[*] Parsed " << parsedData.allInstructions.size() << " instructions" << endl; 281 | 282 | // only print registers and exit if option is set 283 | if(printRegistersOnly) 284 | { 285 | goto CONTINUE_LOOP; 286 | } 287 | 288 | // 289 | // combine the instructions and process data for output 290 | // 291 | 292 | // skip combining if option is set 293 | if(skipInstructionCombining == false) 294 | { 295 | cout << "[*] Combining instructions" << endl; 296 | combineInstructions(parsedData); 297 | } 298 | 299 | cout << "[*] Computing attach registers" << endl; 300 | computeAttachVariables(parsedData); 301 | 302 | cout << "[*] Computing token instructions" << endl; 303 | computeTokenInstructions(parsedData); 304 | 305 | // 306 | // Output the completed Ghidra Processor Specification 307 | // 308 | 309 | 
cout << "[*] Generating Ghidra processor specification" << endl; 310 | createProcessorModule(parsedData, i); 311 | 312 | CONTINUE_LOOP: 313 | clearParserData(parsedData, printRegistersOnly); 314 | result = 0; 315 | } // for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++) 316 | 317 | // only print registers and exit if option is set 318 | if(printRegistersOnly) 319 | { 320 | cout << "[*] Found registers: " << getOutputRegisters(parsedData) << endl; 321 | cout << "[*] Found mnemonics: " << getOutputMnemonics(parsedData) << endl; 322 | cout << "If there are any issues edit registers.h before proceeding." << endl; 323 | 324 | result = 0; 325 | goto ERROR_CLEANUP; 326 | } 327 | 328 | cout << "[*] Creating .ldefs" << endl; 329 | result = createLdefs(parsedData); 330 | if(result != 0) 331 | { 332 | return result; 333 | } 334 | 335 | ERROR_CLEANUP: 336 | clearParserData(parsedData, false); 337 | return result; 338 | } 339 | 340 | // Generate a .sla files from the one or more supplied .sla files 341 | int generateFromSleigh(PARSED_DATA& parsedData, 342 | bool printRegistersOnly, 343 | bool skipInstructionCombining) 344 | { 345 | int result = 0; 346 | 347 | for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++) 348 | { 349 | // 350 | // read the input file and parse the instructions into parsedData 351 | // 352 | cout << "[*] Parsing instructions: " << parsedData.inputFilenames[i] << endl; 353 | 354 | result = parseInstructionsSla(parsedData, i); 355 | if(result != 0) 356 | { 357 | cout << "[-] Failed to parse instructions" << endl; 358 | goto ERROR_CLEANUP; 359 | } 360 | cout << "[*] Parsed " << parsedData.combinedInstructions.size() << " instructions" << endl; 361 | 362 | // only print registers and exit if option is set 363 | if(printRegistersOnly) 364 | { 365 | continue; 366 | } 367 | } 368 | 369 | // only print registers and exit if option is set 370 | if(printRegistersOnly) 371 | { 372 | cout << "[*] Found registers: " << 
getOutputRegisters(parsedData) << endl; 373 | cout << "If there are any issues edit registers.h before proceeding." << endl; 374 | result = 0; 375 | goto ERROR_CLEANUP; 376 | } 377 | 378 | // 379 | // combine the instructions and process data for output 380 | // 381 | 382 | // skip combining if option is set 383 | if(skipInstructionCombining == false) 384 | { 385 | cout << "[*] Combining instructions" << endl; 386 | combineInstructions(parsedData); 387 | } 388 | 389 | cout << "[*] Computing attach registers" << endl; 390 | computeAttachVariables(parsedData); 391 | 392 | cout << "[*] Computing token instructions" << endl; 393 | computeTokenInstructions(parsedData); 394 | 395 | // 396 | // Output the completed Ghidra Processor Specification 397 | // 398 | 399 | cout << "[*] Generating Ghidra processor specification" << endl; 400 | createProcessorModule(parsedData, 0); 401 | 402 | cout << "[*] Created Processor Module Directory" << endl; 403 | 404 | cout << " [*] Creating .ldefs" << endl; 405 | parsedData.inputFilenames.resize(1); // TODO: make this a flag to createLdefs 406 | result = createLdefs(parsedData); 407 | if(result != 0) 408 | { 409 | return result; 410 | } 411 | 412 | ERROR_CLEANUP: 413 | clearParserData(parsedData, false); 414 | return result; 415 | } 416 | -------------------------------------------------------------------------------- /output.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: output.cpp 3 | // 4 | // Outputs the files that comprise the Ghidra processor module. 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | 10 | #include 11 | #include "output.h" 12 | 13 | #include 14 | 15 | // Creates the directory structure required by the processor spec. 
Processor 16 | // specs must be in the /data/languages/ directory structure 17 | int createDirectoryStructure(PARSED_DATA& parsedData) 18 | { 19 | bool result = false; 20 | 21 | boost::filesystem::path p{parsedData.processorFamily}; 22 | 23 | p.append("data"); 24 | p.append("languages"); 25 | 26 | if(boost::filesystem::exists(p) && boost::filesystem::is_directory(p)) 27 | { 28 | // directory already exists 29 | return 0; 30 | } 31 | 32 | // create the directory 33 | // BUGBUG: catch exceptions or use no throw 34 | result = boost::filesystem::create_directories(p); 35 | if(result == false) 36 | { 37 | cout << " [-] Failed to create processor directories!!" << endl; 38 | return -1; 39 | } 40 | 41 | return 0; 42 | } 43 | 44 | // Creates an empty Module.manifest inside the directory. 45 | // Unsure why this is required by Ghidra 46 | // /Module.manifest 47 | int createModuleManifest(PARSED_DATA& parsedData) 48 | { 49 | boost::filesystem::path p{parsedData.processorFamily}; 50 | 51 | // BUGBUG: why is this file needed? 52 | p.append("Module.manifest"); 53 | 54 | boost::filesystem::ofstream ofs(p); 55 | ofs.close(); 56 | 57 | return 0; 58 | } 59 | 60 | // Creates the bare minimum processor cspec file required to be loaded into 61 | // Ghidra. 
// It is up to the enduser to fully define this file to get decompiler
// support to work
//
// Output file: <processorFamily>/data/languages/<processorFamily>.cspec
// Always returns 0; the stream state is not checked.
//
// NOTE(review): every literal written below consists only of tabs and a
// newline. This listing lost all text between '<' and '>' during extraction,
// so the XML markup that presumably belongs in these strings is missing -
// compare with the repository before relying on them.
int createCspec(PARSED_DATA& parsedData)
{
    string cspecFilename;

    boost::filesystem::path p{parsedData.processorFamily};

    cspecFilename = parsedData.processorFamily + ".cspec";

    // <processorFamily>/data/languages/<processorFamily>.cspec
    p.append("data");
    p.append("languages");
    p.append(cspecFilename);

    boost::filesystem::ofstream ofs(p);

    ofs << "\n";
    ofs << "\n";
    ofs << "\n";
    ofs << "\n";
    ofs << "\t\n";
    ofs << "\t\t\n";
    ofs << "\t\t\t\n";
    ofs << "\t\t\t\n";
    ofs << "\t\t\n";
    ofs << "\t\n";
    ofs << "\n";

    ofs.close();
    return 0;
}

// creates the bare minimum processor ldefs file required to be loaded into
// Ghidra. Uses values passed in at the command line to fill out the file.
//
// Output file: <processorFamily>/data/languages/<processorFamily>.ldefs
// One entry is written per element of parsedData.inputFilenames.
// Always returns 0; the stream state is not checked.
//
// NOTE(review): the loop body is incomplete in this listing. The statements
// that originally sat between the first "\t\n" line and the description line
// were lost in extraction together with the XML markup of the remaining
// literals - compare with the repository.
int createLdefs(PARSED_DATA& parsedData)
{
    // times this function; the variable is not referenced again below
    boost::timer::auto_cpu_timer t;
    string ldefsFilename;
    string bigOrLittle;

    boost::filesystem::path p{parsedData.processorFamily};

    ldefsFilename = parsedData.processorFamily + ".ldefs";

    // <processorFamily>/data/languages/<processorFamily>.ldefs
    p.append("data");
    p.append("languages");
    p.append(ldefsFilename);

    boost::filesystem::ofstream ofs(p);

    // anything other than "big" is written as little endian
    if(parsedData.endianness == "big")
    {
        bigOrLittle = "BE";
    }
    else
    {
        bigOrLittle = "LE";
    }

    ofs << "\n";
    ofs << "\n";
    ofs << "\n";
    ofs << "\n";

    for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++)
    {
        ofs << "\t\n";
        // human readable description line of the entry
        ofs << "\t\t" << parsedData.processorFamily << " " << parsedData.processorName << " processor " << parsedData.bitness << "-bit " << bigOrLittle << "\n";
        ofs << "\t\t\n";
        ofs << "\t\n";
    }
    ofs << "\n";

    ofs.close();

    return 0;
}

// Creates the bare
minimum processor pspec file required to be loaded into 158 | // Ghidra. It is up to the enduser to fully define this file to get decompiler 159 | // support to work 160 | // /data/languages/.pspec 161 | int createPspec(PARSED_DATA& parsedData) 162 | { 163 | string pspecFilename; 164 | 165 | boost::filesystem::path p{parsedData.processorFamily}; 166 | 167 | pspecFilename = parsedData.processorFamily + ".pspec"; 168 | 169 | p.append("data"); 170 | p.append("languages"); 171 | p.append(pspecFilename); 172 | 173 | boost::filesystem::ofstream ofs(p); 174 | 175 | ofs << "\n"; 176 | ofs << "\n"; 177 | ofs << "\n"; 178 | ofs << "\t\n"; 179 | ofs << "\n"; 180 | 181 | ofs.close(); 182 | 183 | return 0; 184 | } 185 | 186 | // Uses the filled out parsedData structure to create a .slaspec file, 187 | // the core of the processor module. This file contains all of the registers, 188 | // defined tokens, and instructions of the instruction set 189 | // /data/languages/.slaspec 190 | int createSlaspec(PARSED_DATA& parsedData, unsigned int fileId) 191 | { 192 | string pspecFilename; 193 | 194 | boost::filesystem::path p{parsedData.processorFamily}; 195 | 196 | if(fileId == 0) 197 | { 198 | pspecFilename = parsedData.processorName + ".slaspec"; 199 | } 200 | else 201 | { 202 | pspecFilename = parsedData.processorName + to_string(fileId) + ".slaspec"; 203 | } 204 | 205 | p.append("data"); 206 | p.append("languages"); 207 | p.append(pspecFilename); 208 | 209 | boost::filesystem::ofstream ofs(p); 210 | 211 | ofs << "# File autogenerated by Ghidra Processor Module Generator\n"; 212 | ofs << "# https://github.com/oberoisecurity/ghidra-processor-module-generator\n"; 213 | ofs << "\n"; 214 | 215 | // endianness and alignment 216 | ofs << "# TODO: Verify these\n"; 217 | ofs << "define endian=" << parsedData.endianness << ";\n"; 218 | ofs << "define alignment=" << parsedData.alignment << ";\n"; 219 | ofs << "\n"; 220 | 221 | // ram and register spaces 222 | ofs << "# TODO: Verify these\n"; 
223 | ofs << "define space ram type=ram_space size=4 wordsize=1 default;\n"; 224 | ofs << "define space register type=register_space size=4;\n"; 225 | ofs << "\n"; 226 | 227 | // define registers 228 | if(parsedData.registers.size() > 0) 229 | { 230 | ofs << "# TODO: Verify these\n"; 231 | ofs << "define register offset=0 size=4\n"; 232 | ofs << "[" << getOutputRegisters(parsedData) << "];\n"; 233 | ofs << "\n"; 234 | } 235 | 236 | // flags 237 | ofs << "# TODO: Add flags if needed\n"; 238 | ofs << "# ex. @define MY_FLAG\t\"my_reg[0,1]\"\n"; 239 | ofs << "\n"; 240 | 241 | // define token registers 242 | for(unsigned int i = 0; i < sizeof(parsedData.tokenInstructions)/sizeof(parsedData.tokenInstructions[0]); i++) 243 | { 244 | unsigned int opcodeBitSize = 0; 245 | 246 | if(parsedData.tokenInstructions[i].size() > 0) 247 | { 248 | opcodeBitSize = (i + 1) * 8; 249 | 250 | ofs << "# TODO: Simplify these where possible\n"; 251 | ofs << "# TODO: Combine signed immediates where it makes sense\n"; 252 | ofs << "define token instr" << opcodeBitSize; 253 | 254 | // TODO: make if statement here for VLA 255 | 256 | ofs << "(" << opcodeBitSize << ")\n"; 257 | ofs << getOutputTokenInstructions(parsedData.tokenInstructions[i]); 258 | ofs << ";\n"; 259 | ofs << "\n"; 260 | } 261 | } 262 | 263 | // attach variables 264 | if(parsedData.attachVariables.size() > 0) 265 | { 266 | ofs << "# TODO: Simplify these where possible\n"; 267 | ofs << getOutputAttachVariables(parsedData); 268 | ofs << "\n"; 269 | } 270 | 271 | // check if any instructions have duplicated registers 272 | // we need to zero for every slaspec file 273 | // or we can run into export statements that fail 274 | // to compile in the SLEIGH compiler 275 | parsedData.duplicatedRegisters.clear(); 276 | for(auto& combinedInstruction: parsedData.combinedInstructions) 277 | { 278 | // combinedInstruction.first = the opcode 279 | // combinedInstruction.second = pointer to the Instruction 280 | 
combinedInstruction.second->getInstructionDuplicatedRegisters(true, 281 | parsedData.duplicatedRegisters); 282 | } 283 | 284 | if(parsedData.duplicatedRegisters.size() > 0) 285 | { 286 | ofs << "# Duplicated registers" << endl; 287 | ofs << "# To workaround: https://github.com/NationalSecurityAgency/ghidra/issues/6874" << endl; 288 | ofs << getOutputDuplicateRegisters(parsedData); 289 | ofs << "\n"; 290 | } 291 | 292 | // 293 | // Instructions 294 | // 295 | ofs << "#\n"; 296 | ofs << "# Instructions\n"; 297 | ofs << "#\n\n"; 298 | 299 | ofs << "#\n"; 300 | ofs << "# Example Instruction:\n"; 301 | ofs << "#\n"; 302 | ofs << "# 1) # BBBBBAAAAAaaaaaaaaaaaaaa00000100\n"; 303 | ofs << "# 2) # addi r0,r0,0x0\n"; 304 | ofs << "# 3) #:addi regA_22_26,regB_27_31,imm_08_21 is regB_27_31 & regA_22_26 & imm_08_21 & opcode_00_05=0b000100\n"; 305 | ofs << "# 4) {}\n"; 306 | ofs << "#\n"; 307 | ofs << "# Line one is the opcode written in bits from MSB to LSB\n"; 308 | ofs << "# - 0 and 1s represent bits of the opcode that are required and cannot change\n"; 309 | ofs << "# - upper case letters represent registers\n"; 310 | ofs << "# - lower case letters represent immediate values\n"; 311 | ofs << "# Line two is an example decoding of the instruction if all registers and immediates are set to 0\n"; 312 | ofs << "# Line three is the SLEIGH encoded instruction\n"; 313 | ofs << "# Line four is the empty p-code implementation which must be completed for decompiler support\n"; 314 | ofs << "#\n\n"; 315 | 316 | // sorted instructions 317 | // string = the text of the instruction itself not the opcode 318 | map sortedCombinedInstructions; 319 | 320 | // sort the instructions 321 | for(auto combinedInstruction: parsedData.combinedInstructions ) 322 | { 323 | string instructionString; 324 | 325 | // combinedInstruction.first = the opcode 326 | // combinedInstruction.second = pointer to the Instruction 327 | instructionString = getOutputInstruction(combinedInstruction.second, 328 | 
parsedData); 329 | sortedCombinedInstructions.insert({{instructionString, 330 | combinedInstruction.second}}); 331 | } 332 | 333 | for(auto sortedCombinedInstruction: sortedCombinedInstructions) 334 | { 335 | string instruction = sortedCombinedInstruction.first; 336 | 337 | // escape forward slash 338 | boost::replace_all(instruction, "/", "_"); 339 | 340 | if(parsedData.omitOpcodes == false) 341 | { 342 | ofs << "# " << sortedCombinedInstruction.second->getOpcode() << "\n"; 343 | } 344 | 345 | if((parsedData.omitExampleInstructions == false) && 346 | (sortedCombinedInstruction.second->getCombined() == true)) 347 | { 348 | ofs << "# " << getOriginalOutputString(sortedCombinedInstruction.second, parsedData) << "\n"; 349 | } 350 | 351 | ofs << instruction << "\n"; 352 | ofs << "{}\n"; 353 | ofs << "\n"; 354 | } 355 | sortedCombinedInstructions.clear(); 356 | 357 | ofs.close(); 358 | return 0; 359 | } 360 | 361 | // Gets a list of all registers define register section of the processor module 362 | string getOutputRegisters(PARSED_DATA& parsedData) 363 | { 364 | string output; 365 | std::set::iterator it; 366 | 367 | for(it = parsedData.registers.begin(); 368 | it != parsedData.registers.end(); 369 | ++it) 370 | { 371 | if(it == parsedData.registers.begin()) 372 | { 373 | output += *it; 374 | } 375 | else 376 | { 377 | output += " " + *it; 378 | } 379 | } 380 | return output; 381 | } 382 | 383 | // Gets a list of instruction mnemonics found 384 | // only used for debugging purposes 385 | string getOutputMnemonics(PARSED_DATA& parsedData) 386 | { 387 | string output; 388 | std::set::iterator it; 389 | 390 | for(it = parsedData.mnemonics.begin(); 391 | it != parsedData.mnemonics.end(); 392 | ++it) 393 | { 394 | if(it == parsedData.mnemonics.begin()) 395 | { 396 | output += *it; 397 | } 398 | else 399 | { 400 | output += " " + *it; 401 | } 402 | } 403 | return output; 404 | } 405 | 406 | // Outputs a list of the define token instructions for the processor module 407 | // 
ex: 408 | // imm_00_00 = (0, 0) 409 | // simm_00_00 = (0, 0) signed 410 | // imm_00_03 = (0, 3) 411 | // opcode_00_03 = (0, 3) 412 | // opcode_00_04 = (0, 4) 413 | // regA_04_07 = (4, 7) 414 | // regA_05_05 = (5, 5) 415 | // regA_05_05_2 = (5, 5) 416 | string getOutputTokenInstructions(set& tokenInstructions) 417 | { 418 | string output = ""; 419 | std::set::iterator it; 420 | 421 | for (auto& token: tokenInstructions) 422 | { 423 | int start, end; 424 | vector result; 425 | 426 | //cout << "token: " << token << endl; 427 | 428 | boost::split(result, token, boost::is_any_of("_")); 429 | if(result.size() < 3) 430 | { 431 | cout << "Failed to split token!!\n"; 432 | return ""; 433 | } 434 | 435 | start = std::stoi(result[1]); 436 | end = std::stoi(result[2]); 437 | 438 | output += "\t" + token + " = (" + to_string(start) + ", " + to_string(end) + ")\n"; 439 | 440 | // if this was an immediate value, create a signed immediate as well 441 | // we do this because we can't tell the difference between an unsigned 442 | // immediate and a postive signed immediate 443 | if(token.find("imm_") != string::npos) 444 | { 445 | output += "\ts" + token + " = (" + to_string(start) + ", " + to_string(end) + ") signed\n"; 446 | } 447 | } 448 | 449 | return output; 450 | } 451 | 452 | // Outputs the processor module's attached variables field 453 | // There can be multiple attach variables for a single processor module 454 | // ex: attach variables [ regA_05_05 regC_05_05_2 regE_05_05_2 ] [ 455 | // sr vbr 456 | // ]; 457 | string getOutputAttachVariables(PARSED_DATA& parsedData) 458 | { 459 | std::set::iterator it; 460 | string output = ""; 461 | 462 | for (auto& x: parsedData.attachVariables) 463 | { 464 | // x.first = string of registers 465 | // x.second = set containing all register variables using x.first 466 | string registers; 467 | 468 | for(auto& y: x.second) 469 | { 470 | registers += y + " "; 471 | } 472 | 473 | output += "attach variables [ " + registers + "] [\n"; 474 | 
output += "\t " + x.first + "\n"; 475 | output += "];\n"; 476 | output += "\n"; 477 | } 478 | 479 | return output; 480 | } 481 | 482 | // Add an export statement in the form of: 483 | // a0_dup1: a0 is a0 { export a0; } 484 | // this is required to avoid duplicate registers 485 | string getOutputDuplicateRegisters(PARSED_DATA& parsedData) 486 | { 487 | string output; 488 | 489 | for (auto& x: parsedData.duplicatedRegisters) 490 | { 491 | string reg = x.first; 492 | unsigned int count = x.second; 493 | 494 | if(count <= 1) 495 | { 496 | // shouldn't ever get here 497 | continue; 498 | } 499 | 500 | for(unsigned int i = 1; i < count; i ++) 501 | { 502 | output += reg + "_dup" + std::to_string(i) + ": " + reg + " is " + reg + " {export " + reg + ";}\n"; 503 | } 504 | } 505 | 506 | return output; 507 | } 508 | 509 | // Takes an instruction and converts into SLEIGH format 510 | // example: ":mov rm_04_07, rn_08_11 is opcode_12_15=0b0110 & rn_08_11 & rm_04_07 & opcode_00_03=0b0011" 511 | string getOutputInstruction(Instruction* instruction, PARSED_DATA& parsedData) 512 | { 513 | string output; 514 | int index = 0; 515 | 516 | // instruction decorator 517 | output += ":"; 518 | 519 | output += instruction->getInstructionOutputString(true, true); 520 | 521 | output += " is "; 522 | 523 | index = convertOpcodeSizeToIndex(instruction->getOpcode().length()); 524 | if(index < 0) 525 | { 526 | cout << "Invalid opcode size!!" << endl; 527 | throw 1; 528 | } 529 | 530 | output += instruction->getOpcodeOutputString(parsedData.tokenInstructions[index]); 531 | 532 | return output; 533 | } 534 | 535 | // Takes a combined instruction and converts into SLEIGH format, removing the 536 | // combined pieces. 
It does this by converting all of the non-binary pieces of 537 | // the opcode into 0s 538 | // example: "mov r0, r1" 539 | string getOriginalOutputString(Instruction* instruction, PARSED_DATA& parsedData) 540 | { 541 | int result = 0; 542 | string disassembledString; 543 | 544 | // zeroize the combined opcode string 545 | string zeroizedOpcode = instruction->getOpcode(); 546 | for(unsigned int i = 0; i < zeroizedOpcode.length(); i++) 547 | { 548 | if(zeroizedOpcode[i] != '0' && zeroizedOpcode[i] != '1') 549 | { 550 | zeroizedOpcode[i] = '0'; 551 | } 552 | } 553 | 554 | result = disassembleOpcodeFromParsedData(parsedData, 555 | zeroizedOpcode, 556 | disassembledString); 557 | if(result != 0) 558 | { 559 | return ""; 560 | } 561 | 562 | return disassembledString; 563 | } 564 | 565 | int getOriginalOutputStringFromSla(PARSED_DATA& parsedData, 566 | string zeroizedOpcode, 567 | string& disassembledString) 568 | { 569 | int result = 0; 570 | 571 | // loop through all of the loaded .sla files attempting to disassemble 572 | // zeroizedOpcode 573 | // TODO: improve speed? 
574 | for(unsigned int i = 0; i < parsedData.slas.size(); i++) 575 | { 576 | result = parsedData.slas[i].getConstructorTextByBitPattern(zeroizedOpcode, 577 | disassembledString); 578 | if(result == 0) 579 | { 580 | // successfully found the string 581 | // cout << "Succeeded " << i << "\n" << disassembledString << endl; 582 | return 0; 583 | } 584 | } 585 | 586 | // not found in any of the loaded .sla files 587 | // cout << "Failed" << endl; 588 | return -1; 589 | } 590 | 591 | int disassembleOpcodeFromParsedData(PARSED_DATA& parsedData, 592 | string zeroizedOpcode, 593 | string& disassembledString) 594 | { 595 | int result = 0; 596 | 597 | // check through the allInstructions first 598 | auto itr = parsedData.allInstructions.find(zeroizedOpcode); 599 | if(itr != parsedData.allInstructions.end()) 600 | { 601 | disassembledString = itr->second->getInstructionOutputString(false, 602 | false); 603 | return 0; 604 | } 605 | 606 | // check through all of the sla files 607 | result = getOriginalOutputStringFromSla(parsedData, 608 | zeroizedOpcode, 609 | disassembledString); 610 | if(result == 0) 611 | { 612 | return 0; 613 | } 614 | 615 | cout << "Failed to find zeroized opcode!!" << endl; 616 | cout << zeroizedOpcode << endl; 617 | return -1; 618 | } 619 | 620 | // Wrapper function for creating the various files required for the processor 621 | // module. 
parsedData has already been filled out at this point 622 | int createProcessorModule(PARSED_DATA& parsedData, unsigned int fileId) 623 | { 624 | boost::timer::auto_cpu_timer t; 625 | int result = 0; 626 | 627 | cout << " [*] Creating Processor Directory Structure" << endl; 628 | result = createDirectoryStructure(parsedData); 629 | if(result != 0) 630 | { 631 | return result; 632 | } 633 | 634 | cout << " [*] Creating Module.manifest" << endl; 635 | result = createModuleManifest(parsedData); 636 | if(result != 0) 637 | { 638 | return result; 639 | } 640 | 641 | cout << " [*] Creating .cspec" << endl; 642 | result = createCspec(parsedData); 643 | if(result != 0) 644 | { 645 | return result; 646 | } 647 | 648 | cout << " [*] Creating .pspec" << endl; 649 | result = createPspec(parsedData); 650 | if(result != 0) 651 | { 652 | return result; 653 | } 654 | 655 | cout << " [*] Creating .slapec" << endl; 656 | result = createSlaspec(parsedData, fileId); 657 | if(result != 0) 658 | { 659 | return result; 660 | } 661 | 662 | return 0; 663 | } 664 | -------------------------------------------------------------------------------- /output.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: output.h 3 | // 4 | // Outputs the files that comprise the Ghidra processor module. 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //----------------------------------------------------------------------------- 9 | #pragma once 10 | 11 | #include "parser.h" 12 | using namespace std; 13 | 14 | int createProcessorModule(PARSED_DATA& parsedData, unsigned int fileId); 15 | int createDirectoryStructure(PARSED_DATA& parsedData); 16 | int createModuleManifest(PARSED_DATA& parsedData); 17 | int createPspec(PARSED_DATA& parsedData); 18 | int createCspec(PARSED_DATA& parsedData); 19 | int createLdefs(PARSED_DATA& parsedData); 20 | int createSlaspec(PARSED_DATA& parsedData, unsigned int fileId); 21 | 22 | string getOutputRegisters(PARSED_DATA& parsedData); 23 | string getOutputMnemonics(PARSED_DATA& parsedData); 24 | string getOutputTokenInstructions(set& tokenInstructions); 25 | string getOutputAttachVariables(PARSED_DATA& parsedData); 26 | string getOutputDuplicateRegisters(PARSED_DATA& parsedData); 27 | string getOutputInstruction(Instruction* instruction, PARSED_DATA& parserData); 28 | string getOriginalOutputString(Instruction* instruction, 29 | PARSED_DATA& parsedData); 30 | int getOriginalOutputStringFromSla(PARSED_DATA& parsedData, 31 | string zeroizedOpcode, 32 | string& disassembledString); 33 | int disassembleOpcodeFromParsedData(PARSED_DATA& parsedData, 34 | string zeroizedOpcode, 35 | string& disassembledString); 36 | -------------------------------------------------------------------------------- /parser.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: parser.cpp 3 | // 4 | // Parsing instructions from disassembly text file 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //----------------------------------------------------------------------------- 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "parser.h" 15 | #include "registers.h" 16 | #include "thread_pool.h" 17 | 18 | const boost::regex g_opcodeRegex{"0[xX][0-9a-fA-F]+"}; 19 | const boost::regex g_integerRegex{"\\d+"}; 20 | 21 | // used to track if we have a variable length instruction set 22 | static bool g_opcodeSize[4] = {false, false, false, false}; 23 | 24 | set g_allRegisters; 25 | extern const char* ALL_REGISTERS[]; 26 | 27 | static bool splitChar(char ch); 28 | static bool isCharWhiteSpace(char ch); 29 | static int splitDisassemblyLine(vector& lineSplit, const string& line); 30 | static void updateOpcodeSize(unsigned int opcodeSize); 31 | static bool hasVariableLengthOpcodes(void); 32 | 33 | static int parseInstructionsWorker(PARSED_DATA& parsedData, 34 | const char* buffer, 35 | unsigned long long start, 36 | unsigned long long end); 37 | static int parseInstructionsParser(PARSED_DATA& parsedData, 38 | unsigned int lineNum, 39 | string& line, 40 | set& registers, 41 | set& mnemonics, 42 | map& allInstructions); 43 | 44 | // helper to convert number of opcode bits to to index into tokenInstructions array 45 | int convertOpcodeSizeToIndex(unsigned int opcodeSizeInBits) 46 | { 47 | switch(opcodeSizeInBits) 48 | { 49 | // bits to index into tokenInsructions array 50 | case 8: 51 | return 0; 52 | case 16: 53 | return 1; 54 | case 24: 55 | return 2; 56 | case 32: 57 | return 3; 58 | default: 59 | cout << "[-] convertOpcodeSizeToIndex: Invalid opcode size (" << opcodeSizeInBits << ") specified!!" 
<< endl; 60 | throw 1; 61 | } 62 | 63 | // never get here, will throw in default case of switch statement 64 | return -1; 65 | } 66 | 67 | // check which opcode sizes we have seen during parsing 68 | static void updateOpcodeSize(unsigned int opcodeSizeInBits) 69 | { 70 | switch(opcodeSizeInBits) 71 | { 72 | case 8: 73 | g_opcodeSize[0] = true; 74 | break; 75 | case 16: 76 | g_opcodeSize[1] = true; 77 | break; 78 | case 24: 79 | g_opcodeSize[2] = true; 80 | break; 81 | case 32: 82 | g_opcodeSize[3] = true; 83 | break; 84 | default: 85 | cout << "[-] updateOpcodeSize: Invalid opcode size (" << opcodeSizeInBits << ") specified!!" << endl; 86 | break; 87 | } 88 | } 89 | 90 | // returns true if the parsed architecture has variable length opcodes 91 | // supported opcode lengths are 1-4 bytes 92 | static bool hasVariableLengthOpcodes(void) 93 | { 94 | unsigned int count = 0; 95 | 96 | for(unsigned int i = 0; i < sizeof(g_opcodeSize)/sizeof(g_opcodeSize[0]); i++) 97 | { 98 | if(g_opcodeSize[i] == true) 99 | { 100 | count++; 101 | } 102 | } 103 | 104 | if(count > 1) 105 | { 106 | return true; 107 | } 108 | 109 | return false; 110 | } 111 | 112 | // Load all the registers extracted from Ghidra into a set 113 | // When parsing the instructions this is how we will tell the difference 114 | // between an instruction mnemonic versus a register 115 | int initRegisters(void) 116 | { 117 | for(unsigned int i = 0; 118 | i < sizeof(ALL_REGISTERS)/sizeof(ALL_REGISTERS[0]); 119 | i++) 120 | { 121 | g_allRegisters.insert(ALL_REGISTERS[i]); 122 | } 123 | 124 | return 0; 125 | } 126 | 127 | // additionalRegisters is a list of additional registers specified by the user 128 | // at the command line or queried from the .sla file 129 | int addRegisters(vector& additionalRegisters) 130 | { 131 | for(auto additionalRegister : additionalRegisters) 132 | { 133 | g_allRegisters.insert(additionalRegister); 134 | } 135 | 136 | return 0; 137 | } 138 | 139 | // Returns true if the passed in string is 
a register. This is determined 140 | // seeing if it's in the g_allRegisters set 141 | bool isRegister(const string& str) 142 | { 143 | set::iterator it; 144 | 145 | // workaround when parsing .sla that contain register sets 146 | if(str == "__register_list__") 147 | { 148 | return true; 149 | } 150 | 151 | it = g_allRegisters.find(str); 152 | if(it == g_allRegisters.end()) 153 | { 154 | return false; 155 | } 156 | return true; 157 | } 158 | 159 | // Returns true if the passed in string is an opcode. We determine a string is 160 | // an opcode if it is a hex string beginning with 0x 161 | bool isOpcode(const string& str) 162 | { 163 | if(str.length() > 2) 164 | { 165 | if(str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) 166 | { 167 | return true; 168 | } 169 | } 170 | 171 | return false; 172 | 173 | // regex method was too slow 174 | // return boost::regex_match(str, g_opcodeRegex); 175 | } 176 | 177 | // returns true if the passed in string is an integer 178 | bool isInteger(const string& str) 179 | { 180 | if(str.length() >= 1) 181 | { 182 | if(str[0] >= '0' && str[0] <= '9') 183 | { 184 | return true; 185 | } 186 | } 187 | 188 | return false; 189 | 190 | // regex method was too slow 191 | //return boost::regex_match(str, g_integerRegex); 192 | } 193 | 194 | // an immediate is a hex string or decimal string 195 | bool isImmediate(const string& str) 196 | { 197 | // workaround when parsing .sla that contain register sets 198 | if(str == "__immediate_list__") 199 | { 200 | return true; 201 | } 202 | 203 | if(isOpcode(str) || isInteger(str)) 204 | { 205 | return true; 206 | } 207 | 208 | return false; 209 | } 210 | 211 | // TODO: comment 212 | static int parseInstructionsWorker(PARSED_DATA& parsedData, 213 | const char* buffer, 214 | unsigned long long start, 215 | unsigned long long end) 216 | { 217 | // to improve performance each thread has it's own copy of these data 218 | // structures that are merged together later 219 | set registers; 220 | set mnemonics; 
221 | map allInstructions; 222 | const char* bufferStart = NULL; 223 | 224 | // loop through the file portion line by line 225 | bufferStart = buffer + start; 226 | for(unsigned long long i = start; i <= end; i++) 227 | { 228 | if(buffer[i] == '\n') 229 | { 230 | int result = 0; 231 | unsigned long long len = 0; 232 | 233 | len = &buffer[i] - bufferStart; 234 | string line(bufferStart, len); 235 | 236 | // parse each line 237 | result = parseInstructionsParser(parsedData, 238 | 0, 239 | line, 240 | registers, 241 | mnemonics, 242 | allInstructions); 243 | if(result != 0) 244 | { 245 | goto ERROR_EXIT; 246 | } 247 | 248 | bufferStart = &buffer[i]; 249 | } 250 | } 251 | 252 | // merge the data back up 253 | parsedData.mnemonicsMutex.lock(); 254 | parsedData.mnemonics.merge(mnemonics); 255 | parsedData.mnemonicsMutex.unlock(); 256 | 257 | parsedData.registersMutex.lock(); 258 | parsedData.registers.merge(registers); 259 | parsedData.registersMutex.unlock(); 260 | 261 | parsedData.registersMutex.lock(); 262 | parsedData.allInstructions.merge(allInstructions); 263 | parsedData.registersMutex.unlock(); 264 | 265 | incrementWorkerCompletions(); 266 | return 0; 267 | 268 | ERROR_EXIT: 269 | incrementWorkerCompletions(); 270 | incrementWorkerFailures(); 271 | return -1; 272 | } 273 | 274 | // returns true if the character should be split 275 | // into it's own element 276 | static bool splitChar(char ch) 277 | { 278 | switch(ch) 279 | { 280 | case ',': 281 | case '@': 282 | case '(': 283 | case ')': 284 | case '[': 285 | case ']': 286 | case '{': 287 | case '}': 288 | case '+': 289 | case '-': 290 | case '#': 291 | case ' ': 292 | case '*': 293 | case '!': 294 | case '\t': 295 | case '\r': 296 | case '\n': 297 | return true; 298 | default: 299 | return false; 300 | } 301 | 302 | return false; 303 | } 304 | 305 | // returns true if the character is a whitespace char 306 | static bool isCharWhiteSpace(char ch) 307 | { 308 | switch(ch) 309 | { 310 | case ' ': 311 | case '\t': 
312 | case '\r': 313 | case '\n': 314 | return true; 315 | default: 316 | return false; 317 | } 318 | 319 | return false; 320 | } 321 | 322 | // splits a line of disassembly into a vector of strings 323 | static int splitDisassemblyLine(vector& lineSplit, const string& line) 324 | { 325 | string currSplit = ""; 326 | 327 | for(unsigned int i = 0; i < line.size(); i++) 328 | { 329 | bool shouldSplit = false; 330 | bool shouldSkip = false; 331 | 332 | shouldSplit = splitChar(line[i]); 333 | if(shouldSplit == true) 334 | { 335 | if(currSplit.size() > 0) 336 | { 337 | //cout << "currSplit: " << currSplit << endl; 338 | lineSplit.emplace_back(currSplit); 339 | currSplit = ""; 340 | } 341 | 342 | shouldSkip = isCharWhiteSpace(line[i]); 343 | if(shouldSkip == false) 344 | { 345 | // non-ws char, append to our vector 346 | lineSplit.emplace_back(std::string(1, line[i])); 347 | } 348 | } 349 | else 350 | { 351 | currSplit.push_back(line[i]); 352 | } 353 | } 354 | 355 | if(currSplit.size() > 0) 356 | { 357 | lineSplit.emplace_back(currSplit); 358 | } 359 | 360 | return 0; 361 | } 362 | 363 | 364 | // tokenizes the input instructions and appends them to the allInstructions set 365 | static int parseInstructionsParser(PARSED_DATA& parsedData, 366 | unsigned int lineNum, 367 | string& line, 368 | set& registers, 369 | set& mnemonics, 370 | map& allInstructions) 371 | { 372 | map::iterator itr; 373 | vector lineSplit; 374 | string opcode; 375 | int result = 0; 376 | 377 | Instruction* currInstruction = new Instruction(); 378 | if(currInstruction == NULL) 379 | { 380 | cout << "[-] Error line " << lineNum << ": Failed to allocate!!" << endl; 381 | goto ERROR_EXIT; 382 | } 383 | 384 | // We want to split these fillers from register values 385 | // TODO: improve performance here 386 | splitDisassemblyLine(lineSplit, line); 387 | 388 | // Our combining algorithm needs to be rewritten to support more than 26 389 | // tokens. 
For the time being bail 390 | if(lineSplit.size() > MAX_TOKENS) 391 | { 392 | cout << "[-] Error line " << lineNum << ": Line has more than MAX_TOKENS!!" << endl; 393 | cout << line << endl; 394 | throw 1; 395 | delete currInstruction; 396 | goto ERROR_EXIT; 397 | } 398 | 399 | // tokenize each line component and add it to the Instruction 400 | for(unsigned int i = 0; i < lineSplit.size(); i++) 401 | { 402 | if(i == 0) 403 | { 404 | unsigned int opcodeBitLength = 0; 405 | 406 | // the first element on the line must be the opcode 407 | result = isOpcode(lineSplit[i]); 408 | if(result != true) 409 | { 410 | cout << "[-] Error line " << lineNum << ": First field is not an hex opcode!!" << endl; 411 | cout << "[-] Got: " << lineSplit[i] << endl; 412 | delete currInstruction; 413 | goto ERROR_EXIT; 414 | } 415 | 416 | currInstruction->setOpcode(lineSplit[i]); 417 | 418 | // we need to keep track of the maximum bit length for the 419 | // combining stage 420 | opcodeBitLength = currInstruction->getOpcode().length(); 421 | updateOpcodeSize(opcodeBitLength); 422 | 423 | if(opcodeBitLength > parsedData.maxOpcodeBits) 424 | { 425 | parsedData.maxOpcodeBitsMutex.lock(); 426 | if(opcodeBitLength > parsedData.maxOpcodeBits) 427 | { 428 | cout << " [*] Updating bit length from " << parsedData.maxOpcodeBits << " to " << opcodeBitLength << endl; 429 | parsedData.maxOpcodeBits = opcodeBitLength; 430 | } 431 | parsedData.maxOpcodeBitsMutex.unlock(); 432 | } 433 | } 434 | else 435 | { 436 | InstructionComponentType currType; 437 | 438 | // all remaining elements on the line are components of the 439 | // instruction 440 | if(isRegister(lineSplit[i])) 441 | { 442 | currType = TYPE_REGISTER; 443 | registers.insert(lineSplit[i]); 444 | 445 | } 446 | else if(isImmediate(lineSplit[i])) 447 | { 448 | currType = TYPE_IMMEDIATE; 449 | } 450 | else 451 | { 452 | mnemonics.insert(lineSplit[i]); 453 | currType = TYPE_INSTRUCTION; 454 | } 455 | 456 | currInstruction->addComponent(currType, 
lineSplit[i]); 457 | } 458 | } // for (int i = 0; i < lineSplit.size(); i++) 459 | 460 | // sanity check the instruction 461 | result = currInstruction->validateInstruction(); 462 | if(result != true) 463 | { 464 | cout << "[-] Error line " << lineNum << ": Instruction is invalid!!" << endl; 465 | delete currInstruction; 466 | goto ERROR_EXIT; 467 | } 468 | 469 | opcode = currInstruction->getOpcode(); 470 | 471 | // check for duplicate instructions before inserting 472 | itr = allInstructions.find(opcode); 473 | if(itr != allInstructions.end()) 474 | { 475 | cout << "[-] Error line " << lineNum << ": Found duplicate opcode!!" << endl; 476 | delete currInstruction; 477 | goto ERROR_EXIT; 478 | } 479 | 480 | // everything is good, insert instruction into our set 481 | allInstructions.insert({{std::move(opcode), currInstruction}}); 482 | 483 | return 0; 484 | 485 | ERROR_EXIT: 486 | return -1; 487 | } 488 | 489 | // tokenizes the input instructions and appends them to the allInstructions set 490 | int parseInstructions(PARSED_DATA& parsedData, unsigned int fileId) 491 | { 492 | boost::timer::auto_cpu_timer t; 493 | boost::asio::thread_pool threadPool(parsedData.numThreads); 494 | unsigned int portion = 0; 495 | unsigned long long fileSize = 0; 496 | char* fileBuffer = NULL; 497 | unsigned long long portionSize = 0; 498 | unsigned long long start = 0; 499 | 500 | // sanity check thread value 501 | if(parsedData.numThreads == 0) 502 | { 503 | cout << "[-] numThreads cannot be 0" << endl; 504 | return -1; 505 | } 506 | 507 | resetThreadPool(); 508 | 509 | // TODO: review exit error flow 510 | // TODO: why pass in fileId? 511 | 512 | // open the input file for parsing 513 | boost::filesystem::path infile{parsedData.inputFilenames[fileId]}; 514 | boost::filesystem::ifstream ifs{infile, std::ios::ate}; 515 | 516 | if(!ifs) 517 | { 518 | cout << "[-] Failed to open input file!!" 
<< endl; 519 | return -1; 520 | } 521 | 522 | // get the file size 523 | fileSize = ifs.tellg(); 524 | ifs.seekg(0, std::ios::beg); 525 | 526 | // TODO: this throws 527 | fileBuffer = new char[fileSize]; 528 | if(!fileBuffer) 529 | { 530 | cout << "[-] Failed to allocate buffer!!" << endl; 531 | return -1; 532 | } 533 | 534 | ifs.read(fileBuffer, fileSize); 535 | ifs.close(); 536 | 537 | // 538 | // split the disassembly into 1/num threads pieces 539 | // 540 | portionSize = fileSize/parsedData.numThreads; 541 | 542 | for(unsigned int i = 0; i < parsedData.numThreads; i++) 543 | { 544 | unsigned long long end = 0; 545 | 546 | if(start >= fileSize) 547 | { 548 | cout << "Reached end of file " << endl; 549 | continue; 550 | } 551 | 552 | if(i == parsedData.numThreads - 1) 553 | { 554 | // last thread, always set end to fileSize 555 | end = fileSize - 1; 556 | } 557 | else 558 | { 559 | end = start + portionSize; 560 | for(unsigned long long j = end; j < fileSize; j++) 561 | { 562 | if(fileBuffer[j] == '\n') 563 | { 564 | end = j; 565 | break; 566 | } 567 | } 568 | } 569 | 570 | // queue a worker to work on 1/n of the disassembly 571 | boost::asio::post(threadPool, 572 | boost::bind(parseInstructionsWorker, 573 | boost::ref(parsedData), 574 | fileBuffer, 575 | start, 576 | end)); 577 | start = end + 1; 578 | } 579 | 580 | // TODO: improve poll logic 581 | while(1) 582 | { 583 | boost::this_thread::sleep(boost::posix_time::milliseconds(100)); 584 | 585 | unsigned int completedCount = getWorkerCompletions(); 586 | unsigned int failCount = getWorkerFailures(); 587 | 588 | //cout << "Test cases: " << completed_count << "/" << lineNum << " Fail cases: " << fail_count << endl; 589 | 590 | // check if we exceeded our max number of failures 591 | if(failCount > 0) 592 | { 593 | // abort the rest of the threads 594 | threadPool.stop(); 595 | break; 596 | } 597 | 598 | // check if we finished our submitted jobs 599 | if(completedCount >= portion) 600 | { 601 | // finished 602 | 
break; 603 | } 604 | } 605 | 606 | threadPool.join(); 607 | 608 | delete [] fileBuffer; 609 | 610 | if(getWorkerFailures() > 0) 611 | { 612 | return -1; 613 | } 614 | 615 | // Copy the instructions into the combined instructions set. We need to 616 | // save the original allInstructions to recreate the registers lists when 617 | // we print out the instructions 618 | parsedData.combinedInstructions = parsedData.allInstructions; 619 | 620 | // check if we have a variable length opcodes 621 | parsedData.variableLengthISA = hasVariableLengthOpcodes(); 622 | return 0; 623 | } 624 | 625 | // Walks through all instructions that have combined registers and figures out 626 | // the register list and register variable name and appends them to 627 | // registerVariables. Once registerVariables is filled out attachVariables is 628 | // filled out 629 | void computeAttachVariables(PARSED_DATA& parsedData) 630 | { 631 | boost::timer::auto_cpu_timer t; 632 | std::set::iterator it; 633 | 634 | // iterate through all combined instructions and update registerVariables 635 | for(auto& x: parsedData.combinedInstructions) 636 | { 637 | x.second->computeAttachVariables(parsedData.allInstructions, 638 | parsedData.registerVariables, 639 | parsedData.slas); 640 | } 641 | 642 | for(auto& y: parsedData.registerVariables) 643 | { 644 | // y.second = string consisting all delimited by space 645 | // y.first = register variable name 646 | parsedData.attachVariables[y.second].insert(y.first); 647 | } 648 | return; 649 | } 650 | 651 | // TODO: wrong comment 652 | // Walks through all instructions that have combined registers and figures out 653 | // the register list and register variable name and appends them to 654 | // registerVariables. Once registerVariables is filled out attachVariables is 655 | // filled out. 
656 | void computeTokenInstructions(PARSED_DATA& parsedData) 657 | { 658 | boost::timer::auto_cpu_timer t; 659 | std::set::iterator it; 660 | 661 | // iterate through all combined instructions. getOpcodeOutputString() will 662 | // append new tokens to the tokenInstructions set 663 | for(auto& x: parsedData.combinedInstructions) 664 | { 665 | int index = convertOpcodeSizeToIndex(x.first.length()); 666 | if(index < 0) 667 | { 668 | cout << "Invalid opcode size!!" << endl; 669 | throw 1; 670 | } 671 | 672 | x.second->getOpcodeOutputString(parsedData.tokenInstructions[index]); 673 | } 674 | 675 | return; 676 | } 677 | 678 | // worker that deletes the instruction from parsedData.allinstructions 679 | int clearParserWorker(PARSED_DATA& parsedData, 680 | unsigned long long start, 681 | unsigned long long end) 682 | { 683 | map::iterator startItr; 684 | map::iterator endItr; 685 | 686 | startItr = parsedData.allInstructions.begin(); 687 | 688 | std::advance(startItr, start); 689 | endItr = startItr; 690 | std::advance(endItr, end-start + 1); 691 | 692 | for(; startItr != endItr; startItr++) 693 | { 694 | delete startItr->second; 695 | } 696 | 697 | incrementWorkerCompletions(); 698 | return 0; 699 | } 700 | 701 | // splits the instructions to be deleted onto a thread pool 702 | int clearParserScheduler(PARSED_DATA& parsedData) 703 | { 704 | boost::asio::thread_pool threadPool(parsedData.numThreads); 705 | unsigned int portion = 0; 706 | unsigned long long numInstructions = 0; 707 | unsigned long long portionSize = 0; 708 | unsigned long long start = 0; 709 | 710 | // sanity check thread value 711 | if(parsedData.numThreads == 0) 712 | { 713 | cout << "[-] numThreads cannot be 0" << endl; 714 | return -1; 715 | } 716 | 717 | resetThreadPool(); 718 | 719 | // 720 | // split freeing the instructions into 1/num threads pieces 721 | // 722 | numInstructions = parsedData.allInstructions.size(); 723 | if(numInstructions < 1024) 724 | { 725 | parsedData.numThreads = 1; 726 | 
portionSize = numInstructions; 727 | } 728 | else 729 | { 730 | portionSize = numInstructions/parsedData.numThreads; 731 | } 732 | 733 | for(unsigned int i = 0; i < parsedData.numThreads; i++) 734 | { 735 | unsigned long long end = 0; 736 | 737 | if(i == parsedData.numThreads - 1) 738 | { 739 | // last thread, always set end to fileSize 740 | end = numInstructions - 1; 741 | } 742 | else 743 | { 744 | end = start + portionSize; 745 | } 746 | 747 | // queue a worker to work on 1/n of the disassembly 748 | boost::asio::post(threadPool, 749 | boost::bind(clearParserWorker, 750 | boost::ref(parsedData), 751 | start, 752 | end)); 753 | start = end + 1; 754 | portion++; 755 | } 756 | 757 | // TODO: improve poll logic 758 | while(1) 759 | { 760 | boost::this_thread::sleep(boost::posix_time::milliseconds(100)); 761 | unsigned int completedCount = getWorkerCompletions(); 762 | 763 | // check if we finished our submitted jobs 764 | if(completedCount >= portion) 765 | { 766 | // finished 767 | break; 768 | } 769 | } 770 | 771 | threadPool.join(); 772 | return 0; 773 | } 774 | 775 | // free the parser data structure 776 | void clearParserData(PARSED_DATA& parsedData, bool save_registers) 777 | { 778 | boost::timer::auto_cpu_timer t; 779 | cout << "[*] Freeing parser data" << endl; 780 | 781 | // free any instructions that we allocated during the combine phase 782 | for(auto& y: parsedData.combinedInstructions) 783 | { 784 | if(y.second->getNeedsFree() == true) 785 | { 786 | delete y.second; 787 | } 788 | } 789 | 790 | // multithreaded free the allInstructions map 791 | // this is a perf bottleneck, even after multithreading 792 | if(parsedData.allInstructions.size() > 0) 793 | { 794 | clearParserScheduler(parsedData); 795 | parsedData.allInstructions.clear(); 796 | } 797 | 798 | // no other data structures allocated Instructions 799 | parsedData.combinedInstructions.clear(); 800 | parsedData.registerVariables.clear(); 801 | parsedData.attachVariables.clear(); 802 | 803 | 
for(unsigned int i = 0; i < sizeof(parsedData.tokenInstructions)/sizeof(parsedData.tokenInstructions[0]); i++) 804 | { 805 | parsedData.tokenInstructions[i].clear(); 806 | } 807 | 808 | if(!save_registers) 809 | { 810 | parsedData.registers.clear(); 811 | parsedData.mnemonics.clear(); 812 | } 813 | } 814 | -------------------------------------------------------------------------------- /parser.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: parser.h 3 | // 4 | // Parsing instructions from disassembly text file 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "instruction.h" 18 | #include "slautil/slautil.h" 19 | using namespace std; 20 | 21 | // if there are too many tokens we have to change our bit naming algorithm 22 | #define MAX_TOKENS 26 23 | 24 | enum COMBINE_TYPE 25 | { 26 | COMBINE_DUPLICATES = 0, // instructions are identical except for a single 27 | // bit in the opcode 28 | COMBINE_IMMEDIATES = 1, // instructions are identical except for a single 29 | // bit in the opcode and a single immediate field 30 | COMBINE_REGISTERS = 2, // instructions are identical except for a single 31 | // bit in the opcode and a single register field 32 | COMBINE_MAX = 3, 33 | }; 34 | 35 | // data we parsed from the instruction set 36 | // we need this to create our output 37 | typedef struct _PARSED_DATA 38 | { 39 | // all instructions parsed. Instruction* was allocated by new and must be 40 | // deleted. 
41 | // string = the instruction opcode as a text string of 0s and 1s 42 | map allInstructions; 43 | 44 | // synchronize access to allinstructions map 45 | boost::mutex allInstructionsMutex; 46 | 47 | // combined instructions (e.g. merge duplicates, registers, immediates, 48 | // etc). 49 | // Shallow copy from allInstructions to start with and for that reason we 50 | // should not call delete on instructions. 51 | // string = the combined instrution opcode as a text string 0s, 1s, *s, 52 | // capital letters (for registers), and lower case registers (for 53 | // immediates) 54 | map combinedInstructions; 55 | 56 | // all registers seen while parsing the instruction set 57 | set registers; 58 | 59 | // synchronize access to registers set 60 | boost::mutex registersMutex; 61 | 62 | // all instruction mnemonics seen while parsing the instruction set 63 | // only used for debugging with --print-registers-only option 64 | set mnemonics; 65 | 66 | // synchronize access to mnemonics set 67 | boost::mutex mnemonicsMutex; 68 | 69 | // number of bits for the biggest instruction opcode parsed 70 | unsigned int maxOpcodeBits; 71 | 72 | // synchronize access tot he maxOpcodeBits 73 | boost::mutex maxOpcodeBitsMutex; 74 | 75 | // set to true if the architecture has variable length instructions 76 | bool variableLengthISA; 77 | 78 | // 79 | // Output datas 80 | // 81 | 82 | // registerVariables and attachVariables are used for outputting the 83 | // "attach variables" field in the output 84 | // key = register variable name. Ex "regA_04_07" 85 | // value = string of space delimited registers. Ex: "r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15" 86 | map registerVariables; 87 | 88 | // can be thought of as an inverse registerVariables where we group all 89 | // register variables that have the same list of registers 90 | // key = string of space delimted registers. 
Ex: "r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15" 91 | // value = set of all register variable names that have the same list of registers. Ex "regA_10_10", "regC_10_10", "regE_10_10" 92 | map> attachVariables; 93 | 94 | // used for outputting the "define token instr pieces" 95 | // to support variable length architectectures: 96 | // - [0] - 1 byte instructions 97 | // - [0] - 2 byte instructions 98 | // - [0] - 3 byte instructions 99 | // - [0] - 4 byte instructions 100 | set tokenInstructions[4]; 101 | 102 | // used for outtputing the "duplicated registers" export section 103 | map duplicatedRegisters; 104 | 105 | // 106 | // command line options, we need some of these for our output 107 | // 108 | 109 | // Path to file(s) for parsing 110 | vector inputFilenames; 111 | 112 | // list of loaded .sla files 113 | // needed similar to the allInstructions map for generating register attach 114 | // directives 115 | vector slas; 116 | 117 | // endianess of the instruction set. Can be either "little" or "big". 118 | // Needed in the output files 119 | string endianness; 120 | 121 | // name of the processor 122 | string processorName; 123 | 124 | // family of the processor 125 | string processorFamily; 126 | 127 | // alignment of the instruction set 128 | unsigned int alignment; 129 | 130 | // bitness of the instruction set 131 | unsigned int bitness; 132 | 133 | // whether or not to display opcodes as comments in the outputted .sla file 134 | // useful for debugging 135 | bool omitOpcodes; 136 | 137 | // whether or not to display an example combined instruction as comments in 138 | // the outputted .sla file. 
useful for debugging 139 | bool omitExampleInstructions; 140 | 141 | // number of threads to use for each thread pool 142 | // defaults to number of physical CPUs by default 143 | unsigned int numThreads; 144 | 145 | } PARSED_DATA, *PPARSED_DATA; 146 | 147 | int initRegisters(void); 148 | int addRegisters(vector& additionalRegisters); 149 | bool isOpcode(const string& str); 150 | bool isInteger(const string &str); 151 | bool isImmediate(const string& str); 152 | bool isRegister(const string& str); 153 | int parseInstructions(PARSED_DATA& parsedData, unsigned int fileId); 154 | void computeAttachVariables(PARSED_DATA& parsedData); 155 | void computeTokenInstructions(PARSED_DATA& parsedData); 156 | void clearParserData(PARSED_DATA& parsedData, bool save_registers); 157 | int convertOpcodeSizeToIndex(unsigned int opcodeSizeInBits); 158 | -------------------------------------------------------------------------------- /parser_sla.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: parser_sla.cpp 3 | // 4 | // Parsing and combining the instructions from .sla 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //----------------------------------------------------------------------------- 9 | #include "slautil/slautil.h" 10 | #include "parser.h" 11 | 12 | // Tokenizes the input instructions from the .sla and appends them to the 13 | // allInstructions set 14 | int parseInstructionsSla(PARSED_DATA& parsedData, unsigned int fileId) 15 | { 16 | Slautil slautil; 17 | vector registers; 18 | unsigned int count = 0; 19 | int result = 0; 20 | 21 | result = slautil.loadSla(parsedData.inputFilenames[fileId]); 22 | if(result != 0) 23 | { 24 | return result; 25 | } 26 | 27 | result = slautil.getConstructorCount(count); 28 | if(result != 0) 29 | { 30 | cout << "Failed to get constructor count" << endl; 31 | return result; 32 | } 33 | 34 | result = slautil.getRegisters(registers); 35 | if(result != 0) 36 | { 37 | cout << "Failed to get sla registers" << endl; 38 | return result; 39 | } 40 | 41 | result = addRegisters(registers); 42 | if(result != 0) 43 | { 44 | cout << "Failed to add sla registers" << endl; 45 | } 46 | 47 | for(unsigned int j = 0; j < registers.size(); j++) 48 | { 49 | parsedData.registers.insert(registers[j]); 50 | } 51 | 52 | for(unsigned int i = 0; i < count; i++) 53 | { 54 | string bit_pattern; 55 | string constructor_text; 56 | string line; 57 | vector lineSplit; 58 | Instruction* currInstruction = NULL; 59 | bool isCombined = false; 60 | map::iterator itr; 61 | 62 | result = slautil.getConstructorBitPattern(i, bit_pattern); 63 | if(result != 0) 64 | { 65 | cout << "Failed to get bit pattern" << endl; 66 | return result; 67 | } 68 | 69 | result = slautil.getConstructorText(i, constructor_text); 70 | if(result != 0) 71 | { 72 | cout << "Failed to get constructor text" << endl; 73 | return result; 74 | } 75 | 76 | line = bit_pattern + " " + constructor_text; 77 | 78 | // We want to split these fillers from register values 79 | // The simplest way I could come up was to do this but it's slow... 
80 | // BUGBUG: improve performance here 81 | // TODO; replace with other impl 82 | boost::replace_all(line, ",", " , "); 83 | boost::replace_all(line, "@", " @ "); 84 | boost::replace_all(line, "(", " ( "); 85 | boost::replace_all(line, ")", " ) "); 86 | boost::replace_all(line, "[", " [ "); 87 | boost::replace_all(line, "]", " ] "); 88 | boost::replace_all(line, "+", " + "); 89 | boost::replace_all(line, "-", " - "); 90 | boost::replace_all(line, "#", " # "); 91 | boost::replace_all(line, "_DUP", ""); // TODO: hack to workaround not being able to have duplicate 92 | // registers in a single instruction 93 | boost::trim(line); 94 | 95 | // split the line into components 96 | boost::split(lineSplit, line, boost::algorithm::is_space(), boost::token_compress_on); 97 | 98 | // Our combining algorithm needs to be rewritten to support more than 99 | // 26 tokens. For the time being bail 100 | if(lineSplit.size() > MAX_TOKENS) 101 | { 102 | cout << "[-] Error constructor " << i << ": Line has more than MAX_TOKENS!!" << endl; 103 | return -1; 104 | } 105 | 106 | currInstruction = new Instruction(); 107 | if(currInstruction == NULL) 108 | { 109 | cout << "[-] Error constructur " << i << ": Failed to allocate!!" 
<< endl; 110 | return -1; 111 | } 112 | 113 | // tokenize each line component and add it to the Instruction 114 | for (unsigned int i = 0; i < lineSplit.size(); i++) 115 | { 116 | if(i == 0) 117 | { 118 | unsigned int opcodeBitLength = 0; 119 | 120 | currInstruction->setOpcodeBitString(lineSplit[i]); 121 | 122 | // we need to keep track of the maximum bit length for the 123 | // combining stage 124 | opcodeBitLength = currInstruction->getOpcode().length(); 125 | if(opcodeBitLength > parsedData.maxOpcodeBits) 126 | { 127 | //cout << "Updating bit length from " << parsedData.maxOpcodeBits << " to " << opcodeBitLength << endl; 128 | parsedData.maxOpcodeBits = opcodeBitLength; 129 | } 130 | } 131 | else 132 | { 133 | InstructionComponentType currType; 134 | 135 | if (lineSplit[i].find("_DUP") != std::string::npos) 136 | { 137 | std::cout << "found! " << lineSplit[i] << endl; 138 | throw 1; 139 | } 140 | 141 | // all remaining elements on the line are components of the 142 | // instruction 143 | if(isRegister(lineSplit[i])) 144 | { 145 | currType = TYPE_REGISTER; 146 | 147 | if(lineSplit[i] == "__register_list__") 148 | { 149 | currInstruction->setCombined(true); 150 | isCombined = true; 151 | } 152 | else 153 | { 154 | parsedData.registers.insert(lineSplit[i]); 155 | } 156 | } 157 | else if(isImmediate(lineSplit[i])) 158 | { 159 | currType = TYPE_IMMEDIATE; 160 | 161 | if(lineSplit[i] == "__immediate_list__") 162 | { 163 | currInstruction->setCombined(true); 164 | isCombined = true; 165 | } 166 | } 167 | else 168 | { 169 | currType = TYPE_INSTRUCTION; 170 | } 171 | 172 | currInstruction->addComponent(currType, 173 | lineSplit[i], 174 | isCombined); 175 | } 176 | } // for (int i = 0; i < lineSplit.size(); i++) 177 | 178 | // sanity check the instruction 179 | result = currInstruction->validateInstruction(); 180 | if(result != true) 181 | { 182 | cout << "[-] Error line " << i << ": Instruction is invalid!!" 
<< endl; 183 | delete currInstruction; 184 | return -1; 185 | } 186 | 187 | // check for duplicate instructions before inserting 188 | itr = parsedData.allInstructions.find(currInstruction->getOpcode()); 189 | if(itr != parsedData.allInstructions.end()) 190 | { 191 | cout << "[-] Error line " << i << ": Found duplicate opcode!!" << endl; 192 | delete currInstruction; 193 | return -1; 194 | } 195 | 196 | // everything is good, insert instruction into our set 197 | parsedData.allInstructions.insert({{currInstruction->getOpcode(), 198 | currInstruction}}); 199 | 200 | } // for(unsigned int i = 0; i < count; i++) 201 | 202 | // Copy the instructions into the combined instructions set 203 | // We need to save the original allInstructions to recreate the registers 204 | // lists when we print out the instructions 205 | parsedData.combinedInstructions.merge(parsedData.allInstructions); 206 | 207 | parsedData.slas.push_back(slautil); 208 | return 0; 209 | } 210 | -------------------------------------------------------------------------------- /parser_sla.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: parser_sla.h 3 | // 4 | // Parsing and combining the instructions from .sla 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //----------------------------------------------------------------------------- 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "instruction.h" 17 | using namespace std; 18 | 19 | int parseInstructionsSla(PARSED_DATA& parsedData, unsigned int fileId); 20 | -------------------------------------------------------------------------------- /slautil/slautil.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: slautil.cpp 3 | // 4 | // Misc helper functions for working with .sla files 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "slautil.h" 16 | 17 | using namespace std; 18 | namespace pt = boost::property_tree; 19 | 20 | // sorting bit_patterns by start_bit 21 | struct less_than_key 22 | { 23 | inline bool operator() (const BIT_PATTERN& a, const BIT_PATTERN& b) 24 | { 25 | return (b.start_bit < a.start_bit); 26 | } 27 | }; 28 | 29 | // default constructor 30 | Slautil::Slautil(void) 31 | { 32 | m_initialized = false; 33 | } 34 | 35 | // load the processor module file 36 | // currently only supports XML .sla files 37 | int Slautil::loadSla(const string& filename) 38 | { 39 | int status = 0; 40 | 41 | status = this->loadSlaXML(filename); 42 | if(status != SLA_SUCCESS) 43 | { 44 | return status; 45 | } 46 | 47 | m_initialized = true; 48 | return SLA_SUCCESS; 49 | } 50 | 51 | // return the registers from the processor module 52 | int Slautil::getRegisters(vector& registers) 53 | { 54 | if(!m_initialized) 55 | { 56 | return NOT_INITIALIZED; 57 | } 58 | 59 | registers.reserve(m_registers.size()); 60 | std::copy(m_registers.begin(), 61 
| m_registers.end(), 62 | std::back_inserter(registers)); 63 | 64 | return SLA_SUCCESS; 65 | } 66 | 67 | // get the number of instructions (constructors) in the processor module 68 | int Slautil::getConstructorCount(unsigned int& count) 69 | { 70 | if(!m_initialized) 71 | { 72 | return NOT_INITIALIZED; 73 | } 74 | 75 | count = m_constructors.size(); 76 | return SLA_SUCCESS; 77 | } 78 | 79 | // generate the opcode bit patterns for immediates and registers 80 | int Slautil::addNonOpcodeBitPatterns(void) 81 | { 82 | for(unsigned int i = 0; i < m_constructors.size(); i++) 83 | { 84 | unsigned int num_immediates = 0; 85 | unsigned int num_registers = 0; 86 | 87 | PCONSTRUCTOR curr_constructor = &m_constructors[i]; 88 | 89 | // TODO; cleanup function 90 | 91 | for(unsigned int j = 0; 92 | j < curr_constructor->constructor_pieces.size(); 93 | j++) 94 | { 95 | PCONSTRUCTOR_PIECE curr_constructor_piece = NULL; 96 | 97 | curr_constructor_piece = &curr_constructor->constructor_pieces[j]; 98 | 99 | if(curr_constructor_piece->type == "opprint") 100 | { 101 | boost::unordered_map ::iterator itr; 102 | boost::unordered_map ::iterator itr2; 103 | 104 | //cout << curr_constructor_piece->id << endl; 105 | 106 | itr = m_varlist_syms.find(curr_constructor_piece->id); 107 | if(itr == m_varlist_syms.end()) 108 | { 109 | itr2 = m_operand_syms.find(curr_constructor_piece->id); 110 | if(itr2 == m_operand_syms.end()) 111 | { 112 | boost::unordered_map ::iterator itr3; 113 | 114 | itr3 = m_vars.find(curr_constructor_piece->id); 115 | if(itr3 == m_vars.end()) 116 | { 117 | cout << "Failed to find " << curr_constructor_piece->id << endl; 118 | throw 1; 119 | continue; 120 | } 121 | 122 | if(std::find(m_registers.begin(), m_registers.end(), itr3->second) != m_registers.end()) 123 | { 124 | num_registers++; 125 | continue; 126 | } 127 | else 128 | { 129 | cout << "What the heck should never get here!! 
" << itr3->second << endl; 130 | throw 4; 131 | } 132 | 133 | throw 1; 134 | continue; 135 | } 136 | 137 | // found operand_syms 138 | addBitPattern(m_constructors[i], 139 | itr2->second.bitfield, 140 | "imm", 141 | num_immediates); 142 | num_immediates++; 143 | continue; 144 | } 145 | 146 | addBitPattern(m_constructors[i], 147 | itr->second.bitfield, 148 | "reg", 149 | num_registers); 150 | num_registers++; 151 | continue; 152 | } 153 | } // for(unsigned int j = 0; j < curr_constructor->constructor_pieces.size(); j++) 154 | 155 | // sort the bit patterns 156 | std::sort(curr_constructor->bit_patterns.begin(), 157 | curr_constructor->bit_patterns.end(), 158 | less_than_key()); 159 | } 160 | 161 | return SLA_SUCCESS; 162 | } 163 | 164 | // adds a bit pattern to a constructor 165 | int Slautil::addBitPattern(CONSTRUCTOR& curr_constructor, 166 | const TOKENFIELD& bitfield, 167 | const string& type, 168 | unsigned int count) 169 | { 170 | BIT_PATTERN curr_bit_pattern; 171 | unsigned char patternChar = '\x0'; 172 | 173 | if(count >= 25) 174 | { 175 | return -1; 176 | } 177 | 178 | curr_bit_pattern.start_bit = bitfield.startbit; 179 | curr_bit_pattern.end_bit = bitfield.endbit; 180 | curr_bit_pattern.pattern_type = type; 181 | 182 | if(type == "imm") 183 | { 184 | patternChar = 'a' + count; 185 | } 186 | else if(type == "reg") 187 | { 188 | patternChar = 'A' + count; 189 | } 190 | else 191 | { 192 | patternChar = '?'; 193 | } 194 | 195 | for(unsigned int i = curr_bit_pattern.start_bit; 196 | i <= curr_bit_pattern.end_bit; 197 | i++) 198 | { 199 | curr_bit_pattern.pattern += patternChar; 200 | } 201 | 202 | curr_constructor.bit_patterns.push_back(curr_bit_pattern); 203 | 204 | return SLA_SUCCESS; 205 | } 206 | 207 | // get the opcode bit pattern given an constructor id 208 | int Slautil::getConstructorBitPattern(unsigned int id, string& bit_pattern) 209 | { 210 | PCONSTRUCTOR curr_constructor = NULL; 211 | unsigned int size = 0; 212 | 213 | if(id >= m_constructors.size()) 
214 | { 215 | cout << "Bad ID!!" << endl; 216 | return -2; 217 | } 218 | 219 | curr_constructor = &m_constructors[id]; 220 | bit_pattern = ""; 221 | 222 | for(unsigned int k = 0; k < curr_constructor->bit_patterns.size(); k++) 223 | { 224 | PBIT_PATTERN curr_bit_pattern = &curr_constructor->bit_patterns[k]; 225 | 226 | //bit_pattern += curr_bit_pattern->pattern_type + "_" + to_string(curr_bit_pattern->start_bit) + "_" + to_string(curr_bit_pattern->end_bit) + "="; 227 | 228 | if(curr_bit_pattern->pattern_type == "opcode") 229 | { 230 | bit_pattern += curr_constructor->bit_patterns[k].pattern; 231 | } 232 | else if(curr_bit_pattern->pattern_type == "reg") 233 | { 234 | size = curr_bit_pattern->end_bit - 235 | curr_bit_pattern->start_bit + 1; 236 | bit_pattern += string(size, curr_bit_pattern->pattern[0]); 237 | } 238 | else if(curr_bit_pattern->pattern_type == "imm") 239 | { 240 | size = curr_bit_pattern->end_bit - 241 | curr_bit_pattern->start_bit + 1; 242 | bit_pattern += string(size, curr_bit_pattern->pattern[0]); 243 | } 244 | } 245 | 246 | // sanity check the bit pattern size 247 | if(bit_pattern.size() == 0) 248 | { 249 | return -1; 250 | } 251 | 252 | return SLA_SUCCESS; 253 | } 254 | 255 | // get the instruction mnemonic given a constructor id 256 | int Slautil::getConstructorText(unsigned int id, string& constructor_text) 257 | { 258 | string unused; 259 | return getConstructorText(id, constructor_text, false, unused); 260 | } 261 | 262 | // get the instruction mnemonic given a constructor id 263 | int Slautil::getConstructorText(unsigned int id, 264 | string& constructor_text, 265 | bool use_bit_pattern, 266 | const string& bit_pattern) 267 | { 268 | PCONSTRUCTOR curr_constructor = NULL; 269 | 270 | if(!m_initialized) 271 | { 272 | return NOT_INITIALIZED; 273 | } 274 | 275 | if(id >= m_constructors.size()) 276 | { 277 | cout << "Bad ID!!" 
<< endl; 278 | return -2; 279 | } 280 | 281 | curr_constructor = &m_constructors[id]; 282 | constructor_text = ""; 283 | 284 | for(unsigned int j = 0; j < curr_constructor->constructor_pieces.size(); j++) 285 | { 286 | PCONSTRUCTOR_PIECE curr_constructor_piece = NULL; 287 | 288 | curr_constructor_piece = &curr_constructor->constructor_pieces[j]; 289 | 290 | if(curr_constructor_piece->type == "print") 291 | { 292 | constructor_text += curr_constructor_piece->part; 293 | } 294 | else if(curr_constructor_piece->type == "opprint") 295 | { 296 | //cout << curr_constructor_piece->type << endl; 297 | //cout << curr_constructor_piece->id << endl; 298 | 299 | // todo change logic 300 | boost::unordered_map ::iterator itr; 301 | boost::unordered_map ::iterator itr2; 302 | 303 | itr = m_varlist_syms.find(curr_constructor_piece->id); 304 | if(itr == m_varlist_syms.end()) 305 | { 306 | itr2 = m_operand_syms.find(curr_constructor_piece->id); 307 | if(itr2 == m_operand_syms.end()) 308 | { 309 | boost::unordered_map ::iterator itr3; 310 | itr3 = m_vars.find(curr_constructor_piece->id); 311 | 312 | if(itr3 == m_vars.end()) 313 | { 314 | cout << "Failed to find " << curr_constructor_piece->id << endl; 315 | throw 1; 316 | continue; 317 | } 318 | 319 | constructor_text += itr3->second; 320 | continue; 321 | } 322 | else 323 | { 324 | if(use_bit_pattern == false) 325 | { 326 | constructor_text += "__immediate_list__"; 327 | } 328 | else 329 | { 330 | unsigned int value = 0; 331 | convertBitFieldToValue(itr2->second.bitfield, 332 | bit_pattern, 333 | value); 334 | 335 | stringstream ss; 336 | ss << setbase(16) << value; 337 | 338 | constructor_text += "0x" + ss.str(); 339 | } 340 | } 341 | 342 | continue; 343 | } 344 | else 345 | { 346 | if(use_bit_pattern == false) 347 | { 348 | constructor_text += "__register_list__"; 349 | } 350 | else 351 | { 352 | unsigned int register_index = 0; 353 | convertBitFieldToValue(itr->second.bitfield, 354 | bit_pattern, 355 | register_index); 356 | 357 
| if(register_index < itr->second.register_ids.size()) 358 | { 359 | boost::unordered_map ::iterator itr3; 360 | itr3 = m_vars.find(itr->second.register_ids[register_index]); 361 | 362 | constructor_text += itr3->second; 363 | } 364 | else 365 | { 366 | constructor_text += "___ERROR_REGISTER__INDEX__"; 367 | } 368 | } 369 | } 370 | } 371 | } // for(unsigned int j = 0; j < curr_constructor->constructor_pieces.size(); j++) 372 | 373 | return SLA_SUCCESS; 374 | } 375 | 376 | // get the constructor register by id 377 | int Slautil::getConstructorTextRegisterById(unsigned int id, 378 | string& register_name, 379 | unsigned int register_number, 380 | string& bit_pattern) 381 | { 382 | PCONSTRUCTOR curr_constructor = NULL; 383 | unsigned int registers_count = 0; 384 | 385 | // TODO: sloppy, add error handling 386 | 387 | if(!m_initialized) 388 | { 389 | return NOT_INITIALIZED; 390 | } 391 | 392 | if(id >= m_constructors.size()) 393 | { 394 | cout << "Bad ID!!" << endl; 395 | return -2; 396 | } 397 | 398 | curr_constructor = &m_constructors[id]; 399 | register_name = ""; 400 | 401 | for(unsigned int j = 0; 402 | j < curr_constructor->constructor_pieces.size(); 403 | j++) 404 | { 405 | PCONSTRUCTOR_PIECE curr_constructor_piece = NULL; 406 | curr_constructor_piece = &curr_constructor->constructor_pieces[j]; 407 | 408 | if(curr_constructor_piece->type == "opprint") 409 | { 410 | // todo change logic 411 | boost::unordered_map ::iterator itr; 412 | boost::unordered_map ::iterator itr2; 413 | 414 | itr = m_varlist_syms.find(curr_constructor_piece->id); 415 | if(itr != m_varlist_syms.end()) 416 | { 417 | if(registers_count != register_number) 418 | { 419 | registers_count++; 420 | continue; 421 | } 422 | 423 | unsigned int register_index = 0; 424 | 425 | /* 426 | cout << "itr->second.bitfield " << &itr->second.bitfield << endl; 427 | cout << "bitpattern " << bit_pattern << endl; 428 | cout << "regindex " << register_index << endl; 429 | */ 430 | 431 | 
convertBitFieldToValue(itr->second.bitfield, 432 | bit_pattern, 433 | register_index); 434 | 435 | if(register_index < itr->second.register_ids.size()) 436 | { 437 | boost::unordered_map ::iterator itr3; 438 | itr3 = m_vars.find(itr->second.register_ids[register_index]); 439 | register_name += itr3->second; 440 | return 0; 441 | } 442 | else 443 | { 444 | cout << "bad bad bad" << endl; 445 | register_name += "___ERROR_REGISTER__INDEX__"; 446 | cout << register_name << endl; 447 | throw 1; 448 | return 0; 449 | } 450 | } 451 | 452 | boost::unordered_map ::iterator itr3; 453 | itr3 = m_vars.find(curr_constructor_piece->id); 454 | if(itr3 == m_vars.end()) 455 | { 456 | cout << "Failed to find " << curr_constructor_piece->id << endl; 457 | throw 1; 458 | continue; 459 | } 460 | else 461 | { 462 | if(std::find(m_registers.begin(), m_registers.end(), itr3->second) != m_registers.end()) 463 | { 464 | if(registers_count == register_number) 465 | { 466 | register_name = itr3->second; 467 | //cout << "FOUND " << register_name << endl; 468 | //throw 1; 469 | return 0; 470 | } 471 | 472 | registers_count++; 473 | continue; 474 | } 475 | continue; 476 | } 477 | } 478 | else if(curr_constructor_piece->type == "print") 479 | { 480 | // TODO: 481 | // BUGBUG: incorrect hack 482 | if(curr_constructor_piece->part[0] == 'r' && 483 | curr_constructor_piece->part[1] == '0') 484 | { 485 | //cout << "print: " << curr_constructor_piece->part << endl; 486 | 487 | if(registers_count == register_number) 488 | { 489 | register_name = "r0"; 490 | //cout << "FOUND " << register_name << endl; 491 | return 0; 492 | } 493 | 494 | registers_count++; 495 | continue; 496 | } 497 | } 498 | } 499 | 500 | // TODO; should never get here 501 | cout << "RC " << registers_count << " register_number " << register_number << endl; 502 | cout << "Failing here" << endl; 503 | return -1; 504 | } 505 | 506 | // get a constructor mnemonic via opcode bit string 507 | int Slautil::getConstructorTextByBitPattern(const 
string& bit_pattern, 508 | string& constructor_text) 509 | { 510 | unsigned int id = 0; 511 | int result = 0; 512 | 513 | result = getConstructorIdByBitPattern(bit_pattern, id); 514 | if(result != 0) 515 | { 516 | //cout << "Failed to find bit pattern" << endl; 517 | return result; 518 | } 519 | 520 | result = getConstructorText(id, constructor_text, true, bit_pattern); 521 | if(result != 0) 522 | { 523 | //cout << "Failed to get constructor text" << endl; 524 | return result; 525 | } 526 | 527 | return SLA_SUCCESS; 528 | } 529 | 530 | // get a constructor ID by opcode bit string 531 | int Slautil::getConstructorIdByBitPattern(const string& bit_pattern, 532 | unsigned int& id) 533 | { 534 | unsigned int count; 535 | int result = 0; 536 | id = 0xffffffff; 537 | 538 | if(!m_initialized) 539 | { 540 | return NOT_INITIALIZED; 541 | } 542 | 543 | result = this->getConstructorCount(count); 544 | if(result != 0) 545 | { 546 | cout << "Failed to get constructor count" << endl; 547 | return result; 548 | } 549 | 550 | for(unsigned int i = 0; i < count; i++) 551 | { 552 | string bit_pattern2; 553 | result = this->getConstructorBitPattern(i, bit_pattern2); 554 | if(result != 0) 555 | { 556 | cout << "Failed to get bit pattern" << endl; 557 | return result; 558 | } 559 | 560 | result = this->compareBitPatterns(bit_pattern, bit_pattern2); 561 | if(result == 0) 562 | { 563 | if(id != 0xffffffff) 564 | { 565 | //cout << "Found duplicate!!" 
<< endl; 566 | //return -1; 567 | } 568 | id = i; 569 | } 570 | } 571 | 572 | if(id != 0xffffffff) 573 | { 574 | return SLA_SUCCESS; 575 | } 576 | 577 | return -1; 578 | } 579 | 580 | // compare two opcode bit patterns 581 | // has fuzzy logic for combined fields 582 | int Slautil::compareBitPatterns(const string& a, const string& b) 583 | { 584 | bool a_is_digit = true; 585 | bool b_is_digit = true; 586 | 587 | if(a.size() != b.size()) 588 | { 589 | return -1; 590 | } 591 | 592 | for(unsigned int i = 0; i < a.size(); i++) 593 | { 594 | a_is_digit = ((a[i] == '0') || (a[i] == '1')); 595 | b_is_digit = ((b[i] == '0') || (b[i] == '1')); 596 | 597 | if(a_is_digit != b_is_digit) 598 | { 599 | // one is a digit, the other isn't 600 | // this is fine 601 | continue; 602 | } 603 | 604 | // both are digits or non-digits 605 | // must be the same 606 | if(a[i] != b[i]) 607 | { 608 | return -1; 609 | } 610 | } 611 | 612 | return 0; 613 | } 614 | 615 | // converts a bit field into a value 616 | int Slautil::convertBitFieldToValue(TOKENFIELD& bitfield, 617 | const string& bit_pattern, 618 | unsigned int& value) 619 | { 620 | value = 0; 621 | 622 | unsigned int bit_pattern_end = bit_pattern.length(); 623 | 624 | if(bitfield.startbit >= bit_pattern_end || 625 | bitfield.endbit >= bit_pattern_end) 626 | { 627 | cout << "Invalid bit field\bit pattern combination!!" << endl; 628 | throw 2; 629 | } 630 | 631 | for(unsigned int i = bitfield.startbit; i <= bitfield.endbit; i++) 632 | { 633 | unsigned int bit_pos = i - bitfield.startbit; 634 | 635 | if(bit_pattern[bit_pattern_end - i - 1] == '1') 636 | { 637 | value += (1 << bit_pos); 638 | } 639 | else if(bit_pattern[bit_pattern_end - i - 1] == '0') 640 | { 641 | // don't do anything for zero 642 | } 643 | else 644 | { 645 | // TODO fix 646 | cout << "Unexpected bit string val!!" 
<< endl; 647 | throw 1; 648 | } 649 | } 650 | 651 | return 0; 652 | } 653 | -------------------------------------------------------------------------------- /slautil/slautil.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: slautil.h 3 | // 4 | // Misc helper functions for working with .sla files 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace std; 20 | namespace pt = boost::property_tree; 21 | 22 | #define SLEIGH_VERSION 4 23 | #define SLA_SUCCESS (0) 24 | #define NOT_INITIALIZED (-1) 25 | 26 | typedef struct _DECISION_PAIR 27 | { 28 | unsigned int id; 29 | unsigned int off; 30 | unsigned int nonzero; 31 | unsigned int mask; 32 | unsigned int val; 33 | } DECISION_PAIR, *PDECISION_PAIR; 34 | 35 | typedef struct _BIT_PATTERN 36 | { 37 | unsigned int start_bit; 38 | unsigned int end_bit; 39 | string pattern_type; 40 | string pattern; 41 | 42 | } BIT_PATTERN, *PBIT_PATTERN; 43 | 44 | typedef struct _TOKENFIELD 45 | { 46 | bool bigendian; 47 | bool signbit; 48 | unsigned int startbit; 49 | unsigned int endbit; 50 | unsigned int startbyte; 51 | unsigned int endbyte; 52 | unsigned int shift; 53 | 54 | } TOKENFIELD, *PTOKENFIELD; 55 | 56 | typedef struct _varlist_sym 57 | { 58 | unsigned int id; 59 | TOKENFIELD bitfield; 60 | vector register_ids; 61 | } varlist_sym, *pvarlist_sym; 62 | 63 | typedef struct _OPERAND_SYM 64 | { 65 | unsigned int id; 66 | TOKENFIELD bitfield; 67 | } OPERAND_SYM, *POPERAND_SYM; 68 | 69 | typedef struct _CONSTRUCTOR_PIECE 70 | { 71 | string type; // print or opprint 72 | unsigned int id; // needed for opprint 73 | 
string part; 74 | } CONSTRUCTOR_PIECE, *PCONSTRUCTOR_PIECE; 75 | 76 | typedef struct _CONSTRUCTOR 77 | { 78 | unsigned int id; 79 | unsigned int constructor_length; // length of the instruction in bytes 80 | unsigned int source_file; 81 | unsigned int line_number; 82 | vector constructor_pieces; 83 | vector bit_patterns; 84 | } CONSTRUCTOR, *PCONSTRUCTOR; 85 | 86 | class Slautil 87 | { 88 | public: 89 | Slautil(); 90 | 91 | int loadSla(const string& filename); 92 | int getRegisters(vector& registers); 93 | 94 | // various way to look up instructions 95 | int getConstructorCount(unsigned int& count); 96 | int getConstructorText(unsigned int id, string& constructor_text); 97 | int getConstructorBitPattern(unsigned int id, string& bit_pattern); 98 | int getConstructorTextByBitPattern(const string& bit_pattern, 99 | string& constructor_text); 100 | int getConstructorIdByBitPattern(const string& bit_pattern, 101 | unsigned int& id); 102 | int getConstructorTextRegisterById(unsigned int id, 103 | string& register_name, 104 | unsigned int register_number, 105 | string& bit_pattern); 106 | 107 | private: 108 | int loadSlaXML(const string& filename); 109 | 110 | // parsing fields within the xml 111 | int parseRegisters(void); 112 | int parseVars(void); 113 | int parseSubtableSymHeads(void); 114 | int parseConstructors(void); 115 | int parseVarlistSym(void); 116 | int parseOperandSyms(void); 117 | int parseDecisionPairs(void); 118 | int convertDecisionPairsToBitPatterns(void); 119 | int recursiveParseDecisionPairs(const boost::property_tree::ptree & subtree); 120 | int parseDecisionPair(const boost::property_tree::ptree& subtree); 121 | int addNonOpcodeBitPatterns(void); 122 | 123 | // various helper routines 124 | int getConstructorText(unsigned int id, 125 | string& constructor_text, 126 | bool use_bitpattern, 127 | const string& bit_pattern); 128 | int checkSubsym(unsigned int& id); 129 | int countAdjacentOnes(unsigned int id, 130 | unsigned int mask, 131 | unsigned int 
value); 132 | string extractBits(unsigned int start_bit, 133 | unsigned int number_of_bits, 134 | unsigned int value); 135 | int addBitPattern(CONSTRUCTOR& curr_constructor, 136 | const TOKENFIELD& bitfield, 137 | const string& type, 138 | unsigned int count); 139 | int compareBitPatterns(const string& a, const string& b); 140 | int convertBitFieldToValue(TOKENFIELD& bitfield, 141 | const string& bit_pattern, 142 | unsigned int& value); 143 | 144 | // member vars 145 | boost::unordered_map m_varlist_syms; 146 | boost::unordered_map m_operand_syms; 147 | boost::unordered_map m_subsyms; 148 | boost::unordered_map m_vars; 149 | vector m_constructors; 150 | vector m_decision_pairs; 151 | vector m_registers; 152 | unsigned int m_constructor_count; 153 | unsigned int m_sleigh_version; 154 | pt::ptree m_tree; 155 | bool m_initialized; 156 | }; 157 | -------------------------------------------------------------------------------- /slautil/slaxml.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: slautil.h 3 | // 4 | // Parsing XML SLA files 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "slautil.h" 16 | 17 | using namespace std; 18 | namespace pt = boost::property_tree; 19 | 20 | // load the XML SLA processor module 21 | int Slautil::loadSlaXML(const string& filename) 22 | { 23 | // Parse the XML into the property tree. 24 | try 25 | { 26 | pt::read_xml(filename, m_tree); 27 | } 28 | catch(...) 29 | { 30 | cout << "[-] Exception when opening sla (" << filename << ")!" 
<< endl; 31 | return -1; 32 | } 33 | 34 | m_sleigh_version = m_tree.get("sleigh..version", 0); 35 | if(m_sleigh_version != SLEIGH_VERSION) 36 | { 37 | cout << "[-] Invalid sleigh version (" << m_sleigh_version << ")!" << endl; 38 | cout << "[-] Is the .sla file correct?" << endl; 39 | return -1; 40 | } 41 | 42 | this->parseVars(); 43 | this->parseSubtableSymHeads(); 44 | this->parseOperandSyms(); 45 | this->parseConstructors(); 46 | this->parseDecisionPairs(); 47 | this->convertDecisionPairsToBitPatterns(); 48 | this->parseVarlistSym(); 49 | this->parseRegisters(); // TODO: needs to happen before add_non_opcode_bit_patterns() 50 | this->addNonOpcodeBitPatterns(); 51 | 52 | return SLA_SUCCESS; 53 | } 54 | 55 | // read the variables from the processor module 56 | int Slautil::parseVars(void) 57 | { 58 | for(auto &v : m_tree.get_child("sleigh.symbol_table")) 59 | { 60 | if(v.first != "varnode_sym_head" && 61 | v.first != "value_sym_head" && 62 | v.first != "operand_sym_head") 63 | { 64 | continue; 65 | } 66 | 67 | // todo can throw 68 | std::string name = v.second.get_child(".name").data(); 69 | std::string id_str = v.second.get_child(".id").data(); 70 | 71 | unsigned int id = stoi(id_str, 0, 0x10); 72 | 73 | m_vars.emplace(id, name); 74 | } 75 | 76 | return SLA_SUCCESS; 77 | } 78 | 79 | // read the subtable sym heads from the processor module 80 | int Slautil::parseSubtableSymHeads(void) 81 | { 82 | for(auto &v : m_tree.get_child("sleigh.symbol_table")) 83 | { 84 | size_t pos = 0; 85 | 86 | if(v.first != "subtable_sym_head") 87 | { 88 | continue; 89 | } 90 | 91 | // todo can throw 92 | std::string name = v.second.get_child(".name").data(); 93 | std::string id_str = v.second.get_child(".id").data(); 94 | 95 | // silly workaround to support instructions that reference the same reg more than once 96 | pos = name.find("_dup"); 97 | if (pos == std::string::npos) 98 | { 99 | continue; 100 | } 101 | 102 | name.resize(pos); 103 | 104 | unsigned int id = stoi(id_str, 0, 
0x10); 105 | m_vars.emplace(id, name); 106 | } 107 | 108 | return SLA_SUCCESS; 109 | } 110 | 111 | // read the operand syms from the processor module 112 | int Slautil::parseOperandSyms(void) 113 | { 114 | for(auto &operand_sym_node : m_tree.get_child("sleigh.symbol_table")) 115 | { 116 | OPERAND_SYM curr_operand_sym; 117 | std::string var_subsym_id_str; 118 | std::string var_id_str; 119 | 120 | if(operand_sym_node.first != "operand_sym") 121 | { 122 | continue; 123 | } 124 | 125 | var_id_str = operand_sym_node.second.get(".id", ""); 126 | if(var_id_str == "") 127 | { 128 | continue; 129 | } 130 | unsigned int var_id = stoi(var_id_str, 0, 0x10); 131 | 132 | var_subsym_id_str = operand_sym_node.second.get(".subsym", ""); 133 | if(var_subsym_id_str != "") 134 | { 135 | unsigned int var_subsym_id = stoi(var_subsym_id_str, 0, 0x10); 136 | m_subsyms[var_id] = var_subsym_id; 137 | continue; 138 | } 139 | 140 | curr_operand_sym.id = var_id; 141 | curr_operand_sym.bitfield.startbit = operand_sym_node.second.get("tokenfield..startbit", 0); 142 | curr_operand_sym.bitfield.endbit = operand_sym_node.second.get("tokenfield..endbit", 0); 143 | curr_operand_sym.bitfield.startbyte = operand_sym_node.second.get("tokenfield..startbyte", 0); 144 | curr_operand_sym.bitfield.endbyte = operand_sym_node.second.get("tokenfield..endbyte", 0); 145 | curr_operand_sym.bitfield.shift = operand_sym_node.second.get("tokenfield..shift", 0); 146 | 147 | m_operand_syms[curr_operand_sym.id] = curr_operand_sym; 148 | } 149 | 150 | return SLA_SUCCESS; 151 | } 152 | 153 | // read the instruction constructors from the processor module 154 | int Slautil::parseConstructors(void) 155 | { 156 | m_constructor_count = m_tree.get("sleigh.symbol_table.subtable_sym..numct", 0); 157 | 158 | for(auto &constructor_node : m_tree.get_child("sleigh.symbol_table.subtable_sym")) 159 | { 160 | CONSTRUCTOR temp_constructor = {}; 161 | vector ids; 162 | 163 | if(constructor_node.first != "constructor") 164 | { 165 | 
continue; 166 | } 167 | 168 | temp_constructor.constructor_length = constructor_node.second.get(".length", 0); 169 | temp_constructor.source_file = constructor_node.second.get(".source", 0); 170 | temp_constructor.line_number = constructor_node.second.get(".line", 0); 171 | 172 | // todo: should we check constructor.parent = 0? 173 | 174 | for(auto &constructor_node_child : constructor_node.second) 175 | { 176 | if(constructor_node_child.first == "") 177 | { 178 | continue; 179 | } 180 | else if(constructor_node_child.first == "construct_tpl") 181 | { 182 | continue; 183 | } 184 | else if(constructor_node_child.first == "oper") 185 | { 186 | string id_str; 187 | unsigned int id = 0; 188 | 189 | id_str = constructor_node_child.second.get(".id", ""); 190 | id = stoi(id_str, NULL, 0x10); 191 | ids.push_back(id); 192 | } 193 | else if(constructor_node_child.first == "print") 194 | { 195 | CONSTRUCTOR_PIECE temp_constructor_piece; 196 | 197 | temp_constructor_piece.type = "print"; 198 | temp_constructor_piece.id = -1; 199 | temp_constructor_piece.part = constructor_node_child.second.get(".piece", ""); 200 | 201 | temp_constructor.constructor_pieces.push_back(temp_constructor_piece); 202 | } 203 | else if(constructor_node_child.first == "opprint") 204 | { 205 | string id_str; 206 | unsigned int id = 0; 207 | unsigned int id2 = 0; 208 | 209 | id_str = constructor_node_child.second.get(".id", ""); 210 | id = stoi(id_str); 211 | 212 | CONSTRUCTOR_PIECE temp_constructor_piece; 213 | 214 | id2 = ids[id]; 215 | 216 | checkSubsym(id2); 217 | 218 | string var = m_vars[id2]; 219 | 220 | temp_constructor_piece.type = "opprint"; 221 | temp_constructor_piece.id = id2; 222 | temp_constructor_piece.part = var; 223 | 224 | // part?? 
225 | 226 | temp_constructor.constructor_pieces.push_back(temp_constructor_piece); 227 | } 228 | else 229 | { 230 | cout << "Unknown constructor node child: " << constructor_node_child.first << endl; 231 | return -2; 232 | } 233 | } 234 | m_constructors.push_back(temp_constructor); 235 | } 236 | 237 | if(m_constructor_count != m_constructors.size()) 238 | { 239 | cout << "Invalid constructors: " << m_constructor_count << " " << m_constructors.size() << endl; 240 | return -2; 241 | } 242 | 243 | return SLA_SUCCESS; 244 | } 245 | 246 | // parse the decision pairs from the processor module 247 | // decision pairs are used to differentiate instructions via their opcode 248 | int Slautil::parseDecisionPairs(void) 249 | { 250 | m_decision_pairs.resize(m_constructor_count); 251 | 252 | const boost::property_tree::ptree & subtree = m_tree.get_child("sleigh.symbol_table.subtable_sym.decision"); 253 | this->recursiveParseDecisionPairs(subtree); 254 | 255 | return SLA_SUCCESS; 256 | } 257 | 258 | // decision pairs can be recursively defined 259 | int Slautil::recursiveParseDecisionPairs(const boost::property_tree::ptree& subtree) 260 | { 261 | //TODO: why use boost foreach?? 262 | for(auto &v : subtree) 263 | { 264 | if(v.first == "decision") 265 | { 266 | this->recursiveParseDecisionPairs(v.second); 267 | } 268 | else if(v.first == "pair") 269 | { 270 | this->parseDecisionPair(v.second); 271 | } 272 | else if(v.first == "") 273 | { 274 | continue; 275 | } 276 | else 277 | { 278 | cout << "Unknown value!!" 
<< v.first << endl; 279 | return -1; 280 | } 281 | } 282 | 283 | return 0; 284 | } 285 | 286 | // parse an individual decision pair 287 | int Slautil::parseDecisionPair(const boost::property_tree::ptree& subtree) 288 | { 289 | DECISION_PAIR decision_pair = {}; 290 | 291 | // todo error checking 292 | decision_pair.id = subtree.get(".id", 0); 293 | decision_pair.off = subtree.get("instruct_pat.pat_block..off", 0); 294 | decision_pair.nonzero = subtree.get("instruct_pat.pat_block..nonzero", 0); 295 | string mask = subtree.get("instruct_pat.pat_block.mask_word..mask", ""); 296 | string val = subtree.get("instruct_pat.pat_block.mask_word..val", ""); 297 | 298 | decision_pair.mask = stol(mask, NULL, 0x10); 299 | decision_pair.val = stol(val, NULL, 0x10); 300 | 301 | m_decision_pairs[decision_pair.id] = decision_pair; 302 | 303 | return 0; 304 | } 305 | 306 | // convert the decision pairs into opcode bit patterns 307 | int Slautil::convertDecisionPairsToBitPatterns(void) 308 | { 309 | for(unsigned int i = 0; i < m_constructors.size(); i++) 310 | { 311 | //cout << i << ")" << endl; 312 | unsigned int constructor_length = 0; 313 | PDECISION_PAIR curr_decision_pair = NULL; 314 | unsigned int shift_value = 0; 315 | unsigned int mask = 0; 316 | unsigned int value = 0; 317 | 318 | curr_decision_pair = &m_decision_pairs[i]; 319 | constructor_length = m_constructors[i].constructor_length; 320 | 321 | //cout << curr_decision_pair->id << " " << curr_decision_pair->mask << " " << curr_decision_pair->val << endl; 322 | 323 | if(curr_decision_pair->nonzero > 4) 324 | { 325 | cout << "Invalid decision nonzero amount!!" << endl; 326 | return -3; 327 | } 328 | 329 | if(constructor_length <= curr_decision_pair->off) 330 | { 331 | cout << "Invalid decision offset amount!!" 
<< endl; 332 | return -4; 333 | } 334 | 335 | shift_value = curr_decision_pair->off * 8; 336 | mask = curr_decision_pair->mask >> shift_value; 337 | value = curr_decision_pair->val >> shift_value; 338 | 339 | countAdjacentOnes(i, mask, (value & mask)); 340 | } 341 | 342 | return SLA_SUCCESS; 343 | } 344 | 345 | // read the varlist syms from the processor module 346 | int Slautil::parseVarlistSym(void) 347 | { 348 | for(auto &varlist_sym_node : m_tree.get_child("sleigh.symbol_table")) 349 | { 350 | varlist_sym curr_varlist_sym; 351 | 352 | if(varlist_sym_node.first != "varlist_sym") 353 | { 354 | continue; 355 | } 356 | 357 | //cout << varlist_sym_node.first << endl; 358 | 359 | std::string id_str = varlist_sym_node.second.get(".id", ""); 360 | curr_varlist_sym.id = stoi(id_str, 0, 0x10); 361 | 362 | curr_varlist_sym.bitfield.startbit = varlist_sym_node.second.get("tokenfield..startbit", 0); 363 | curr_varlist_sym.bitfield.endbit = varlist_sym_node.second.get("tokenfield..endbit", 0); 364 | curr_varlist_sym.bitfield.startbyte = varlist_sym_node.second.get("tokenfield..startbyte", 0); 365 | curr_varlist_sym.bitfield.endbyte = varlist_sym_node.second.get("tokenfield..endbyte", 0); 366 | curr_varlist_sym.bitfield.shift = varlist_sym_node.second.get("tokenfield..shift", 0); 367 | 368 | //cout << curr_varlist_sym.bitfield.startbit << " " << curr_varlist_sym.bitfield.endbit << endl; 369 | 370 | for(auto &var_node : varlist_sym_node.second) 371 | { 372 | if(var_node.first != "var") 373 | { 374 | continue; 375 | } 376 | 377 | std::string var_id_str = var_node.second.get(".id", ""); 378 | unsigned int var_id = stoi(var_id_str, 0, 0x10); 379 | curr_varlist_sym.register_ids.push_back(var_id); 380 | 381 | } 382 | 383 | m_varlist_syms[curr_varlist_sym.id] = curr_varlist_sym; 384 | } 385 | 386 | return SLA_SUCCESS; 387 | } 388 | 389 | // read the registers from the processor module 390 | int Slautil::parseRegisters(void) 391 | { 392 | for(auto &v : 
m_tree.get_child("sleigh.symbol_table")) 393 | { 394 | if(v.first != "varnode_sym") 395 | { 396 | continue; 397 | } 398 | 399 | // TODO: throws if missing 400 | std::string space = v.second.get_child(".space").data(); 401 | if(space != "register") 402 | { 403 | continue; 404 | } 405 | //cout << space << endl; 406 | 407 | std::string id_str = v.second.get_child(".id").data(); 408 | 409 | unsigned int id = stoi(id_str, 0, 0x10); 410 | 411 | boost::unordered_map ::iterator itr; 412 | 413 | itr = m_vars.find(id); 414 | if(itr == m_vars.end()) 415 | { 416 | cout << "Failed to find " << id << "!!" << endl; 417 | return -1; 418 | } 419 | 420 | m_registers.push_back(itr->second); 421 | } 422 | 423 | return 0; 424 | } 425 | 426 | // helper function to count the number of adjacent ones in a bitmask 427 | int Slautil::countAdjacentOnes(unsigned int id, 428 | unsigned int mask, 429 | unsigned int value) 430 | { 431 | unsigned int count = 0; 432 | 433 | for(unsigned int i = 0; i < 32; i++) 434 | { 435 | bool bit_on = (mask & (1 << i)); 436 | 437 | if(bit_on) 438 | { 439 | count += 1; 440 | } 441 | 442 | if(!bit_on) 443 | { 444 | if(count != 0) 445 | { 446 | BIT_PATTERN temp_bit_pattern; 447 | 448 | /* 449 | cout << "opcode_"; 450 | cout << (i - count); 451 | cout << "_"; 452 | cout << (i - 1); 453 | cout << "= " << endl; 454 | */ 455 | 456 | // TODO: make this a func 457 | temp_bit_pattern.pattern_type = "opcode"; 458 | temp_bit_pattern.start_bit = i - count; 459 | temp_bit_pattern.end_bit = i - 1; 460 | temp_bit_pattern.pattern = extractBits(temp_bit_pattern.start_bit, 461 | temp_bit_pattern.end_bit, 462 | value); 463 | 464 | m_constructors[id].bit_patterns.push_back(temp_bit_pattern); 465 | } 466 | count = 0; 467 | } 468 | } 469 | 470 | if(count != 0) 471 | { 472 | BIT_PATTERN temp_bit_pattern; 473 | 474 | /* 475 | cout << "opcode_"; 476 | cout << (32 - count); 477 | cout << "_"; 478 | cout << (32 - 1); 479 | cout << "= " << endl; 480 | */ 481 | 482 | // TODO: make this a 
func 483 | temp_bit_pattern.pattern_type = "opcode"; 484 | temp_bit_pattern.start_bit = 33 - count - 1; 485 | temp_bit_pattern.end_bit = 32 - 1; 486 | temp_bit_pattern.pattern = extractBits(temp_bit_pattern.start_bit, 487 | temp_bit_pattern.end_bit, 488 | value); 489 | 490 | m_constructors[id].bit_patterns.push_back(temp_bit_pattern); 491 | } 492 | 493 | return SLA_SUCCESS; 494 | } 495 | 496 | // convert a number into a bit string 497 | string Slautil::extractBits(unsigned int start_bit, 498 | unsigned int end_bit, 499 | unsigned int value) 500 | { 501 | string bit_string = ""; 502 | 503 | for(unsigned int i = start_bit; i <= end_bit; i++) 504 | { 505 | if(value & (1 << i)) 506 | { 507 | bit_string.insert(0, 1, '1'); 508 | } 509 | else 510 | { 511 | bit_string.insert(0, 1, '0'); 512 | } 513 | } 514 | 515 | if(bit_string.length() == 0) 516 | { 517 | cout << "Invalid extract bits!!" << endl; 518 | cout << start_bit << " " << end_bit << endl; 519 | throw 1; 520 | } 521 | 522 | return bit_string; 523 | } 524 | 525 | // remap a subsym if necessary 526 | int Slautil::checkSubsym(unsigned int& id) 527 | { 528 | boost::unordered_map ::iterator itr; 529 | itr = m_subsyms.find(id); 530 | if(itr == m_subsyms.end()) 531 | { 532 | return -3; 533 | } 534 | 535 | //cout << "replace " << id << " with " << itr->second << endl; 536 | id = itr->second; 537 | 538 | return SLA_SUCCESS; 539 | } 540 | -------------------------------------------------------------------------------- /thread_pool.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: parser.cpp 3 | // 4 | // Thread pool helpers 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //------------------------------------------------------------------------------ 9 | #include "thread_pool.h" 10 | #include 11 | #include 12 | 13 | // count of successful and failed worker jobs 14 | // use atomic for thread-safety 15 | static boost::atomic g_CompletedCount = 0; 16 | static boost::atomic g_FailureCount = 0; 17 | 18 | // reset thread pool counters 19 | void resetThreadPool(void) 20 | { 21 | g_FailureCount = 0; 22 | g_CompletedCount = 0; 23 | } 24 | 25 | // increment the number of worker completions 26 | void incrementWorkerCompletions(void) 27 | { 28 | g_CompletedCount++; 29 | } 30 | 31 | // get the number of completed workers 32 | unsigned int getWorkerCompletions(void) 33 | { 34 | return g_CompletedCount; 35 | } 36 | 37 | // increment the number of failures 38 | void incrementWorkerFailures(void) 39 | { 40 | g_FailureCount++; 41 | } 42 | 43 | // get the number of failures 44 | unsigned int getWorkerFailures(void) 45 | { 46 | return g_FailureCount; 47 | } 48 | -------------------------------------------------------------------------------- /thread_pool.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: thread_pool.h 3 | // 4 | // Thread pool helpers 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 
8 | //------------------------------------------------------------------------------ 9 | #pragma once 10 | 11 | #include 12 | #include 13 | 14 | void resetThreadPool(void); 15 | void incrementWorkerFailures(void); 16 | unsigned int getWorkerFailures(void); 17 | void incrementWorkerCompletions(void); 18 | unsigned int getWorkerCompletions(void); 19 | -------------------------------------------------------------------------------- /validator.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // File: validator.cpp 3 | // 4 | // Handles command line argument parsing invoking the disassembly routine. 5 | // 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved. 7 | // Licensed under the Apache 2.0 License. 8 | //----------------------------------------------------------------------------- 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | using namespace std; 17 | 18 | // This is a tiny LoadImage class which feeds the executable bytes to the translator 19 | // Taken straight from sleighexample.cc 20 | class MyLoadImage : public LoadImage { 21 | uintb baseaddr; 22 | int4 length; 23 | uint1 *data; 24 | public: 25 | MyLoadImage(uintb ad,uint1 *ptr,int4 sz) : LoadImage("nofile") { baseaddr = ad; data = ptr; length = sz; } 26 | virtual void loadFill(uint1 *ptr,int4 size,const Address &addr); 27 | virtual string getArchType(void) const { return "myload"; } 28 | virtual void adjustVma(long adjust) { } 29 | }; 30 | 31 | // This is the only important method for the LoadImage. 
It returns bytes from the static array 32 | // depending on the address range requested 33 | void MyLoadImage::loadFill(uint1 *ptr,int4 size,const Address &addr) 34 | 35 | { 36 | uintb start = addr.getOffset(); 37 | uintb max = baseaddr + (length-1); 38 | for(int4 i=0;imax)) { // If byte does not fall in window 41 | ptr[i] = 0; // return 0 42 | continue; 43 | } 44 | uintb diff = curoff - baseaddr; 45 | ptr[i] = data[(int4)diff]; // Otherwise return data from our window 46 | } 47 | } 48 | 49 | // Here is a simple class for emitting assembly. In this case, we send the strings straight 50 | // to standard out. 51 | class AssemblyRaw : public AssemblyEmit { 52 | public: 53 | virtual void dump(const Address &addr,const string &mnem,const string &body) { 54 | disassembly = mnem + " " + body; 55 | boost::trim(disassembly); 56 | } 57 | string disassembly; 58 | }; 59 | 60 | // converts unsigned char to two byte hex value 61 | #define CHAR2HEX( x ) setw(2) << setfill('0') << uppercase << hex << (unsigned int)x 62 | 63 | int parseInputAndDisassemble(string& inputFilename, string& outputFilename, string& slaFilename); 64 | int convertOpcodeToBinary(string& opcode, vector& opcodeBytes); 65 | int convertHexNibbletoInteger(unsigned char x); 66 | int sleighDisassemble(string& slaFilename, vector& opcodeBytes, string& disassembly); 67 | 68 | int main(int argc, char *argv[]) 69 | { 70 | boost::program_options::options_description desc{"Ghidra Processor Module Generator Validator"}; 71 | boost::program_options::variables_map args; 72 | string inputFilename; 73 | string outputFilename; 74 | string slaFilename; 75 | int result = 0; 76 | 77 | cout << "Ghidra Processor Module Generator Validator" << endl; 78 | 79 | // 80 | // command line arg parsing 81 | // 82 | 83 | try 84 | { 85 | desc.add_options() 86 | ("input-file,i", boost::program_options::value(&inputFilename), "Path to a newline delimited text file containing all opcodes and instructions for the processor module. 
Required.") 87 | ("output-file,o",boost::program_options::value(&outputFilename)->default_value("output.txt"), "Output file. Defaults to output.txt if not specified.") 88 | ("sla-file,s",boost::program_options::value(&slaFilename), "Path to the compiled processor .sla.") 89 | ("help,h", "Help screen"); 90 | 91 | store(parse_command_line(argc, argv, desc), args); 92 | notify(args); 93 | 94 | if(args.count("help") || argc == 1) 95 | { 96 | cout << desc << endl; 97 | return 0; 98 | } 99 | 100 | if(args.count("input-file") == 0) 101 | { 102 | cout << "Input file name is required!!" << endl; 103 | return -1; 104 | } 105 | 106 | if(args.count("sla-file") == 0) 107 | { 108 | cout << "Sla file name is required!!" << endl; 109 | return -1; 110 | } 111 | } 112 | catch (const boost::program_options::error &ex) 113 | { 114 | cout << "[-] Error parsing command line: " << ex.what() << endl; 115 | return -1; 116 | } 117 | 118 | cout << "[*] Input file: " << inputFilename << endl; 119 | cout << "[*] Compiled SLA file: " << slaFilename << endl; 120 | cout << "[*] Outputting (might take a while) to: " << outputFilename << endl; 121 | 122 | result = parseInputAndDisassemble(inputFilename, outputFilename, slaFilename); 123 | if(result != 0) 124 | { 125 | return result; 126 | } 127 | 128 | cout << "[*] Successfully created output disassembly file. Diff input and output files to find errors in the SLA." 
<< endl; 129 | return 0; 130 | } 131 | 132 | // Parses the input file for addresses and passes it to the SLEIGH disassembler for output 133 | int parseInputAndDisassemble(string& inputFilename, string& outputFilename, string& slaFilename) 134 | { 135 | unsigned int lineNum = 0; 136 | int result = 0; 137 | std::string line; 138 | 139 | // open the input file for parsing 140 | boost::filesystem::path infile{inputFilename}; 141 | boost::filesystem::ifstream ifs{infile}; 142 | 143 | boost::filesystem::path outfile{outputFilename}; 144 | boost::filesystem::ofstream ofs{outfile}; 145 | 146 | if(!ifs) 147 | { 148 | cout << "[-] Failed to open input file!!" << endl; 149 | return -1; 150 | } 151 | 152 | if(!ofs) 153 | { 154 | cout << "[-] Failed to open output file!!" << endl; 155 | return -1; 156 | } 157 | 158 | // 159 | // parse the input file line by line 160 | // 161 | while (std::getline(ifs, line)) 162 | { 163 | vector lineSplit; 164 | vector opcodeBytes; 165 | string disassembly; 166 | 167 | lineNum++; 168 | 169 | // split the line into components 170 | boost::split(lineSplit, line, boost::algorithm::is_space(), boost::token_compress_on); 171 | 172 | if(lineSplit.size() < 1) 173 | { 174 | continue; 175 | } 176 | 177 | result = convertOpcodeToBinary(lineSplit[0], opcodeBytes); 178 | if(result != 0) 179 | { 180 | cout << "Failed to covert opcode!!" 
<< endl; 181 | goto exit; 182 | } 183 | 184 | result = sleighDisassemble(slaFilename, opcodeBytes, disassembly); 185 | if(result != 0) 186 | { 187 | goto exit; 188 | } 189 | 190 | ofs << "0x"; 191 | for (auto& x: opcodeBytes) 192 | { 193 | ofs << CHAR2HEX(x); 194 | } 195 | ofs << " " << disassembly; 196 | ofs << endl; 197 | } 198 | 199 | result = 0; 200 | 201 | exit: 202 | ifs.close(); 203 | ofs.close(); 204 | return result; 205 | } 206 | 207 | // disassembles opcode bytes using the passed in SLA file 208 | int sleighDisassemble(string& slaFilename, vector& opcodeBytes, string& disassembly) 209 | { 210 | unsigned char buffer[4096] = {0}; 211 | 212 | // initialize instruction to disassemble 213 | for(unsigned int i = 0; i < opcodeBytes.size(); i++) 214 | { 215 | buffer[i] = opcodeBytes[i]; 216 | } 217 | 218 | // instantiate sleigh 219 | try 220 | { 221 | MyLoadImage loader(0, (uint1*)buffer, sizeof(buffer)); 222 | 223 | // Set up the context object 224 | ContextInternal context; 225 | 226 | // Set up the disassembler 227 | Sleigh trans(&loader, &context); 228 | 229 | // Read sleigh file into DOM 230 | DocumentStorage docstorage; 231 | Element *sleighroot = docstorage.openDocument(slaFilename)->getRoot(); 232 | docstorage.registerTag(sleighroot); 233 | trans.initialize(docstorage); // Initialize the translator 234 | 235 | AssemblyRaw assememit; // Set up the disassembly dumper 236 | Address addr(trans.getDefaultCodeSpace(), 0); // First disassembly address 237 | 238 | // dump the disassembly now 239 | trans.printAssembly(assememit, addr); 240 | disassembly = assememit.disassembly; 241 | } 242 | catch(XmlError e) 243 | { 244 | cout << "Failed to instantiate SLEIGH. Is processor SLA invalid?" << endl; 245 | return -1; 246 | } 247 | catch(BadDataError e) 248 | { 249 | // disassembly error, just report it as a success so it appears in the output 250 | disassembly = "Error"; 251 | return 0; 252 | } 253 | catch(...) 
254 | { 255 | cout << "Unknown error during disassembly!!\n"; 256 | return -3; 257 | } 258 | 259 | return 0; 260 | } 261 | 262 | // converts an opcode in the of 0xaabb... or 0b0011... to a an array of raw bytes 263 | int convertOpcodeToBinary(string& opcode, vector& opcodeBytes) 264 | { 265 | int opcodeLength = 0; 266 | 267 | // opcode must begin with 0x or 0b 268 | if(opcode[0] != '0') 269 | { 270 | cout << "Opcode must begin with 0x or 0b!!" << endl; 271 | return -1; 272 | } 273 | 274 | if(opcode[1] == 'x' || opcode[1] == 'X') 275 | { 276 | opcodeLength = opcode.length() - 2; 277 | if((opcodeLength % 2) != 0) 278 | { 279 | cout << "Hex opcode length must be divisble by 2!!" << endl; 280 | return -2; 281 | } 282 | 283 | // loop through the hex string, converting each byte 284 | for(unsigned int i = 2; i < opcode.length(); i += 2) 285 | { 286 | unsigned char value; 287 | unsigned char high; 288 | unsigned char low; 289 | 290 | // convert the hex string to a byte 291 | high = convertHexNibbletoInteger(opcode[i]); 292 | low = convertHexNibbletoInteger(opcode[i+1]); 293 | 294 | value = (high << 4) | low; 295 | 296 | opcodeBytes.push_back(value); 297 | } 298 | 299 | return 0; 300 | } 301 | else if(opcode[1] == 'b' || opcode[1] == 'B') 302 | { 303 | opcodeLength = opcode.length() - 2; 304 | if((opcodeLength % 8) != 0) 305 | { 306 | cout << "Binary opcode length must be divisble by 8!!" << endl; 307 | return -2; 308 | } 309 | 310 | // loop through the bit string, converting each byte 311 | for(unsigned int i = 2; i < opcode.length(); i += 8) 312 | { 313 | unsigned char value = 0; 314 | 315 | for(unsigned int j = 0; j < 8; j++) 316 | { 317 | value = value << 1; 318 | if(opcode[i + j] == '1') 319 | { 320 | value = value | 1; 321 | } 322 | } 323 | opcodeBytes.push_back(value); 324 | } 325 | 326 | return 0; 327 | } 328 | else 329 | { 330 | cout << "Opcode must begin with 0x or 0b!!" 
<< endl; 331 | return -1; 332 | } 333 | 334 | return 0; 335 | } 336 | 337 | // simple utility to convert an ascii hex char to decimal 338 | int convertHexNibbletoInteger(unsigned char x) 339 | { 340 | if(x >= '0' && x <= '9') 341 | { 342 | return x - '0'; 343 | } 344 | 345 | if(x >= 'A' && x <= 'F') 346 | { 347 | return x - 'A' + 0xa; 348 | } 349 | 350 | if(x >= 'a' && x <= 'f') 351 | { 352 | return x - 'a' + 0xa; 353 | } 354 | 355 | return 0; 356 | } 357 | --------------------------------------------------------------------------------