├── LICENSE
├── Makefile
├── README.md
├── bitspan.cpp
├── bitspan.h
├── combine.cpp
├── combine.h
├── examples
    ├── 8048.txt
    ├── ethereum.txt
    ├── sh2.sla
    └── sh2.txt
├── instruction.cpp
├── instruction.h
├── main.cpp
├── output.cpp
├── output.h
├── parser.cpp
├── parser.h
├── parser_sla.cpp
├── parser_sla.h
├── registers.h
├── slautil
    ├── slautil.cpp
    ├── slautil.h
    └── slaxml.cpp
├── thread_pool.cpp
├── thread_pool.h
└── validator.cpp


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2020 Oberoi Security Solutions
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++
 2 | CXXFLAGS=-O3 -pipe -march=native -flto=auto -Wall -Wextra -Wunused -Wunused-but-set-parameter -Wunused-but-set-variable -Wunused-function -I $(GHIDRA_TRUNK)/Ghidra/Features/Decompiler/src/decompile/cpp/
 3 | DEPS = bitspan.h combine.h instruction.h output.h parser.h parser_sla.h registers.h thread_pool.h validator.h
 4 | OBJ = main.o bitspan.o combine.o instruction.o output.o parser.o parser_sla.o thread_pool.o slautil/slautil.o slautil/slaxml.o
 5 | LIBS=-lboost_system -lboost_filesystem -lboost_regex -lboost_program_options -lboost_thread -lboost_timer
 6 | VALIDATOR-DEPS = loadimage.hh sleigh.hh
 7 | VALIDATOR-OBJ = validator.o
 8 | VALIDATOR-LIBS= -lboost_system -lboost_filesystem -lboost_program_options -L . $(GHIDRA_TRUNK)/Ghidra/Features/Decompiler/src/decompile/cpp/libsla.a
 9 | 
10 | 
11 | all: generator generator-validator
12 | 
13 | validator.o: validator.cpp $(VALIDATOR_DEPS)
14 | 	$(CXX) -c -o $@ $< $(CXXFLAGS) $(VALIDATOR-LIBS)
15 | 
16 | 
17 | %.o: %.cpp $(DEPS)
18 | 	$(CXX) -c -o $@ $< $(CXXFLAGS) $(LIBS)
19 | 
20 | generator: $(OBJ)
21 | 	$(CXX) -o $@ $^ $(CXXFLAGS) $(LIBS)
22 | 
23 | generator-validator: $(VALIDATOR-OBJ)
24 | 	$(CXX) -o $@ $^ $(CXXFLAGS) $(VALIDATOR-LIBS)
25 | 
26 | .PHONY: clean
27 | clean:
28 | 	rm -f *.o slautil/*.o generator generator-validator
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Generator
  2 | 
  3 | Ghidra Processor Module Generator (Generator) is a step towards automating the creation of [Ghidra](https://github.com/NationalSecurityAgency/ghidra) processor modules. Generator takes as input one or more text files containing all disassembled instructions for a given instruction set architecture (ISA) and outputs a processor module directory that can be loaded into Ghidra. Specifically Generator:
  4 | 
  5 | * combines duplicate instructions
  6 | * combines instructions which differ by an immediate value
  7 | * combines instructions which differ by a register
  8 | * creates a valid processor module directory that includes:
  9 |   * Module.manifest
 10 |   * .slaspec
 11 |   * .cspec
 12 |   * .ldefs
 13 |   * .pspec
 14 | 
 15 | The outputted processor module will be able to disassemble for your given architecture. As all instructions will have an empty p-code definition, Ghidra's decompiler will obviously not work. Generator only supports 1-4 byte ISAs. Both fixed length and variable length ISAs are supported.
 16 | 
 17 | ## Performance
 18 | Generator's runtime varies based on the size/number of instructions of the input ISA. Generator is multithreaded and by default will use all available cores.
 19 | 
 20 | |ISA Size|Time|Ram Usage|Notes|
 21 | |---|---:|---:|---|
 22 | |1 Byte|<1 sec|<1 GB||
 23 | |2 Byte|<2 sec|<1 GB||
 24 | |3 Byte|~60 sec|~40 GB||
 25 | |4 Byte|~4-5 hours|~40 GB|Requires 4-byte ISA instructions|
 26 | 
 27 | Numbers are from an AMD Ryzen 9 7950X3D 16-Core Processor, 128 GB RAM, with NVMe SSD.
 28 | 
 29 | ## Usage
 30 | ### Overview
 31 | The high-level steps for running Generator on 1-3 byte ISAs are to:
 32 | 
 33 | 1) Create a newline delimited text file that contains a list of all valid hex opcodes and instructions
 34 | 2) Run Generator on the text file
 35 | 3) Copy the created processor module directory to your Ghidra/Processors directory
 36 | 4) Launch Ghidra. Your processor will show up in the list of supported processors
 37 | 
 38 | See "Usage (1-3 Byte ISAs)" for detailed instructions. 4 byte ISAs require additional steps. See "Usage (4 Byte ISAs)". 
 39 | 
 40 | ### Generator Command-Line Arguments
 41 | |Command||
 42 | |---|---|
 43 | |-i [ --input-disassembly ] arg|Path to a newline delimited text file containing all opcodes and instructions for the processor module|
 44 | |--input-disassembly-dir arg|Path to a directory with multiple newline delimited text files containing all opcodes and instructions for the processor module|
 45 | |-s [ --input-sleigh ] arg|Path to a XML .sla file containing all opcodes and instructions for the processor module|
 46 | |--input-sleigh-dir arg|Path to a directory with multiple XML .sla files containing all opcodes and instructions for the processor module|
 47 | |-t [ --num-threads ] arg|Number of worker threads to use. Optional. Defaults to number of physical CPUs if not specified|
 48 | |-n [ --processor-name ] arg|Name of the target processor. Defaults to "MyProc" if not specified|
 49 | |-f [ --processor-family ] arg|Name of the target processor's family. Defaults to "MyProcFamily" if not specified|
 50 | |-e [ --endian ] arg|Endianness of the processor. Must be either "little" or "big". Defaults to big if not specified|
 51 | |-a [ --alignment ] arg|Instruction alignment of the processor. Defaults to 1 if not specified|
 52 | |-b [ --bitness ] arg|Bitness of the processor. Defaults to 32 if not specified|
 53 | |--print-registers-only|Only print parsed registers. Useful for debugging purposes. False by default|
 54 | |--omit-opcodes|Don't print opcodes in the outputted .sla file. False by default|
 55 | |--omit-example-instructions|Don't print example combined instructions in the outputted .sla file. False by default|
 56 | |--skip-instruction-combining|Don't combine instructions. Useful for debugging purposes. False by default|
 57 | |--additional-registers arg|List of additional registers. Use this option if --print-registers-only is missing registers for your instruction set|
 58 | |-h [ --help ]|Help screen|
 59 | 
 60 | ### Usage (1-3 Byte ISAs)
 61 | 1) Create a newline delimited text file that contains a list of all valid hex opcodes + instructions. Example (SuperH SH-2):  
 62 | > 0x0002 stc sr,r0  
 63 | > 0x0003 bsrf r0  
 64 | > 0x0004 mov.b r0,@(r0,r0)  
 65 | > 0x0005 mov.w r0,@(r0,r0)  
 66 | > 0x0006 mov.l r0,@(r0,r0)  
 67 | > ...  
 68 | > ...  
 69 | > 0xEFFE mov #-0x2,r15  
 70 | > 0xEFFF mov #-0x1,r15  
 71 | 
 72 | Exclude any invalid instructions. The opcode must begin with 0x and must be byte aligned.  
 73 | 2) Run Generator with the `--print-registers-only` flag: `generator --input-disassembly examples/sh2.txt --print-registers-only`. This flag parses all the instructions and will print out only the registers. Verify the output is correct before proceeding.
 74 | 
 75 | Ex:  
 76 | 
 77 | > ./generator --input-disassembly examples/sh2.txt --print-registers-only  
 78 | > Ghidra Processor Module Generator  
 79 | > [\*] Using 16 worker thread(s)  
 80 | > [\*] Initializing default Ghidra registers  
 81 | > [\*] Parsing instructions examples/sh2.txt  
 82 | > [\*] Updating bit length from 0 to 16  
 83 | > [\*] Parsed 53752 instructions  
 84 | > [\*] Found registers: gbr mach macl pc pr r0 r1 r10 r11 r12 r13 r14 r15 r2 r3 r4 r5 r6 r7 r8 r9 sr vbr  
 85 | > [\*] Found mnemonics: # ( ) + , - @ add addc addv and and.b bf bf/s bra braf bsr bsrf bt bt/s clrmac clrt cmp/eq cmp/ge cmp/gt cmp/hi cmp/hs cmp/pl cmp/pz cmp/str div0s div0u div1 dmuls.l dmulu.l dt exts.b exts.w extu.b extu.w jmp jsr ldc ldc.l lds lds.l mac.l mac.w mov mov.b mov.l mov.w mova movt mul.l muls.w mulu.w neg negc nop not or or.b rotcl rotcr rotl rotr rte rts sett shal shar shll shll16 shll2 shll8 shlr shlr16 shlr2 shlr8 sleep stc stc.l sts sts.l sub subc subv swap.b swap.w tas.b trapa tst tst.b xor xor.b xtrct  
 86 | > If there are any issues edit registers.h before proceeding.  
 87 | 
 88 | 3) Manually verify that the registers and mnemonics lists are correct. You can use the `--additional-registers` command line option to add missing registers. On some architectures you may need to remove registers from registers.h and re-compile. **If the registers/mnemonics are incorrect Generator will not work**.
 89 | 4) Now you are ready to run Generator: `./generator --input-disassembly instructions.txt --processor-name MyProcessor --processor-family ProcessorFamily --endian big --alignment 2`. If all goes well Generator should create a "MyProcessor" directory with all of the required files.
 90 | 5) Verify that the created processor module directory is valid and compiles with Ghidra's SLEIGH compiler. The SLEIGH compiler script can be found in `ghidra/support/`. Run `sleigh -a <path_to_MyProcessorFamily_dir>`. There should be warnings about unimplemented p-code instructions but otherwise there should be no issues. If the compilation step fails, please submit an issue and upload your instructions.txt file and I will take a look at it.  
 91 | 
 92 | Ex:  
 93 | > <path_to_ghidra>/ghidra/support/sleigh -a MyProcFamily/  
 94 | > Compiling MyProcFamily/data/languages/MyProc.slaspec:  
 95 | > WARN  187 NOP constructors found (SleighCompile)  
 96 | > WARN  Use -n switch to list each individually (SleighCompile)  
 97 | >  
 98 | > 1 languages successfully compiled  
 99 | 
100 | 6) Now that you've compiled your processor module, you can run `generator-validator` to disassemble your input file and diff the results. This will help you find which instructions require modifications. Run with: `./generator-validator --input-file examples/sh2.txt --sla-file MyProcFamily/data/languages/MyProc.sla --output-file output.txt`. Diff the input file and the output file to find issues. If you find issues, manually correct the .slaspec and recompile with Ghidra's sleigh compiler.  
101 | 7) If the processor successfully compiled you should be able to copy your MyProcessor directory to `<path_to_ghidra>/Ghidra/Processors/` directory. When you restart Ghidra your new processor should be listed. Make sure you open your binary as "raw" and manually select your processor module.  
102 | 
103 | ### Usage (4 Byte ISAs)
104 | A 4-byte ISA is too large for Generator to store in memory. To work around this, we split the input disassembly file into multiple input files and run Generator multiple times. The steps involved look like this:
105 | 
106 | 1) Create 256 newline delimited text files that contain a list of all valid hex opcodes and instructions. Each file should be 1/256th of the total instruction set, or approximately 16 million lines. 
107 | 2) Run Generator on the 256 input text files, creating 256 .slaspec files
108 | 3) Use Ghidra's SLEIGH compiler to compile the 256 .slaspec files into 256 .sla files
109 | 4) Re-run Generator, but with the 256 .sla files as input to combine them into a single .slaspec file
110 | 5) Copy the created processor module directory to your Ghidra/Processors directory
111 | 6) Launch Ghidra. Your processor will show up in the list of supported processors
112 | 
113 | 1) Create 256 newline delimited texts that each contain 1/256th of the ISA. As before exclude any invalid instructions. Again the opcode must begin with 0x and must be byte aligned.  
114 | 2) Run Generator with the `--print-registers-only` flag: `generator --input-disassembly-dir examples/split --print-registers-only`. This will parse the instructions in all the text files in the "examples/split" directory and will print out only the registers. Verify the output is correct before proceeding. Depending on how many files are present and the size of each file this can take a significant amount of time. In the examples/split directory there are two SH-2 files that will be combined.  
115 | 
116 | Ex:  
117 | 
118 | > ./generator --input-disassembly-dir examples/split --print-registers-only  
119 | > Ghidra Processor Module Generator  
120 | > [\*] Using 16 worker thread(s)  
121 | > [\*] Initializing default Ghidra registers  
122 | > [\*] Parsing instructions examples/split/sh2_1.txt  
123 | >   [\*] Updating bit length from 0 to 16  
124 | > [\*] Parsed 26872 instructions  
125 | > [\*] Freeing parser data  
126 | > [\*] Parsing instructions examples/split/sh2_2.txt  
127 | > [\*] Parsed 26880 instructions  
128 | > [\*] Freeing parser data  
129 | > [\*] Found registers: gbr mach macl pc pr r0 r1 r10 r11 r12 r13 r14 r15 r2 r3 r4 r5 r6 r7 r8 r9 sr vbr  
130 | > [\*] Found mnemonics: # ( ) + , - @ add addc addv and and.b bf bf/s bra braf bsr bsrf bt bt/s clrmac clrt cmp/eq cmp/ge cmp/gt cmp/hi cmp/hs cmp/pl cmp/pz cmp/str div0s div0u div1 dmuls.l dmulu.l dt exts.b exts.w extu.b extu.w jmp jsr ldc ldc.l lds lds.l mac.l mac.w mov mov.b mov.l mov.w mova movt mul.l muls.w mulu.w neg negc nop not or or.b rotcl rotcr rotl rotr rte rts sett shal shar shll shll16 shll2 shll8 shlr shlr16 shlr2 shlr8 sleep stc stc.l sts sts.l sub subc subv swap.b swap.w tas.b trapa tst tst.b xor xor.b xtrct  
131 | > If there are any issues edit registers.h before proceeding.  
132 | > [\*] Freeing parser data  
133 | 
134 | 3) Manually verify that the registers and mnemonics lists are correct. You can use the `--additional-registers` command line option to add missing registers. On some architectures you may need to remove registers from registers.h and re-compile. **If the registers/mnemonics are incorrect Generator will not work**.
135 | 4) Now you are ready to run Generator: `./generator --input-disassembly-dir examples/split --processor-name MyProc --processor-family MyProcFamily --endian big --alignment 2`. If all goes well Generator should create a "MyProcFamily" directory with a .slaspec file for each of the input disassembly text files.
136 | 5) Verify that the created processor module directory is valid and compiles with Ghidra's SLEIGH compiler. The SLEIGH compiler script can be found in `ghidra/support/`. Run `sleigh -a -y <path_to_MyProcessorFamily_dir>`. **You must use the -y flag as it forces the SLEIGH compiler to output in the legacy XML format. This is required for the next step.** There should be warnings about unimplemented p-code instructions but otherwise there should be no issues. If the compilation step fails, please submit an issue and upload your instructions.txt file and I will take a look at it. When using examples/split it should successfully compile two languages, one for each input file.  
137 | 
138 | Ex:  
139 | > <path_to_ghidra>/ghidra/support/sleigh -a -y MyProcFamily/  
140 | > Compiling MyProcFamily/data/languages/MyProc.slaspec:  
141 | > WARN  104 NOP constructors found (SleighCompile)  
142 | > WARN  Use -n switch to list each individually (SleighCompile)  
143 | >
144 | > WARN  30 NOP constructors found (SleighCompile)  
145 | > WARN  Use -n switch to list each individually (SleighCompile)  
146 | >  
147 | > 2 languages successfully compiled  
148 | 
149 | 6) Step 5 should create one .sla file for each input language. Copy those .sla (not .slaspec) files into a separate directory
150 | 7) Re-run Generator, but supplying the .sla directory as input: `./generator --input-sleigh-dir intermediate --processor-name SH2 --processor-family SuperH --endian big --alignment 2`. If all goes well Generator will parse and combine all of the .sla files into a single "SuperH" directory with all of the required files.
151 | 8) Verify that the created processor module directory is valid and compiles with Ghidra's SLEIGH compiler. The SLEIGH compiler script can be found in `ghidra/support/`. Run `sleigh -a <path_to_MyProcessorFamily_dir>`. There should be warnings about unimplemented p-code instructions but otherwise there should be no issues. If the compilation step fails, please submit an issue and upload your instructions.txt file and I will take a look at it.  
152 | 9) Now that you've compiled your processor module, you can run `generator-validator` to disassemble your input file and diff the results. This will help you find which instructions require modifications. Run with: `./generator-validator --input-file examples/sh2.txt --sla-file MyProcFamily/data/languages/MyProc.sla --output-file output.txt`. Diff the input file and the output file to find issues. If you find issues, manually correct the .slaspec and recompile with Ghidra's sleigh compiler.  
153 | 10) If the processor successfully compiled you should be able to copy your MyProcessor directory to `<path_to_ghidra>/Ghidra/Processors/` directory. When you restart Ghidra your new processor should be listed. Make sure you open your binary as "raw" and manually select your processor module.  
154 | 
155 | ### Troubleshooting
156 | The most important step in troubleshooting is verifying that the output of --print-registers-only is correct. The registers list must contain only registers and no mnemonics or instruction components. The mnemonics list must not contain any registers. To remedy this you can manually add/remove registers from registers.h or use the --additional-registers command line option to add missing registers. 
157 | 
158 | There are also register names that are also mnemonics. For example: "b" can be the “b” register or a “branch” instruction, "lsr" can be “line shift register” or “logical shift right” instruction. It's important that Generator be told what's a register if it can't figure it out on its own. 
159 | 
160 | While most instructions will parse with Generator, there are certain types of instructions that won't merge properly. For example in ARM:
161 | 
162 | > 0x9eb4 push {r1, r2, r3, r4, r7}  
163 | > 0x9fb4 push {r0, r1, r2, r3, r4, r7}  
164 | > 0xa0b4 push {r5, r7}  
165 | 
166 | There is a variable list of registers depending on the instruction. Unfortunately this breaks Generator's merging algorithm and must be implemented by hand. I would recommend dropping such instructions from the input disassembly. 
167 | 
168 | ## Manual Next Steps (See Existing Processors for Examples):
169 | Now that you have verified Ghidra can load your processor module you can begin implementing p-code and other changes to get the decompiler to work.
170 | 
171 | 1) Edit the .pspec, .cspec, .ldefs files.  
172 | 2) Edit the .slaspec file. Rename registers to make more sense. If an instruction uses an immediate and modifies it before displaying you will have to edit the instruction.  
173 | 3) Implement p-code for all of the instructions in the .slaspec file to get decompiler support.  
174 | 
175 | ## Issues
176 | * Not all instruction sets are compatible
177 | * Display fields for immediates aren't handled. generator-validator will help show you these. 
178 | * Doesn't work with PC relative addressing. Will require manual fix-ups. generator-validator will help show you these. 
179 | * Won't work on instruction sets where bitfields are not contiguous. For example, if bits 0-2 and 4-6 are combined to compute an immediate value.
180 | * Not tested with floating point
181 | 
182 | Please attach your input file when creating an issue.
183 | 
184 | ## Future Work
185 | * Add support for specifying bit patterns as input
186 | 
187 | ## Build
188 | `make generator`  
189 | `make generator-validator GHIDRA_TRUNK=<path_to_Ghidra_trunk>` (requires Ghidra's decompiler headers and libsla.a. GHIDRA_TRUNK points to a clone of Ghidra from trunk, not a release build of Ghidra)
190 | 
191 | ### Build Dependencies
192 | libboost >= 1.76 is required  
193 | libboost-dev  
194 | libboost-filesystem-dev  
195 | libboost-program-options-dev  
196 | libboost-regex-dev  
197 | libboost-system-dev  
198 | libboost-thread-dev  
199 | libboost-timer-dev  
200 | libsla.a (only needed for generator-validator)
201 | 
202 | #### Building libsla.a
203 | If you want to use generator-validator to validate your processor module against your input file, you will need to build Ghidra's libsla.
204 | 1) Check out Ghidra from trunk. A release build of Ghidra is not sufficient. `git clone https://github.com/NationalSecurityAgency/ghidra` The directory created by this clone is your GHIDRA_TRUNK directory. 
205 | 2) CD to the decompiler source directory: `cd ~/ghidra/Ghidra/Features/Decompiler/src/decompile/cpp`
206 | 3) Compile: `make libsla.a`
207 | 
208 | ## License
209 | Licensed under the Apache 2.0 license. See LICENSE.
210 | 


--------------------------------------------------------------------------------
/bitspan.cpp:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: bitspan.cpp
 3 | //
 4 | // Calculate the longest span of bits that can be combined in an instruction
 5 | // opcode
 6 | //
 7 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 8 | // Licensed under the Apache 2.0 License.
 9 | //-----------------------------------------------------------------------------
10 | #include "bitspan.h"
11 | 
12 | // Resets bitSpan to its empty state: zero length, no replacement character, no recorded difference position, no zero bit seen.
13 | void initBitSpan(BITSPAN& bitSpan)
14 | {
15 |     bitSpan.length = 0;
16 |     bitSpan.replacementChar = '\0'; // '\0' means "no replacement character chosen yet"
17 |     bitSpan.differencePosition = -1; // -1 is the "not set" sentinel
18 |     bitSpan.bitPos = 0;
19 |     bitSpan.hasZero = false;
20 | }
21 | 
22 | // Grows the bit span by one bit; no other field of bitSpan is touched.
23 | void incrementBitSpan(BITSPAN& bitSpan)
24 | {
25 |     bitSpan.length++;
26 | }
27 | 
28 | // Copies curr into longest when curr is strictly longer AND usable, i.e. it
29 | // contains a zero bit and has a replacement character. Otherwise longest is left unchanged.
30 | void updateLongestBitSpan(BITSPAN& curr, BITSPAN& longest)
31 | {
32 |     if(curr.length <= longest.length)
33 |     {
34 |         return;
35 |     }
36 | 
37 |     // a span is only usable if it recorded a 0 bit that can be flipped to 1
38 |     if(curr.hasZero == false)
39 |     {
40 |         // longer span but no zero bit was recorded in it
41 |         return;
42 |     }
43 | 
44 |     if(curr.replacementChar == '\0')
45 |     {
46 |         // no replacement character was chosen for this span; should never happen
47 |         return;
48 |     }
49 | 
50 |     // curr is longer and usable: it becomes the new longest span (whole-struct copy)
51 |     longest = curr;
52 |     return;
53 | }
54 | 
55 | // Writes replacementChar at bitString[pos] unconditionally, then walks the count characters
56 | // immediately before pos and overwrites each one that is '0' or '1'; other characters are kept.
57 | void replacesBitsFromSpan(std::string& bitString,
58 |                           unsigned int pos,
59 |                           unsigned int count,
60 |                           char replacementChar)
61 | {
62 |     bitString[pos] = replacementChar; // no bounds check: pos must be a valid index
63 | 
64 |     for(unsigned int i = 0; i < count; i++)
65 |     {
66 |         if(bitString[pos - i - 1] == '0' || bitString[pos - i - 1] == '1') // unsigned arithmetic: wraps if count > pos
67 |         {
68 |             bitString[pos - i - 1] = replacementChar;
69 |         }
70 |     }
71 | }
72 | 


--------------------------------------------------------------------------------
/bitspan.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: bitspan.h
 3 | //
 4 | // Calculate the longest span of bits that can be combined in an instruction
 5 | // opcode
 6 | //
 7 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 8 | // Licensed under the Apache 2.0 License.
 9 | //-----------------------------------------------------------------------------
10 | #pragma once
11 | 
12 | #include <string>
13 | 
14 | // Describes a run of adjacent bits in an opcode bit string that can be
15 | // combined into a single field
16 | typedef struct _BITSPAN
17 | {
18 |     unsigned int length; // number of bits in bitspan
19 |     char replacementChar; // character written over the combined bit; '\0' when none is chosen
20 |     unsigned int bitPos; // index in the bit string of the '0' bit recorded for this span
21 |     int differencePosition; // position of the differing instruction component; -1 when not set
22 |     bool hasZero; // true once a '0' bit has been recorded in bitPos
23 | } BITSPAN, *PBITSPAN;
24 | 
25 | void initBitSpan(BITSPAN& bitSpan);
26 | void incrementBitSpan(BITSPAN& bitSpan);
27 | void updateLongestBitSpan(BITSPAN& curr, BITSPAN& longest);
28 | void replacesBitsFromSpan(std::string& bitString,
29 |                           unsigned int pos,
30 |                           unsigned int count,
31 |                           char replacementChar);
32 | 


--------------------------------------------------------------------------------
/combine.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: combine.cpp
  3 | //
  4 | // Combining instructions
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //------------------------------------------------------------------------------
  9 | #include <boost/timer/timer.hpp>
 10 | #include <boost/thread/thread.hpp>
 11 | #include <boost/asio/thread_pool.hpp>
 12 | #include <boost/asio/post.hpp>
 13 | #include "combine.h"
 14 | #include "bitspan.h"
 15 | #include "thread_pool.h"
 16 | 
 17 | static bool compareInstructionCombine(const INSTRUCTION_COMBINE& a,
 18 |                                       const INSTRUCTION_COMBINE& b);
 19 | static bool areInstructionsCombinable(Instruction& a, Instruction& b,
 20 |                                       char& replacementChar,
 21 |                                       int& differencePosition);
 22 | static void combineInstructionsWorker(PARSED_DATA& parsedData,
 23 |                                       const string& curBitString,
 24 |                                       Instruction* instruction,
 25 |                                       set<INSTRUCTION_COMBINE, decltype(compareInstructionCombine)*>& combinedInstructions,
 26 |                                       unordered_map<string, unsigned int>& visitedInstructions);
 27 | 
 28 | // Set of candidate combines, ordered by compareInstructionCombine. Worker threads merge
 29 | // their results into it; only the scheduling thread applies them to parsedData.combinedInstructions
 30 | static set<INSTRUCTION_COMBINE,  decltype(compareInstructionCombine)*> g_TempCombinedInstructions(compareInstructionCombine);
 31 | 
 32 | // guards every access to the g_TempCombinedInstructions set
 33 | static boost::mutex g_TempCombinedInstructionsMutex;
 34 | 
 35 | // Strict-weak-ordering comparator for sets of INSTRUCTION_COMBINE.
 36 | // Returns true when a must sort before b.
 37 | // Ordering:
 38 | // - larger length first (more bits in the bit span)
 39 | // - then ascending opcodeA, then ascending opcodeB; equal on all three means "equivalent"
 40 | static bool compareInstructionCombine(const INSTRUCTION_COMBINE& a,
 41 |                                       const INSTRUCTION_COMBINE& b)
 42 | {
 43 |     if(a.length != b.length)
 44 |     {
 45 |         // comparison is deliberately reversed: iteration over the set
 46 |         // then visits the longest spans first
 47 |         return a.length > b.length;
 48 |     }
 49 | 
 50 |     if(a.opcodeA != b.opcodeA)
 51 |     {
 52 |         return a.opcodeA < b.opcodeA;
 53 |     }
 54 | 
 55 |     if(a.opcodeB != b.opcodeB)
 56 |     {
 57 |         return a.opcodeB < b.opcodeB;
 58 |     }
 59 | 
 60 |     return false; // all three keys equal: neither sorts before the other
 61 | }
 62 | 
 63 | // Returns true if a and b can be combined; on success replacementChar names the merged bit ('*' for duplicates, else a component letter).
 64 | static bool areInstructionsCombinable(Instruction& a,
 65 |                                       Instruction& b,
 66 |                                       char& replacementChar,
 67 |                                       int& differencePosition)
 68 | {
 69 |     bool isEqual = false;
 70 | 
 71 |     if(a.getOpcode().length() != b.getOpcode().length())
 72 |     {
 73 |         // opcodes of different lengths cannot be compared bit for bit;
 74 |         // callers are not expected to ever pass such a pair
 75 |         cout << "Attempting to combine different instruction length sizes!!" << endl;
 76 |         throw 1;
 77 |         return false; // unreachable: follows the throw above
 78 |     }
 79 | 
 80 |     for(unsigned int j = 0; j < COMBINE_MAX; j++) // try each combine strategy in enum order; first match wins
 81 |     {
 82 |         switch(j)
 83 |         {
 84 |             case COMBINE_DUPLICATES:
 85 |                 isEqual = a.areInstructionComponentsEqual(&b);
 86 |                 if(isEqual)
 87 |                 {
 88 |                     replacementChar = '*'; // differencePosition is not written on this path
 89 |                     return true;
 90 |                 }
 91 |                 break;
 92 |             case COMBINE_IMMEDIATES:
 93 |                 isEqual = a.areInstructionComponentsEqualExceptImmediate(&b, &differencePosition);
 94 |                 if(isEqual == false) // fall back to the negative-sign comparison for immediates
 95 |                 {
 96 |                     isEqual = a.areInstructionComponentsEqualExceptNegativeSign(&b, &differencePosition, TYPE_IMMEDIATE);
 97 |                 }
 98 | 
 99 |                 if(isEqual)
100 |                 {
101 |                     replacementChar = a.getComponentLetterFromPosition(TYPE_IMMEDIATE, differencePosition);
102 |                     return true;
103 |                 }
104 |                 break;
105 |             case COMBINE_REGISTERS:
106 |                 isEqual = a.areInstructionComponentsEqualExceptRegister(&b, &differencePosition);
107 |                 if(isEqual)
108 |                 {
109 |                     replacementChar = a.getComponentLetterFromPosition(TYPE_REGISTER, differencePosition);
110 |                     return true;
111 |                 }
112 |                 break;
113 |             default:
114 |                 // BUGBUG: handle errors gracefully
115 |                 cout << "[-] Invalid combine type specified!!" << endl;
116 |                 return false;
117 |         }
118 |     }
119 | 
120 |     return false; // no strategy matched
121 | }
122 | 
123 | // Walks every bit of curBitString looking for the longest run of adjacent
124 | // bits that can be folded into one field (both the all-0 and all-1 variants
125 | // must exist in parsedData.combinedInstructions and be combinable). If a run is
126 | // found, a new combined Instruction is allocated and added to combinedInstructions.
127 | static void combineInstructionsWorker(PARSED_DATA& parsedData,
128 |                                       const string& curBitString,
129 |                                       Instruction* instruction,
130 |                                       set<INSTRUCTION_COMBINE, decltype(compareInstructionCombine)*>& combinedInstructions,
131 |                                       unordered_map<string, unsigned int>& visitedInstructions)
132 | {
133 |     BITSPAN longestBitSpan = {0, '\0', 0, -1, false};
134 |     BITSPAN curBitSpan = {0, '\0', 0, -1, false};
135 |     // both spans start in the same empty state that initBitSpan() produces
136 | 
137 |     // loop through each bit of the current instruction
138 |     for(unsigned int i = 0; i < curBitString.length(); i++)
139 |     {
140 |         map<string, Instruction*>:: iterator zeroItr;
141 |         map<string, Instruction*>:: iterator oneItr;
142 |         string zeroBitString;
143 |         string oneBitString;
144 |         bool isEqual = false;
145 |         bool hasZero = false;
146 |         char replacementChar = '\0';
147 |         int differencePosition = -1;
148 | 
149 |         if(curBitString[i] != '0' && curBitString[i] != '1')
150 |         {
151 |             // this bit has already been combined
152 |             // check if it increases our span
153 |             if(curBitString[i] == curBitSpan.replacementChar)
154 |             {
155 |                 incrementBitSpan(curBitSpan);
156 |             }
157 |             else
158 |             {
159 |                 // we are starting a new bit span
160 |                 updateLongestBitSpan(curBitSpan, longestBitSpan);
161 |                 initBitSpan(curBitSpan);
162 |                 curBitSpan.length = 1;
163 |                 curBitSpan.replacementChar = curBitString[i];
164 |             }
165 | 
166 |             // this is already a combined bit, no need to do more work
167 |             continue;
168 |         }
169 | 
170 |         zeroBitString = curBitString;
171 |         oneBitString = curBitString;
172 | 
173 |         // create two opcode bit strings:
174 |         // - replace all bits in the span with 0s
175 |         // - replace all bits in the span with 1s
176 |         // both new opcode bit strings must be present and combinable
177 |         // for us to increase our bitspan count
178 |         replacesBitsFromSpan(zeroBitString, i, curBitSpan.length, '0');
179 |         replacesBitsFromSpan(oneBitString, i, curBitSpan.length, '1');
180 | 
181 |         if(curBitString[i] == '0')
182 |         {
183 |             hasZero = true;
184 |         }
185 | 
186 |         // look up the all-zero variant of the span; without it there is
187 |         // nothing to combine at this bit
188 |         zeroItr = parsedData.combinedInstructions.find(zeroBitString);
189 |         if(zeroItr == parsedData.combinedInstructions.end())
190 |         {
191 |             // didn't find an adjacent instruction; if a span was in progress,
192 |             // step back so this bit is retried as the start of a fresh span
193 |             if(curBitSpan.length > 0)
194 |             {
195 |                 i -= 1;
196 |             }
197 |             updateLongestBitSpan(curBitSpan, longestBitSpan);
198 |             initBitSpan(curBitSpan);
199 |             continue;
200 |         }
201 | 
202 |         oneItr = parsedData.combinedInstructions.find(oneBitString);
203 |         if(oneItr == parsedData.combinedInstructions.end())
204 |         {
205 |             // didn't find an adjacent instruction; same retry logic as above
206 |             if(curBitSpan.length > 0)
207 |             {
208 |                 i -= 1;
209 |             }
210 | 
211 |             updateLongestBitSpan(curBitSpan, longestBitSpan);
212 |             initBitSpan(curBitSpan);
213 |             continue;
214 |         }
215 | 
216 |         //
217 |         // We have a candidate adjacent instruction, check if they are 
218 |         // combinable
219 |         //
220 |         isEqual = areInstructionsCombinable(*zeroItr->second,
221 |                                             *oneItr->second,
222 |                                             replacementChar,
223 |                                             differencePosition);
224 | 
225 |         // TODO: review this logic
226 |         if(!isEqual)
227 |         {
228 |             // no match
229 |             updateLongestBitSpan(curBitSpan, longestBitSpan);
230 |             initBitSpan(curBitSpan);
231 |             continue;
232 |         }
233 | 
234 |         // instructions are combinable but under a different replacement
235 |         // char: close the current span and start a new one-bit span here
236 |         if(replacementChar != curBitSpan.replacementChar)
237 |         {
238 |             updateLongestBitSpan(curBitSpan, longestBitSpan);
239 |             initBitSpan(curBitSpan);
240 |             incrementBitSpan(curBitSpan);
241 | 
242 |             if(hasZero)
243 |             {
244 |                 curBitSpan.hasZero = true;
245 |                 curBitSpan.bitPos = i;
246 |             }
247 |             curBitSpan.replacementChar = replacementChar;
248 |             continue;
249 |         }
250 | 
251 |         if(isEqual)
252 |         {
253 |             if(hasZero && curBitSpan.hasZero == false)
254 |             {
255 |                 curBitSpan.hasZero = true;
256 |                 curBitSpan.bitPos = i;
257 |                 curBitSpan.replacementChar = replacementChar;
258 |             }
259 | 
260 |             if(curBitSpan.differencePosition == -1)
261 |             {
262 |                 curBitSpan.differencePosition = differencePosition;
263 |             }
264 |         }
265 | 
266 |         incrementBitSpan(curBitSpan);
267 | 
268 |     } // for(unsigned int i = 0; i < curBitString.length(); i++)
269 | 
270 |     updateLongestBitSpan(curBitSpan, longestBitSpan);
271 | 
272 |     // if longestBitSpan.length is non-zero that means:
273 |     // - we found at least one bit span to combine
274 |     // - this is the longest one
275 |     if(longestBitSpan.length > 0)
276 |     {
277 |         // two instructions to delete
278 |         // new instruction to insert
279 |         unordered_map<string, unsigned int>::iterator itr;
280 |         INSTRUCTION_COMBINE newCombine;
281 |         string tempBitString;
282 | 
283 |         tempBitString = curBitString;
284 |         tempBitString[longestBitSpan.bitPos] = '1';
285 | 
286 |         // check if we already have a better match
287 |         itr = visitedInstructions.find(tempBitString);
288 |         if(itr != visitedInstructions.end())
289 |         {
290 |             // we have seen this opcode already, check if our current span is better or worse
291 |             if(longestBitSpan.length > itr->second)
292 |             {
293 |                 // better span: overwrite in place, insert() would not replace an existing key
294 |                 itr->second = longestBitSpan.length;
295 |             }
296 |             else
297 |             {
298 |                 // this new span is worse, ignore it
299 |                 return;
300 |             }
301 |         }
302 |         else
303 |         {
304 |             // we haven't seen this opcode yet, add it in
305 |             visitedInstructions.insert({{tempBitString, longestBitSpan.length}});
306 |         }
307 | 
308 |         // instructions are equal, combine them
309 |         newCombine.length = longestBitSpan.length;
310 |         newCombine.instruction = new Instruction();
311 |         *newCombine.instruction = *instruction;
312 | 
313 |         newCombine.opcodeA = curBitString;
314 |         newCombine.opcodeB = tempBitString;
315 | 
316 |         tempBitString[longestBitSpan.bitPos] = longestBitSpan.replacementChar;
317 | 
318 |         //cout << "MATCH " << longestBitSpan.length << " " << longestBitSpan.replacementChar << " " << newCombine.opcodeA << " " << newCombine.opcodeB << " " << tempBitString << endl;
319 | 
320 |         newCombine.instruction->setOpcodeBitString(tempBitString);
321 |         newCombine.instruction->setCombined(true);
322 |         newCombine.instruction->setNeedsFree(true);
323 | 
324 |         if(longestBitSpan.differencePosition != -1)
325 |         {
326 |             newCombine.instruction->setComponentPositionCombined(longestBitSpan.differencePosition);
327 |         }
328 | 
329 |         // insert our newly created instruction into the caller's set; it is
330 |         // sorted by bit span length so the optimal combines are the first
331 |         // ones merged into parsedData.combinedInstructions
332 |         combinedInstructions.insert(std::move(newCombine));
333 |     }
334 | 
335 |     return;
336 | }
337 | 
338 | static int combineInstructionsThread(PARSED_DATA& parsedData, // thread-pool job: runs the combine worker over map entries start..end
339 |                                      unsigned long long start, // index of the first map entry to process
340 |                                      unsigned long long end) // index of the last map entry to process (inclusive)
341 | {
342 |     map<string, Instruction*>::iterator startItr = parsedData.combinedInstructions.begin();
343 |     map<string, Instruction*>::iterator endItr = parsedData.combinedInstructions.begin();
344 |     set<INSTRUCTION_COMBINE, decltype(compareInstructionCombine)*> combinedInstructions(compareInstructionCombine); // this job's private results
345 |     unordered_map<string, unsigned int> visitedInstructions; // per-job bookkeeping handed to the worker
346 | 
347 |     if(start >= parsedData.combinedInstructions.size() ||
348 |        end >= parsedData.combinedInstructions.size())
349 |     {
350 |         cout << "Bad sizes!!\n";
351 |         cout << start << " " << end << " " << parsedData.combinedInstructions.size() << endl;
352 |         throw 1;
353 |     }
354 | 
355 |     if(start > end)
356 |     {
357 |         cout << "Bad sizes 2 !!\n";
358 |         cout << start << " " << end << " " << parsedData.combinedInstructions.size() << endl;
359 |         throw 2;
360 |     }
361 | 
362 |     std::advance(startItr, start);
363 |     endItr = startItr;
364 |     std::advance(endItr, end - start + 1); // one past the entry at index end
365 | 
366 |     for(; startItr != endItr; startItr++)
367 |     {
368 |         combineInstructionsWorker(parsedData,
369 |                                   startItr->first,
370 |                                   startItr->second,
371 |                                   combinedInstructions,
372 |                                   visitedInstructions);
373 |     }
374 | 
375 |     g_TempCombinedInstructionsMutex.lock(); // publish this job's results into the shared set under the mutex
376 |     g_TempCombinedInstructions.merge(combinedInstructions);
377 |     g_TempCombinedInstructionsMutex.unlock();
378 | 
379 |     incrementWorkerCompletions(); // the scheduler polls this counter; not reached if a check above throws
380 |     return 0;
381 | }
382 | 
383 | // Runs one combine pass: splits the instruction map into index ranges, runs them on a thread pool, then applies the resulting combines. Returns 0 if nothing was combined, 1 otherwise.
384 | static unsigned int combineInstructionsScheduler(PARSED_DATA& parsedData)
385 | {
386 |     boost::asio::thread_pool threadPool(parsedData.numThreads);
387 |     unsigned long long numInstructions = 0;
388 |     unsigned long long portionSize = 0;
389 |     unsigned long long start = 0;
390 |     unsigned int submissions = 0;
391 | 
392 |     resetThreadPool();
393 | 
394 |     //
395 |     // split the instructions into 1/num threads pieces
396 |     //
397 |     numInstructions = parsedData.combinedInstructions.size();
398 |     portionSize = numInstructions/parsedData.numThreads;
399 | 
400 |     if(portionSize == 0)
401 |     {
402 |         // we can end up with a 0 portionSize if numThreads > numInstructions
403 |         portionSize = 1;
404 |     }
405 | 
406 |     for(unsigned int i = 0; i < parsedData.numThreads; i++)
407 |     {
408 |         unsigned long long end = 0;
409 | 
410 |         start = i * portionSize;
411 | 
412 |         if(i == parsedData.numThreads - 1)
413 |         {
414 |             // last thread, always takes everything up to the final instruction
415 |             end = numInstructions - 1;
416 |         }
417 |         else
418 |         {
419 |             end = start + portionSize - 1;
420 |         }
421 | 
422 |         if(start >= numInstructions)
423 |         {
424 |             continue; // more threads than instructions: nothing left for this one
425 |         }
426 | 
427 |         // queue a worker to work on 1/n of the disassembly
428 |         boost::asio::post(threadPool,
429 |                           boost::bind(combineInstructionsThread,
430 |                                       boost::ref(parsedData),
431 |                                       start,
432 |                                       end));
433 |         submissions++;
434 |     }
435 | 
436 |     // wait for threads
437 |     // TODO: improve poll logic
438 |     while(1)
439 |     {
440 |         boost::this_thread::sleep(boost::posix_time::milliseconds(100));
441 |         unsigned int completedCount = getWorkerCompletions();
442 | 
443 |         // check if we finished our submitted jobs
444 |         if(completedCount >= submissions)
445 |         {
446 |             // finished
447 |             break;
448 |         }
449 |     }
450 | 
451 |     threadPool.join();
452 | 
453 |     // short-circuit exit if we didn't combine any instructions during this
454 |     // loop
455 |     if(g_TempCombinedInstructions.size() == 0)
456 |     {
457 |         //cout << "  [*] No instructions combined during pass. Short-circuiting" << endl;
458 |         return 0;
459 |     }
460 | 
461 |     //cout << "g_TempCombinedInstructions: " << g_TempCombinedInstructions.size() << endl;
462 | 
463 |     g_TempCombinedInstructionsMutex.lock();
464 | 
465 |     // Update parsedData.combinedInstructions with the newly created combined
466 |     // instructions. Remove two instructions for every one combined
467 |     // instruction we add back in. The set is visited longest span first.
468 |     for(auto currItr = g_TempCombinedInstructions.begin();
469 |         currItr != g_TempCombinedInstructions.end();
470 |         ++currItr)
471 |     {
472 |         // Verify both opcodeA and opcodeB are present. It's possible we removed
473 |         // one or both while applying an earlier (longer) combine
474 |         auto tempItr = parsedData.combinedInstructions.find(currItr->opcodeA);
475 |         if(tempItr == parsedData.combinedInstructions.end())
476 |         {
477 |             delete currItr->instruction; // stale candidate: free the instruction the worker allocated
478 |             continue;
479 |         }
480 | 
481 |         auto tempItr2 = parsedData.combinedInstructions.find(currItr->opcodeB);
482 |         if(tempItr2 == parsedData.combinedInstructions.end())
483 |         {
484 |             delete currItr->instruction; // stale candidate: free the instruction the worker allocated
485 |             continue;
486 |         }
487 | 
488 |         // We only delete instructions that were previously combined as they
489 |         // were allocated by the combine worker.
490 |         if(tempItr->second->getNeedsFree() == true)
491 |         {
492 |             delete tempItr->second;
493 |         }
494 |         parsedData.combinedInstructions.erase(tempItr);
495 | 
496 |         if(tempItr2->second->getNeedsFree() == true)
497 |         {
498 |             delete tempItr2->second;
499 |         }
500 |         parsedData.combinedInstructions.erase(tempItr2);
501 | 
502 |         // insert the new combined instruction, keyed by its combined opcode string
503 |         parsedData.combinedInstructions.emplace(currItr->instruction->getOpcode(),
504 |                                                 currItr->instruction);
505 |     }
506 | 
507 |     g_TempCombinedInstructions.clear();
508 |     g_TempCombinedInstructionsMutex.unlock();
509 | 
510 |     return 1;
511 | }
512 | 
513 | // Attempts to combine instructions into one. To combine two instructions into
514 | // one:
515 | // -- the opcodes must be one bit apart
516 | // -- the instructions must be identical (COMBINE_DUPLICATE)
517 | // -- the instructions must be identical except for an immediate field 
518 | //    (COMBINE_IMMEDIATE)
519 | // -- the instructions must be identical except for a register field
520 | //    (COMBINE_REGISTER)
521 | //
522 | // When we find two instructions to combine we must:
523 | // -- remove the first instruction from the combinedInstructions map
524 | // -- remove the second instruction from the combinedInstructions map
525 | // -- change the shared bit to another character such as:
526 | // ---- '*' for duplicates
527 | // ---- lowercase letter for immediates
528 | // ---- uppercase letter for registers
529 | // -- create a new combined instruction and add it to the combinedInstructions
530 | //    map
531 | // -- ensure that we have the best (AKA longest) combination possible
532 | //
533 | // Because we are inserting and deleting while iterating through the map we need to be
534 | // careful with our iterators
535 | //
536 | void combineInstructions(PARSED_DATA& parsedData)
537 | {
538 |     boost::timer::auto_cpu_timer t;
539 |     unsigned int result = 0;
540 | 
541 |     // worst case we must run this algorithm once for every bit in the opcode
542 |     // we have a short-circuit exit if we execute a pass without combining any 
543 |     // instructions
544 |     // TODO: is this still true with failures in combining??
545 |     for(unsigned int k = 0; k < parsedData.maxOpcodeBits; k++)
546 |     {
547 |         cout << "  [*] Pass: " << k << " Instructions: " << parsedData.combinedInstructions.size() << endl;
548 | 
549 |         result = combineInstructionsScheduler(parsedData);
550 |         if(result == 0)
551 |         {
552 |             // the scheduler combined nothing this pass, return early
553 |             return;
554 |         }
555 |     }
556 | 
557 |     return;
558 | }
559 | 


--------------------------------------------------------------------------------
/combine.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: combine.h
 3 | //
 4 | // Combining instructions
 5 | //
 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 7 | // Licensed under the Apache 2.0 License.
 8 | //-----------------------------------------------------------------------------
 9 | #pragma once
10 | 
11 | #include "parser.h"
12 | 
13 | // A candidate merge: two existing opcodes (opcodeA, opcodeB) and the single combined instruction that would replace them
14 | typedef struct _INSTRUCTION_COMBINE
15 | {
16 |     unsigned int length; // count of bits being combined
17 |     Instruction* instruction; // the combined instruction that replaces both originals
18 |     string opcodeA; // bit string of the first instruction being replaced
19 |     string opcodeB; // bit string of the second instruction being replaced
20 | } INSTRUCTION_COMBINE, *PINSTRUCTION_COMBINE;
21 | 
22 | void combineInstructions(PARSED_DATA& parsedData);
23 | 


--------------------------------------------------------------------------------
/examples/ethereum.txt:
--------------------------------------------------------------------------------
  1 | 0x00 STOP
  2 | 0x01 ADD
  3 | 0x02 MUL
  4 | 0x03 SUB
  5 | 0x04 DIV
  6 | 0x05 SDIV
  7 | 0x06 MOD
  8 | 0x07 SMOD
  9 | 0x08 ADDMOD
 10 | 0x09 MULMOD
 11 | 0x0a EXP
 12 | 0x0b SIGNEXTEND
 13 | 0x10 LT
 14 | 0x11 GT
 15 | 0x12 SLT
 16 | 0x13 SGT
 17 | 0x14 EQ
 18 | 0x15 ISZERO
 19 | 0x16 AND
 20 | 0x17 OR
 21 | 0x18 XOR
 22 | 0x19 NOT
 23 | 0x1a BYTE
 24 | 0x20 SHA3
 25 | 0x30 ADDRESS
 26 | 0x31 BALANCE
 27 | 0x32 ORIGIN
 28 | 0x33 CALLER
 29 | 0x34 CALLVALUE
 30 | 0x35 CALLDATALOAD
 31 | 0x36 CALLDATASIZE
 32 | 0x37 CALLDATACOPY
 33 | 0x38 CODESIZE
 34 | 0x39 CODECOPY
 35 | 0x3a GASPRICE
 36 | 0x3b EXTCODESIZE
 37 | 0x3c EXTCODECOPY
 38 | 0x40 BLOCKHASH
 39 | 0x41 COINBASE
 40 | 0x42 TIMESTAMP
 41 | 0x43 NUMBER
 42 | 0x44 DIFFICULTY
 43 | 0x45 GASLIMIT
 44 | 0x50 POP
 45 | 0x51 MLOAD
 46 | 0x52 MSTORE
 47 | 0x53 MSTORE8
 48 | 0x54 SLOAD
 49 | 0x55 SSTORE
 50 | 0x56 JUMP
 51 | 0x57 JUMPI
 52 | 0x58 PC
 53 | 0x59 MSIZE
 54 | 0x5a GAS
 55 | 0x5b JUMPDEST
 56 | 0x80 DUP1
 57 | 0x81 DUP2
 58 | 0x82 DUP3
 59 | 0x83 DUP4
 60 | 0x84 DUP5
 61 | 0x85 DUP6
 62 | 0x86 DUP7
 63 | 0x87 DUP8
 64 | 0x88 DUP9
 65 | 0x89 DUP10
 66 | 0x8a DUP11
 67 | 0x8b DUP12
 68 | 0x8c DUP13
 69 | 0x8d DUP14
 70 | 0x8e DUP15
 71 | 0x8f DUP16
 72 | 0x90 SWAP1
 73 | 0x91 SWAP2
 74 | 0x92 SWAP3
 75 | 0x93 SWAP4
 76 | 0x94 SWAP5
 77 | 0x95 SWAP6
 78 | 0x96 SWAP7
 79 | 0x97 SWAP8
 80 | 0x98 SWAP9
 81 | 0x99 SWAP10
 82 | 0x9a SWAP11
 83 | 0x9b SWAP12
 84 | 0x9c SWAP13
 85 | 0x9d SWAP14
 86 | 0x9e SWAP15
 87 | 0x9f SWAP16
 88 | 0xa0 LOG0
 89 | 0xa1 LOG1
 90 | 0xa2 LOG2
 91 | 0xa3 LOG3
 92 | 0xa4 LOG4
 93 | 0xf0 CREATE
 94 | 0xf1 CALL
 95 | 0xf2 CALLCODE
 96 | 0xf3 RETURN
 97 | 0xf4 DELEGATE_CALL
 98 | 0xf5 CREATE2
 99 | 0xfa STATICCALL
100 | 0xfd REVERT
101 | 0xff SUICIDE
102 | 


--------------------------------------------------------------------------------
/instruction.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: instruction.h
 3 | //
 4 | // Instruction class definition
 5 | //
 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 7 | // Licensed under the Apache 2.0 License.
 8 | //-----------------------------------------------------------------------------
 9 | #pragma once
10 | 
11 | #include <boost/algorithm/string.hpp>
12 | #include <boost/regex.hpp>
13 | #include <set>
14 | #include "slautil/slautil.h"
15 | using namespace std;
16 | 
17 | enum InstructionComponentType // kind tag carried by each InstructionComponent
18 | {
19 |     TYPE_INSTRUCTION = 0, // presumably the mnemonic text itself -- TODO confirm
20 |     TYPE_REGISTER, // a register operand
21 |     TYPE_IMMEDIATE, // an immediate operand
22 |     TYPE_SIGNED_IMMEDIATE, // TODO: not used
23 |     TYPE_MAX, // Not a valid type, must be the last one
24 | };
25 | 
26 | class InstructionComponent // one typed piece of an instruction: a type tag plus its text
27 | {
28 |     public:
29 |         InstructionComponent(const InstructionComponentType newType, const string &newComponent);
30 |         InstructionComponent(const InstructionComponentType newType, const string &newComponent, bool isCombined);
31 | 
32 | 
33 |         // BUGBUG: should these really be public? InstructionComponent is treated more as a struct than as a class
34 |         InstructionComponentType type; // which kind of component this is
35 |         string component; // the component's text
36 |         string combinedComponent; // NOTE(review): presumably the text used once the component is combined -- confirm in instruction.cpp
37 |         bool isCombined; // combined flag; can be supplied through the three-argument constructor
38 | };
39 | 
// A single instruction: its opcode (kept as a binary string) together with
// the ordered list of components it is made of. Also declares the
// comparison helpers used when deciding whether two instructions can be
// combined, and the helpers used when emitting the .slaspec.
// The data members are currently public ("private:" is commented out).
class Instruction
{
    public:
        // gets and sets the opcode bitstring
        // NOTE(review): the parameter names look swapped relative to the
        // trailing comments (setOpcode takes the hex form, setOpcodeBitString
        // the binary form) - go by the comments, and confirm in the .cpp.
        string getOpcode(void);
        void setOpcode(const string &opcodeBitString); // opcode must be a hex string beginning with 0x
        void setOpcodeBitString(const string &newOpcode); // opcode is a binary string without a prefix

        // gets and sets the combined flag
        // BUGBUG: do I really need this??
        bool getCombined(void);
        void setCombined(bool isCombined);
        int setComponentPositionCombined(const unsigned int componentPosition);

        // gets and sets the needsFree flag
        // NOTE(review): presumably marks instructions whose owner must
        // delete them - confirm against the code that allocates them.
        bool getNeedsFree(void);
        void setNeedsFree(bool needsToBeFreed);

        // adds a new instruction component to the instruction
        void addComponent(const InstructionComponentType newType, const string &newComponent);
        void addComponent(const InstructionComponentType newType, const string &newComponent, bool isCombined);

        // helper functions for identifying combined bits by letters
        char getComponentLetterFromPosition(const InstructionComponentType type, const unsigned int componentPosition);
        unsigned int getComponentPositionFromLetter(const char componentLetter);

        // prints the instruction
        string printInstruction(set<string>& tokenInstructions);
        string getInstructionOutputString(bool getCombined, bool escapeDuplicateRegisters);
        int getInstructionDuplicatedRegisters(bool getCombined, map<string, unsigned int>& duplicatedRegisters);
        string getOpcodeOutputString(set<string>& tokenInstructions);

        // basic checks that the instruction is sane
        bool validateInstruction(void);

        // tests to check if two instruction can be combined
        // the "Except..." variants report where the difference is through
        // differencePosition
        bool areInstructionComponentsEqual(Instruction* right);
        bool areInstructionComponentsEqualExceptImmediate(Instruction* right, int* differencePosition);
        bool areInstructionComponentsEqualExceptNegativeSign(Instruction* right, int* differencePosition, InstructionComponentType componentType);
        bool areInstructionComponentsEqualExceptRegister(Instruction* right, int* differencePosition);

        // for creating the .slaspec
        void separateOpcode();
        int computeAttachVariables(map<string, Instruction*>& allInstructions, map<string, string>& attachVariables, vector<Slautil>& slas);
        int generateAttachedRegisters(string opcode, unsigned int regStart, unsigned int regEnd, map<string, Instruction*>& allInstructions, vector<Slautil>& slas, string& foundRegisters);

    //private:
        string opcode; // entire opcode of instruction in binary
        vector<string> splitOpcode; // opcode split into individual components
        vector<InstructionComponent> components; // the instruction broken up as components
        bool combined; // is the instruction combined or not
        bool needsFree; // see getNeedsFree()/setNeedsFree()
};
92 | 
// misc utility functions used by the Instruction class
// BUGBUG: should this be a part of the class??

// Converts one hex digit character to its integer value.
// NOTE(review): the result for a non-hex character is defined in the .cpp,
// which is not visible here - confirm before relying on it.
int convertHexNibbletoInteger(unsigned char x);

// Reports whether str is a "filler" instruction component.
// NOTE(review): what counts as filler is decided in the .cpp - confirm.
bool isInstructionComponentFiller(string& str);
97 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: main.cpp
  3 | //
  4 | // Handles command line argument parsing and calling the parsing, combining, 
  5 | // and output routines.
  6 | //
  7 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  8 | // Licensed under the Apache 2.0 License.
  9 | //-----------------------------------------------------------------------------
 10 | #include <iostream>
 11 | #include <boost/timer/timer.hpp>
 12 | #include <boost/program_options.hpp>
 13 | #include <boost/filesystem.hpp>
 14 | #include <boost/range/iterator_range.hpp>
 15 | #include <boost/thread.hpp>
 16 | #include "combine.h"
 17 | #include "parser.h"
 18 | #include "parser_sla.h"
 19 | #include "output.h"
 20 | using namespace std;
 21 | 
 22 | using namespace boost::filesystem;
 23 | 
// Forward declarations for the helpers defined below main().

// Parses every input .sla file, then combines and emits one processor module.
int generateFromSleigh(PARSED_DATA& parsedData,
                       bool printRegistersOnly,
                       bool skipInstructionCombining);
// Parses each input disassembly text file and emits a processor module per file.
int generateFromText(PARSED_DATA& parsedData,
                     bool printRegistersOnly,
                     bool skipInstructionCombining);
// Appends the files found in dirPath (filtered by extension, "*" = all) to
// parsedData.inputFilenames.
int readFilenamesFromDirectory(PARSED_DATA& parsedData,
                               const string& dirPath,
                               const string& extension);
 33 | 
 34 | int main(int argc, char *argv[])
 35 | {
 36 |     boost::program_options::options_description desc{"Ghidra Processor Module Generator"};
 37 |     boost::program_options::variables_map args;
 38 |     vector<string> additionalRegisters; // list of additional registers passed
 39 |                                         // in at the command line
 40 |     PARSED_DATA parsedData;
 41 |     bool skipInstructionCombining; // if set, skip attempting to combine
 42 |                                    // instructions. Useful for debugging
 43 |     bool printRegistersOnly; // if set parse the instruction set and only
 44 |                              // display the registers. Useful for debugging purposes.
 45 |     bool parseSleigh; // if set the input is .sla, not disassembly text
 46 |     string inputFilename;
 47 |     string inputDirectory;
 48 |     boost::timer::auto_cpu_timer t;
 49 |     int result = 0;
 50 | 
 51 |     parsedData.maxOpcodeBits = 0;
 52 |     skipInstructionCombining = false;
 53 |     printRegistersOnly = false;
 54 |     parseSleigh = false;
 55 | 
 56 |     cout << "Ghidra Processor Module Generator" << endl;
 57 | 
 58 |     //
 59 |     // command line arg parsing
 60 |     //
 61 | 
 62 |     try
 63 |     {
 64 |         desc.add_options()
 65 |             ("input-disassembly,i", boost::program_options::value<string>(&inputFilename), "Path to a newline delimited text file containing all opcodes and instructions for the processor module.")
 66 |             ("input-disassembly-dir", boost::program_options::value<string>(&inputDirectory), "Path to a directory with multiple newline delimited text files containing all opcodes and instructions for the processor module.")
 67 |             ("input-sleigh,s", boost::program_options::value<string>(&inputFilename), "Path to a XML .sla file containing all opcodes and instructions for the processor module.")
 68 |             ("input-sleigh-dir", boost::program_options::value<string>(&inputDirectory), "Path to a directory with multiple XML .sla files containing all opcodes and instructions for the processor module.")
 69 |             ("num-threads,t", boost::program_options::value<unsigned int>(&parsedData.numThreads), "Number of worker threads to use. Optional. Defaults to number of physical CPUs if not specified")
 70 |             ("processor-name,n",boost::program_options::value<string>(&parsedData.processorName)->default_value("MyProc"), "Name of the target processor. Defaults to \"MyProc\" if not specified")
 71 |             ("processor-family,f",boost::program_options::value<string>(&parsedData.processorFamily)->default_value("MyProcFamily"), "Name of the target processor's family. Defaults to \"MyProcFamily\" if not specified")
 72 |             ("endian,e", boost::program_options::value<string>(&parsedData.endianness)->default_value("big"), "Endianness of the processor. Must be either \"little\" or \"big\". Defaults to big if not specified")
 73 |             ("alignment,a", boost::program_options::value<unsigned int>(&parsedData.alignment)->default_value(1), "Instruction alignment of the processor. Defaults to 1 if not specified")
 74 |             ("bitness,b", boost::program_options::value<unsigned int>(&parsedData.bitness)->default_value(32), "Bitness of the processor. Defaults to 32 if not specified")
 75 |             ("print-registers-only", boost::program_options::bool_switch(&printRegistersOnly), "Only print parsed registers. Useful for debugging purposes. False by default")
 76 |             ("omit-opcodes", boost::program_options::bool_switch(&parsedData.omitOpcodes)->default_value(false), "Don't print opcodes in the outputted .sla file. False by default")
 77 |             ("omit-example-instructions", boost::program_options::bool_switch(&parsedData.omitExampleInstructions)->default_value(false), "Don't print example combined instructions in the outputted .sla file. False by default")
 78 |             ("skip-instruction-combining", boost::program_options::bool_switch(&skipInstructionCombining), "Don't combine instructions. Useful for debugging purposes. False by default")
 79 |             ("additional-registers,ar", boost::program_options::value<vector<string>>(&additionalRegisters)->multitoken(), "List of additional registers. Use this option if --print-registers-only is missing registers for your instruction set")
 80 |             ("help,h", "Help screen");
 81 | 
 82 |         store(parse_command_line(argc, argv, desc), args);
 83 |         notify(args);
 84 | 
 85 |         if(args.count("help") || argc == 1)
 86 |         {
 87 |             cout << desc << endl;
 88 |             return 0;
 89 |         }
 90 | 
 91 |         if(parsedData.endianness != "big" && parsedData.endianness != "little")
 92 |         {
 93 |             cout << "Processor endianness must be either little or big" << endl;
 94 |             return -1;
 95 |         }
 96 | 
 97 |         // make sure exactly one input method is specified by the user
 98 |         int inputFlagCount = args.count("input-disassembly") +
 99 |                              args.count("input-disassembly-dir") + 
100 |                              args.count("input-sleigh") +
101 |                              args.count("input-sleigh-dir");
102 |         if(inputFlagCount != 1)
103 |         {
104 |             cout << "Specifiy exactly one of: --input-disassembly,--input-disassembly-dir, --input-sleigh, or --input-sleigh-dir" << endl;
105 |             return -1;
106 |         }
107 | 
108 |         if(args.count("input-disassembly") != 0 &&
109 |            args.count("input-disassembly-dir") != 0)
110 |         {
111 |             cout << "Specify either input disassembly file or dir, not both!!" << endl;
112 |             return -1;
113 |         }
114 | 
115 |         if(args.count("input-disassembly") != 0)
116 |         {
117 |             parsedData.inputFilenames.push_back(inputFilename);
118 |         }
119 | 
120 |         if(args.count("input-disassembly-dir") != 0)
121 |         {
122 |             result = readFilenamesFromDirectory(parsedData,
123 |                                                 inputDirectory,
124 |                                                 "*");
125 |             if(result != 0)
126 |             {
127 |                 return result;
128 |             }
129 |         }
130 | 
131 |         if(args.count("input-sleigh") != 0)
132 |         {
133 |             parsedData.inputFilenames.push_back(inputFilename);
134 |             parseSleigh = true;
135 |         }
136 | 
137 |         if(args.count("input-sleigh-dir") != 0)
138 |         {
139 |             result = readFilenamesFromDirectory(parsedData,
140 |                                                 inputDirectory,
141 |                                                 ".sla");
142 |             if(result != 0)
143 |             {
144 |                 cout << "Failed to find any .sla files" << endl;
145 |                 return result;
146 |             }
147 |             parseSleigh = true;
148 |         }
149 | 
150 |         if(parsedData.inputFilenames.size() == 0)
151 |         {
152 |             cout << "Failed to find input files" << endl;
153 |             return -1;
154 |         }
155 | 
156 |         if(args.count("num-threads") == 0)
157 |         {
158 |             // user didn't specify number of threads
159 |             // default to number of physical cpus
160 |             parsedData.numThreads = boost::thread::physical_concurrency();
161 |             if(parsedData.numThreads == 0)
162 |             {
163 |                 cout << "Unable to determine number of CPUs. Please specify thread count with --num-threads at the command line." << endl;
164 |                 return -1;
165 |             }
166 |         }
167 | 
168 |         if(parsedData.numThreads == 0)
169 |         {
170 |             cout << "Invalid number of threads specified" << endl;
171 |             return -1;    
172 |         }
173 |     }
174 |     catch (const boost::program_options::error &ex)
175 |     {
176 |         cout << "[-] Error parsing command line: " << ex.what() << endl;
177 |         return -1;
178 |     }
179 | 
180 |     cout << "[*] Using " << parsedData.numThreads << " worker thread(s)" << endl;
181 | 
182 |     //
183 |     // initialize the default set of registers from Ghidra
184 |     //
185 |     cout << "[*] Initializing default Ghidra registers" << endl;
186 |     result = initRegisters();
187 |     if(result != 0)
188 |     {
189 |         cout << "[-] Failed to initialize default Ghidra registers!!" << endl;
190 |         goto ERROR_CLEANUP;
191 |     }
192 | 
193 |     result = addRegisters(additionalRegisters);
194 |     if(result != 0)
195 |     {
196 |         cout << "[-] Failed to add additional registers!!" << endl;
197 |         goto ERROR_CLEANUP;
198 |     }
199 | 
200 |     if(parseSleigh == false)
201 |     {
202 |         // user supplied one or more text files of disassembly
203 |         result = generateFromText(parsedData,
204 |                                   printRegistersOnly,
205 |                                   skipInstructionCombining);
206 |         if(!result)
207 |         {
208 |             return result;
209 |         }
210 |     }
211 |     else
212 |     {
213 |         // user supplied one or more .sla files
214 |         result = generateFromSleigh(parsedData,
215 |                                     printRegistersOnly,
216 |                                     skipInstructionCombining);
217 |         if(!result)
218 |         {
219 |             return result;
220 |         }
221 |     }
222 | 
223 | ERROR_CLEANUP:
224 |     clearParserData(parsedData, false);
225 |     return result;
226 | }
227 | 
228 | // search directory for all files of extension type
229 | int readFilenamesFromDirectory(PARSED_DATA& parsedData,
230 |                                const string& dirPath,
231 |                                const string& extension)
232 | {
233 |     if(!is_directory(dirPath))
234 |     {
235 |         cout << "Invalid directory: " << dirPath << endl;
236 |         return -1;
237 |     }
238 | 
239 |     for(auto& dir_entry : boost::make_iterator_range(directory_iterator(dirPath), {}))
240 |     {
241 |         if(extension == "*" || extension == dir_entry.path().extension())
242 |         {
243 |             parsedData.inputFilenames.push_back(dir_entry.path().string());
244 |         }
245 |     }
246 | 
247 |     // make sure we have at least one input file
248 |     if(parsedData.inputFilenames.size() == 0)
249 |     {
250 |         cout << "Failed to find any input files in: " << dirPath << endl;
251 |         return -1;
252 |     }
253 | 
254 |     // TODO: numeric sort vs alpha sort?
255 |     sort(parsedData.inputFilenames.begin(), parsedData.inputFilenames.end());
256 | 
257 |     return 0;
258 | }
259 | 
260 | // Generate one or more .sla files from the supplied text disassembly files
261 | int generateFromText(PARSED_DATA& parsedData,
262 |                      bool printRegistersOnly,
263 |                      bool skipInstructionCombining)
264 | {
265 |     int result = 0;
266 | 
267 |     for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++)
268 |     {
269 |         //
270 |         // read the input file and parse the instructions into parsedData
271 |         //
272 |         cout << "[*] Parsing instructions " << parsedData.inputFilenames[i] << endl;
273 | 
274 |         result = parseInstructions(parsedData, i);
275 |         if(result != 0)
276 |         {
277 |             cout << "[-] Failed to parse instructions" << endl;
278 |             goto ERROR_CLEANUP;
279 |         }
280 |         cout << "[*] Parsed " << parsedData.allInstructions.size() << " instructions" << endl;
281 | 
282 |         // only print registers and exit if option is set
283 |         if(printRegistersOnly)
284 |         {
285 |             goto CONTINUE_LOOP;            
286 |         }
287 | 
288 |         //
289 |         // combine the instructions and process data for output
290 |         //
291 | 
292 |         // skip combining if option is set
293 |         if(skipInstructionCombining == false)
294 |         {
295 |             cout << "[*] Combining instructions" << endl;
296 |             combineInstructions(parsedData);
297 |         }
298 | 
299 |         cout << "[*] Computing attach registers" << endl;
300 |         computeAttachVariables(parsedData);
301 | 
302 |         cout << "[*] Computing token instructions" << endl;
303 |         computeTokenInstructions(parsedData);
304 | 
305 |         //
306 |         // Output the completed Ghidra Processor Specification
307 |         //
308 | 
309 |         cout << "[*] Generating Ghidra processor specification" << endl;
310 |         createProcessorModule(parsedData, i);
311 | 
312 | CONTINUE_LOOP:        
313 |         clearParserData(parsedData, printRegistersOnly);
314 |         result = 0;
315 |     } // for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++)
316 | 
317 |     // only print registers and exit if option is set
318 |     if(printRegistersOnly)
319 |     {
320 |         cout << "[*] Found registers: " << getOutputRegisters(parsedData) << endl;
321 |         cout << "[*] Found mnemonics: " << getOutputMnemonics(parsedData) << endl;
322 |         cout << "If there are any issues edit registers.h before proceeding." << endl;
323 | 
324 |         result = 0;
325 |         goto ERROR_CLEANUP;
326 |     }
327 | 
328 |     cout << "[*] Creating .ldefs" << endl;
329 |     result = createLdefs(parsedData);
330 |     if(result != 0)
331 |     {
332 |         return result;
333 |     }        
334 | 
335 | ERROR_CLEANUP:
336 |     clearParserData(parsedData, false);
337 |     return result;
338 | }
339 | 
340 | // Generate a .sla files from the one or more supplied .sla files
341 | int generateFromSleigh(PARSED_DATA& parsedData,
342 |                        bool printRegistersOnly,
343 |                        bool skipInstructionCombining)
344 | {
345 |     int result = 0;
346 | 
347 |     for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++)
348 |     {
349 |         //
350 |         // read the input file and parse the instructions into parsedData
351 |         //
352 |         cout << "[*] Parsing instructions: " << parsedData.inputFilenames[i] << endl;
353 | 
354 |         result = parseInstructionsSla(parsedData, i);
355 |         if(result != 0)
356 |         {
357 |             cout << "[-] Failed to parse instructions" << endl;
358 |             goto ERROR_CLEANUP;
359 |         }
360 |         cout << "[*] Parsed " << parsedData.combinedInstructions.size() << " instructions" << endl;
361 | 
362 |         // only print registers and exit if option is set
363 |         if(printRegistersOnly)
364 |         {
365 |             continue;
366 |         }
367 |     }
368 | 
369 |     // only print registers and exit if option is set
370 |     if(printRegistersOnly)
371 |     {
372 |         cout << "[*] Found registers: " << getOutputRegisters(parsedData) << endl;
373 |         cout << "If there are any issues edit registers.h before proceeding." << endl;
374 |         result = 0;
375 |         goto ERROR_CLEANUP;
376 |     }
377 | 
378 |     //
379 |     // combine the instructions and process data for output
380 |     //
381 | 
382 |     // skip combining if option is set
383 |     if(skipInstructionCombining == false)
384 |     {
385 |         cout << "[*] Combining instructions" << endl;
386 |         combineInstructions(parsedData);
387 |     }
388 | 
389 |     cout << "[*] Computing attach registers" << endl;
390 |     computeAttachVariables(parsedData);
391 | 
392 |     cout << "[*] Computing token instructions" << endl;
393 |     computeTokenInstructions(parsedData);
394 | 
395 |     //
396 |     // Output the completed Ghidra Processor Specification
397 |     //
398 | 
399 |     cout << "[*] Generating Ghidra processor specification" << endl;
400 |     createProcessorModule(parsedData, 0);
401 | 
402 |     cout << "[*] Created Processor Module Directory" << endl;
403 | 
404 |     cout << "  [*] Creating .ldefs" << endl;
405 |     parsedData.inputFilenames.resize(1); // TODO: make this a flag to createLdefs
406 |     result = createLdefs(parsedData);
407 |     if(result != 0)
408 |     {
409 |         return result;
410 |     }
411 | 
412 | ERROR_CLEANUP:
413 |     clearParserData(parsedData, false);
414 |     return result;
415 | }
416 | 


--------------------------------------------------------------------------------
/output.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: output.cpp
  3 | //
  4 | // Outputs the files that comprise the Ghidra processor module.
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | 
 10 | #include <boost/timer/timer.hpp>
 11 | #include "output.h"
 12 | 
 13 | #include <boost/filesystem.hpp>
 14 | 
 15 | // Creates the directory structure required by the processor spec. Processor 
 16 | // specs must be in the <ProcessorFamily>/data/languages/ directory structure
 17 | int createDirectoryStructure(PARSED_DATA& parsedData)
 18 | {
 19 |     bool result = false;
 20 | 
 21 |     boost::filesystem::path p{parsedData.processorFamily};
 22 | 
 23 |     p.append("data");
 24 |     p.append("languages");
 25 | 
 26 |     if(boost::filesystem::exists(p) && boost::filesystem::is_directory(p))
 27 |     {
 28 |         // directory already exists
 29 |         return 0;
 30 |     }
 31 | 
 32 |     // create the directory
 33 |     // BUGBUG: catch exceptions or use no throw
 34 |     result = boost::filesystem::create_directories(p);
 35 |     if(result == false)
 36 |     {
 37 |         cout << "  [-] Failed to create processor directories!!" << endl;
 38 |         return -1;
 39 |     }
 40 | 
 41 |     return 0;
 42 | }
 43 | 
 44 | // Creates an empty Module.manifest inside the <ProcessorFamily> directory.
 45 | // Unsure why this is required by Ghidra
 46 | // <ProcessorFamily>/Module.manifest
 47 | int createModuleManifest(PARSED_DATA& parsedData)
 48 | {
 49 |     boost::filesystem::path p{parsedData.processorFamily};
 50 | 
 51 |     // BUGBUG: why is this file needed?
 52 |     p.append("Module.manifest");
 53 | 
 54 |     boost::filesystem::ofstream ofs(p);
 55 |     ofs.close();
 56 | 
 57 |     return 0;
 58 | }
 59 | 
 60 | // Creates the bare minimum processor cspec file required to be loaded into
 61 | // Ghidra. It is up to the enduser to fully define this file to get decompiler
 62 | // support to work
 63 | // <ProcessorFamily>/data/languages/<Processor>.cspec
 64 | int createCspec(PARSED_DATA& parsedData)
 65 | {
 66 |     string cspecFilename;
 67 | 
 68 |     boost::filesystem::path p{parsedData.processorFamily};
 69 | 
 70 |     cspecFilename = parsedData.processorFamily + ".cspec";
 71 | 
 72 |     p.append("data");
 73 |     p.append("languages");
 74 |     p.append(cspecFilename);
 75 | 
 76 |     boost::filesystem::ofstream ofs(p);
 77 | 
 78 |     ofs << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
 79 |     ofs << "\n";
 80 |     ofs << "<!-- TODO: setup a valid cspec file -->\n";
 81 |     ofs << "<compiler_spec>\n";
 82 |     ofs << "\t<default_proto>\n";
 83 |     ofs << "\t\t<prototype name=\"__fake\" extrapop=\"0\" stackshift=\"0\">\n";
 84 |     ofs << "\t\t\t<input/>\n";
 85 |     ofs << "\t\t\t<output/>\n";
 86 |     ofs << "\t\t</prototype>\n";
 87 |     ofs << "\t</default_proto>\n";
 88 |     ofs << "</compiler_spec>\n";
 89 | 
 90 |     ofs.close();
 91 |     return 0;
 92 | }
 93 | 
 94 | // creates the bare minimum processor ldefs file required to be loaded into
 95 | // Ghidra. Uses values passed in at the command line to fill out the file.
 96 | // <ProcessorFamily>/data/languages/<Processor>.ldefs
 97 | int createLdefs(PARSED_DATA& parsedData)
 98 | {
 99 |     boost::timer::auto_cpu_timer t;
100 |     string ldefsFilename;
101 |     string bigOrLittle;
102 | 
103 |     boost::filesystem::path p{parsedData.processorFamily};
104 | 
105 |     ldefsFilename = parsedData.processorFamily + ".ldefs";
106 | 
107 |     p.append("data");
108 |     p.append("languages");
109 |     p.append(ldefsFilename);
110 | 
111 |     boost::filesystem::ofstream ofs(p);
112 | 
113 |     if(parsedData.endianness == "big")
114 |     {
115 |         bigOrLittle = "BE";
116 |     }
117 |     else
118 |     {
119 |         bigOrLittle = "LE";
120 |     }
121 | 
122 |     ofs << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
123 |     ofs << "\n";
124 |     ofs << "<!-- TODO: sanity check these values -->\n";
125 |     ofs << "<language_definitions>\n";
126 | 
127 |     for(unsigned int i = 0; i < parsedData.inputFilenames.size(); i++)
128 |     {
129 |         ofs << "\t<language processor=\"" << parsedData.processorFamily << "\"\n";
130 |         ofs << "\t          endian=\"" << parsedData.endianness << "\"\n";
131 |         ofs << "\t          size=\"" << parsedData.bitness << "\"\n";
132 |         ofs << "\t          variant=\"" << parsedData.processorName << "\"\n";
133 |         ofs << "\t          version=\"1.0\"\n";
134 | 
135 |         if(i == 0)
136 |         {
137 |             ofs << "\t          slafile=\"" << parsedData.processorName << ".sla\"\n";
138 |         }
139 |         else
140 |         {
141 |             ofs << "\t          slafile=\"" << parsedData.processorName << to_string(i) << ".sla\"\n";
142 |         }
143 | 
144 |         ofs << "\t          processorspec=\"" << parsedData.processorFamily << ".pspec\"\n";
145 |         ofs << "\t          id=\"" << parsedData.processorFamily << ":" << bigOrLittle << ":" << parsedData.bitness << ":" << parsedData.processorName << "\">\n";
146 |         ofs << "\t\t<description>" << parsedData.processorFamily << " " << parsedData.processorName << " processor " << parsedData.bitness << "-bit " << bigOrLittle << "</description>\n";
147 |         ofs << "\t\t<compiler name=\"default\" spec=\"" << parsedData.processorFamily << ".cspec\" id=\"default\"/>\n";
148 |         ofs << "\t</language>\n";
149 |     }
150 |     ofs << "</language_definitions>\n";
151 |     
152 |     ofs.close();
153 | 
154 |     return 0;
155 | }
156 | 
157 | // Creates the bare minimum processor pspec file required to be loaded into
158 | // Ghidra. It is up to the enduser to fully define this file to get decompiler
159 | // support to work
160 | // <ProcessorFamily>/data/languages/<Processor>.pspec
161 | int createPspec(PARSED_DATA& parsedData)
162 | {
163 |     string pspecFilename;
164 | 
165 |     boost::filesystem::path p{parsedData.processorFamily};
166 | 
167 |     pspecFilename = parsedData.processorFamily + ".pspec";
168 | 
169 |     p.append("data");
170 |     p.append("languages");
171 |     p.append(pspecFilename);
172 | 
173 |     boost::filesystem::ofstream ofs(p);
174 | 
175 |     ofs << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
176 |     ofs << "\n";
177 |     ofs << "<processor_spec>\n";
178 |     ofs << "\t<!-- TODO: <programcounter register=\"pc\"/> -->\n";
179 |     ofs << "</processor_spec>\n";
180 | 
181 |     ofs.close();
182 | 
183 |     return 0;
184 | }
185 | 
186 | // Uses the filled out parsedData structure to create a .slaspec file,
187 | // the core of the processor module. This file contains all of the registers,
188 | // defined tokens, and instructions of the instruction set
189 | // <ProcessorFamily>/data/languages/<Processor>.slaspec
190 | int createSlaspec(PARSED_DATA& parsedData, unsigned int fileId)
191 | {
192 |     string pspecFilename;
193 | 
194 |     boost::filesystem::path p{parsedData.processorFamily};
195 | 
196 |     if(fileId == 0)
197 |     {
198 |         pspecFilename = parsedData.processorName + ".slaspec";
199 |     }
200 |     else
201 |     {
202 |         pspecFilename = parsedData.processorName + to_string(fileId) + ".slaspec";
203 |     }
204 | 
205 |     p.append("data");
206 |     p.append("languages");
207 |     p.append(pspecFilename);
208 | 
209 |     boost::filesystem::ofstream ofs(p);
210 | 
211 |     ofs << "# File autogenerated by Ghidra Processor Module Generator\n";
212 |     ofs << "# https://github.com/oberoisecurity/ghidra-processor-module-generator\n";
213 |     ofs << "\n";
214 | 
215 |     // endianness and alignment
216 |     ofs << "# TODO: Verify these\n";
217 |     ofs << "define endian=" << parsedData.endianness << ";\n";
218 |     ofs << "define alignment=" << parsedData.alignment << ";\n";
219 |     ofs << "\n";
220 | 
221 |     // ram and register spaces
222 |     ofs << "# TODO: Verify these\n";
223 |     ofs << "define space ram type=ram_space size=4 wordsize=1 default;\n";
224 |     ofs << "define space register type=register_space size=4;\n";
225 |     ofs << "\n";
226 | 
227 |     // define registers
228 |     if(parsedData.registers.size() > 0)
229 |     {
230 |         ofs << "# TODO: Verify these\n";
231 |         ofs << "define register offset=0 size=4\n";
232 |         ofs << "[" << getOutputRegisters(parsedData) << "];\n";
233 |         ofs << "\n";
234 |     }
235 | 
236 |     // flags
237 |     ofs << "# TODO: Add flags if needed\n";
238 |     ofs << "# ex. @define MY_FLAG\t\"my_reg[0,1]\"\n";
239 |     ofs << "\n";
240 | 
241 |     // define token registers
242 |     for(unsigned int i = 0; i < sizeof(parsedData.tokenInstructions)/sizeof(parsedData.tokenInstructions[0]); i++)
243 |     {
244 |         unsigned int opcodeBitSize = 0;
245 | 
246 |         if(parsedData.tokenInstructions[i].size() > 0)
247 |         {
248 |             opcodeBitSize = (i + 1) * 8;
249 | 
250 |             ofs << "# TODO: Simplify these where possible\n";
251 |             ofs << "# TODO: Combine signed immediates where it makes sense\n";
252 |             ofs << "define token instr" << opcodeBitSize;
253 | 
254 |             // TODO: make if statement here for VLA
255 | 
256 |             ofs << "(" << opcodeBitSize << ")\n";
257 |             ofs << getOutputTokenInstructions(parsedData.tokenInstructions[i]);
258 |             ofs << ";\n";
259 |             ofs << "\n";
260 |         }
261 |     }
262 | 
263 |     // attach variables
264 |     if(parsedData.attachVariables.size() > 0)
265 |     {
266 |         ofs << "# TODO: Simplify these where possible\n";
267 |         ofs << getOutputAttachVariables(parsedData);
268 |         ofs << "\n";
269 |     }
270 | 
271 |     // check if any instructions have duplicated registers
272 |     // we need to zero for every slaspec file
273 |     // or we can run into export statements that fail
274 |     // to compile in the SLEIGH compiler
275 |     parsedData.duplicatedRegisters.clear(); 
276 |     for(auto& combinedInstruction: parsedData.combinedInstructions)
277 |     {
278 |         // combinedInstruction.first = the opcode
279 |         // combinedInstruction.second = pointer to the Instruction
280 |         combinedInstruction.second->getInstructionDuplicatedRegisters(true,
281 |                                                                       parsedData.duplicatedRegisters);    
282 |     }
283 | 
284 |     if(parsedData.duplicatedRegisters.size() > 0)
285 |     {
286 |         ofs << "# Duplicated registers" << endl;
287 |         ofs << "# To workaround: https://github.com/NationalSecurityAgency/ghidra/issues/6874" << endl;
288 |         ofs << getOutputDuplicateRegisters(parsedData);
289 |         ofs << "\n";
290 |     }   
291 | 
292 |     //
293 |     // Instructions
294 |     //
295 |     ofs << "#\n";
296 |     ofs << "# Instructions\n";
297 |     ofs << "#\n\n";
298 | 
299 |     ofs << "#\n";
300 |     ofs << "# Example Instruction:\n";
301 |     ofs << "#\n";
302 |     ofs << "# 1) # BBBBBAAAAAaaaaaaaaaaaaaa00000100\n";
303 |     ofs << "# 2) # addi r0,r0,0x0\n";
304 |     ofs << "# 3) #:addi regA_22_26,regB_27_31,imm_08_21 is regB_27_31 & regA_22_26 & imm_08_21 & opcode_00_05=0b000100\n";
305 |     ofs << "# 4) {}\n";
306 |     ofs << "#\n";
307 |     ofs << "# Line one is the opcode written in bits from MSB to LSB\n";
308 |     ofs << "# - 0 and 1s represent bits of the opcode that are required and cannot change\n";
309 |     ofs << "# - upper case letters represent registers\n";
310 |     ofs << "# - lower case letters represent immediate values\n";
311 |     ofs << "# Line two is an example decoding of the instruction if all registers and immediates are set to 0\n";
312 |     ofs << "# Line three is the SLEIGH encoded instruction\n";
313 |     ofs << "# Line four is the empty p-code implementation which must be completed for decompiler support\n";
314 |     ofs << "#\n\n";
315 | 
316 |     // sorted instructions
317 |     // string = the text of the instruction itself not the opcode
318 |     map<string, Instruction*> sortedCombinedInstructions;
319 | 
320 |     // sort the instructions
321 |     for(auto combinedInstruction: parsedData.combinedInstructions )
322 |     {
323 |         string instructionString;
324 | 
325 |         // combinedInstruction.first = the opcode
326 |         // combinedInstruction.second = pointer to the Instruction
327 |         instructionString = getOutputInstruction(combinedInstruction.second,
328 |                                                  parsedData);
329 |         sortedCombinedInstructions.insert({{instructionString,
330 |                                             combinedInstruction.second}});
331 |     }
332 | 
333 |     for(auto sortedCombinedInstruction: sortedCombinedInstructions)
334 |     {
335 |         string instruction = sortedCombinedInstruction.first;
336 | 
337 |         // escape forward slash
338 |         boost::replace_all(instruction, "/", "_");
339 | 
340 |         if(parsedData.omitOpcodes == false)
341 |         {
342 |             ofs << "# " << sortedCombinedInstruction.second->getOpcode() << "\n";
343 |         }
344 |         
345 |         if((parsedData.omitExampleInstructions == false) &&
346 |            (sortedCombinedInstruction.second->getCombined() == true))
347 |         {
348 |             ofs << "# " << getOriginalOutputString(sortedCombinedInstruction.second, parsedData) << "\n";
349 |         }        
350 |         
351 |         ofs << instruction << "\n";
352 |         ofs << "{}\n";
353 |         ofs << "\n";
354 |     }
355 |     sortedCombinedInstructions.clear();
356 | 
357 |     ofs.close();
358 |     return 0;
359 | }
360 | 
361 | // Gets a list of all registers define register section of the processor module
362 | string getOutputRegisters(PARSED_DATA& parsedData)
363 | {
364 |     string output;
365 |     std::set<string>::iterator it;
366 | 
367 |     for(it = parsedData.registers.begin();
368 |         it != parsedData.registers.end();
369 |         ++it)
370 |     {
371 |         if(it == parsedData.registers.begin())
372 |         {
373 |             output += *it;
374 |         }
375 |         else
376 |         {
377 |             output += " " + *it;
378 |         }
379 |     }
380 |     return output;
381 | }
382 | 
383 | // Gets a list of instruction mnemonics found
384 | // only used for debugging purposes
385 | string getOutputMnemonics(PARSED_DATA& parsedData)
386 | {
387 |     string output;
388 |     std::set<string>::iterator it;
389 | 
390 |     for(it = parsedData.mnemonics.begin();
391 |         it != parsedData.mnemonics.end();
392 |         ++it)
393 |     {
394 |         if(it == parsedData.mnemonics.begin())
395 |         {
396 |             output += *it;
397 |         }
398 |         else
399 |         {
400 |             output += " " + *it;
401 |         }
402 |     }
403 |     return output;
404 | }
405 | 
// Outputs a list of the define token instructions for the processor module
// ex:
//	imm_00_00 = (0, 0)
//	simm_00_00 = (0, 0) signed
//	imm_00_03 = (0, 3)
//	opcode_00_03 = (0, 3)
//	opcode_00_04 = (0, 4)
//	regA_04_07 = (4, 7)
//	regA_05_05 = (5, 5)
//	regA_05_05_2 = (5, 5)
//
// Each token name must look like <name>_<start>_<end>[_<suffix>]; the second
// and third underscore-separated fields are the start and end bit positions.
//
// Returns the formatted lines (one per token, plus a "signed" twin for every
// token containing "imm_"). Returns an empty string, after printing a
// message, as soon as one token has fewer than three fields or a bit
// position that is not a number.
std::string getOutputTokenInstructions(std::set<std::string>& tokenInstructions)
{
    std::string output;

    for(const auto& token : tokenInstructions)
    {
        // locate the underscores delimiting the start and end fields
        const std::string::size_type firstSep = token.find('_');
        const std::string::size_type secondSep =
            (firstSep == std::string::npos) ? std::string::npos
                                            : token.find('_', firstSep + 1);
        if(secondSep == std::string::npos)
        {
            std::cout << "Failed to split token!!\n";
            return "";
        }

        // an optional third underscore terminates the end field
        // (ex. the "_2" suffix of regA_05_05_2)
        const std::string::size_type thirdSep = token.find('_', secondSep + 1);
        const std::string::size_type endLen =
            (thirdSep == std::string::npos) ? std::string::npos
                                            : thirdSep - secondSep - 1;

        int start = 0;
        int end = 0;
        try
        {
            start = std::stoi(token.substr(firstSep + 1, secondSep - firstSep - 1));
            end = std::stoi(token.substr(secondSep + 1, endLen));
        }
        catch(const std::exception&)
        {
            // std::stoi throws on empty/non-numeric/out-of-range fields;
            // report it through the same failure path as a bad split
            std::cout << "Failed to parse bit range of token: " << token << "\n";
            return "";
        }

        const std::string bitRange =
            " = (" + std::to_string(start) + ", " + std::to_string(end) + ")";

        output += "\t" + token + bitRange + "\n";

        // if this was an immediate value, create a signed immediate as well
        // we do this because we can't tell the difference between an unsigned
        // immediate and a positive signed immediate
        if(token.find("imm_") != std::string::npos)
        {
            output += "\ts" + token + bitRange + " signed\n";
        }
    }

    return output;
}
451 | 
452 | // Outputs the processor module's attached variables field
453 | // There can be multiple attach variables for a single processor module
454 | // ex: attach variables [ regA_05_05 regC_05_05_2 regE_05_05_2 ] [
455 | //         sr vbr
456 | //     ];
457 | string getOutputAttachVariables(PARSED_DATA& parsedData)
458 | {
459 |     std::set<Instruction*>::iterator it;
460 |     string output = "";
461 | 
462 |     for (auto& x: parsedData.attachVariables)
463 |     {
464 |         // x.first = string of registers
465 |         // x.second = set containing all register variables using x.first
466 |         string registers;
467 | 
468 |         for(auto& y: x.second)
469 |         {
470 |             registers += y + " ";
471 |         }
472 | 
473 |         output += "attach variables [ " + registers + "] [\n";
474 |         output += "\t " + x.first + "\n";
475 |         output += "];\n";
476 |         output += "\n";
477 |     }
478 | 
479 |     return output;
480 | }
481 | 
482 | // Add an export statement in the form of:
483 | // a0_dup1: a0 is a0 { export a0; }
484 | // this is required to avoid duplicate registers
485 | string getOutputDuplicateRegisters(PARSED_DATA& parsedData)
486 | {
487 |     string output;
488 | 
489 |     for (auto& x: parsedData.duplicatedRegisters)
490 |     {
491 |         string reg = x.first;
492 |         unsigned int count = x.second;
493 | 
494 |         if(count <= 1)
495 |         {
496 |             // shouldn't ever get here
497 |             continue;
498 |         }
499 | 
500 |         for(unsigned int i = 1; i < count; i ++)
501 |         {
502 |             output += reg + "_dup" + std::to_string(i) + ": " + reg + " is " + reg + " {export " + reg + ";}\n";
503 |         }
504 |     }
505 | 
506 |     return output;
507 | }
508 | 
509 | // Takes an instruction and converts into SLEIGH format
510 | // example: ":mov rm_04_07, rn_08_11 is opcode_12_15=0b0110 & rn_08_11 & rm_04_07 & opcode_00_03=0b0011"
511 | string getOutputInstruction(Instruction* instruction, PARSED_DATA& parsedData)
512 | {
513 |     string output;
514 |     int index = 0;
515 | 
516 |     // instruction decorator
517 |     output += ":";
518 | 
519 |     output += instruction->getInstructionOutputString(true, true);
520 | 
521 |     output += " is ";
522 | 
523 |     index = convertOpcodeSizeToIndex(instruction->getOpcode().length());
524 |     if(index < 0)
525 |     {
526 |         cout << "Invalid opcode size!!" << endl;
527 |         throw 1;
528 |     }
529 | 
530 |     output += instruction->getOpcodeOutputString(parsedData.tokenInstructions[index]);
531 | 
532 |     return output;
533 | }
534 | 
535 | // Takes a combined instruction and converts into SLEIGH format, removing the
536 | // combined pieces. It does this by converting all of the non-binary pieces of
537 | // the opcode into 0s
538 | // example: "mov r0, r1"
539 | string getOriginalOutputString(Instruction* instruction, PARSED_DATA& parsedData)
540 | {
541 |     int result = 0;
542 |     string disassembledString;    
543 |     
544 |     // zeroize the combined opcode string
545 |     string zeroizedOpcode = instruction->getOpcode();
546 |     for(unsigned int i = 0; i < zeroizedOpcode.length(); i++)
547 |     {
548 |         if(zeroizedOpcode[i] != '0' && zeroizedOpcode[i] != '1')
549 |         {
550 |             zeroizedOpcode[i] = '0';
551 |         }
552 |     }
553 | 
554 |     result = disassembleOpcodeFromParsedData(parsedData,
555 |                                              zeroizedOpcode,
556 |                                              disassembledString);
557 |     if(result != 0)
558 |     {
559 |         return "";
560 |     }
561 | 
562 |     return disassembledString;
563 | }
564 | 
565 | int getOriginalOutputStringFromSla(PARSED_DATA& parsedData,
566 |                                    string zeroizedOpcode,
567 |                                    string& disassembledString)
568 | {
569 |     int result = 0;
570 | 
571 |     // loop through all of the loaded .sla files attempting to disassemble
572 |     // zeroizedOpcode
573 |     // TODO: improve speed?
574 |     for(unsigned int i = 0; i < parsedData.slas.size(); i++)
575 |     {
576 |         result = parsedData.slas[i].getConstructorTextByBitPattern(zeroizedOpcode,
577 |                                                                    disassembledString);
578 |         if(result == 0)
579 |         {
580 |             // successfully found the string
581 |             // cout << "Succeeded " << i << "\n" << disassembledString << endl;
582 |             return 0;
583 |         }
584 |     }
585 |     
586 |     // not found in any of the loaded .sla files
587 |     // cout << "Failed" << endl;
588 |     return -1;
589 | }
590 | 
591 | int disassembleOpcodeFromParsedData(PARSED_DATA& parsedData,
592 |                                     string zeroizedOpcode,
593 |                                     string& disassembledString)
594 | {
595 |     int result = 0;
596 | 
597 |     // check through the allInstructions first    
598 |     auto itr = parsedData.allInstructions.find(zeroizedOpcode);
599 |     if(itr != parsedData.allInstructions.end())
600 |     {     
601 |         disassembledString = itr->second->getInstructionOutputString(false,
602 |                                                                      false);
603 |         return 0;        
604 |     }
605 | 
606 |     // check through all of the sla files
607 |     result = getOriginalOutputStringFromSla(parsedData,
608 |                                             zeroizedOpcode,
609 |                                             disassembledString);
610 |     if(result == 0)
611 |     {
612 |         return 0;
613 |     }
614 | 
615 |     cout << "Failed to find zeroized opcode!!" << endl;
616 |     cout << zeroizedOpcode << endl;        
617 |     return -1;
618 | }
619 | 
620 | // Wrapper function for creating the various files required for the processor
621 | // module. parsedData has already been filled out at this point
622 | int createProcessorModule(PARSED_DATA& parsedData, unsigned int fileId)
623 | {
624 |     boost::timer::auto_cpu_timer t;
625 |     int result = 0;
626 | 
627 |     cout << "  [*] Creating Processor Directory Structure" << endl;
628 |     result = createDirectoryStructure(parsedData);
629 |     if(result != 0)
630 |     {
631 |         return result;
632 |     }
633 | 
634 |     cout << "  [*] Creating Module.manifest" << endl;
635 |     result = createModuleManifest(parsedData);
636 |     if(result != 0)
637 |     {
638 |         return result;
639 |     }
640 | 
641 |     cout << "  [*] Creating .cspec" << endl;
642 |     result = createCspec(parsedData);
643 |     if(result != 0)
644 |     {
645 |         return result;
646 |     }
647 | 
648 |     cout << "  [*] Creating .pspec" << endl;
649 |     result = createPspec(parsedData);
650 |     if(result != 0)
651 |     {
652 |         return result;
653 |     }
654 | 
655 |     cout << "  [*] Creating .slapec" << endl;
656 |     result = createSlaspec(parsedData, fileId);
657 |     if(result != 0)
658 |     {
659 |         return result;
660 |     }
661 | 
662 |     return 0;
663 | }
664 | 


--------------------------------------------------------------------------------
/output.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: output.h
 3 | //
 4 | // Outputs the files that comprise the Ghidra processor module.
 5 | //
 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 7 | // Licensed under the Apache 2.0 License.
 8 | //-----------------------------------------------------------------------------
 9 | #pragma once
10 | 
11 | #include "parser.h"
12 | using namespace std;
13 | 
// Top-level entry point: runs the directory, manifest, cspec, pspec and
// slaspec steps below in that order and returns the first non-zero result,
// or 0 when all of them succeed.
int createProcessorModule(PARSED_DATA& parsedData, unsigned int fileId);

// Individual generation steps. Each returns 0 on success.
// NOTE(review): createLdefs is not invoked from createProcessorModule;
// presumably it is called separately -- confirm against the caller.
int createDirectoryStructure(PARSED_DATA& parsedData);
int createModuleManifest(PARSED_DATA& parsedData);
int createPspec(PARSED_DATA& parsedData);
int createCspec(PARSED_DATA& parsedData);
int createLdefs(PARSED_DATA& parsedData);
int createSlaspec(PARSED_DATA& parsedData, unsigned int fileId);

// Helpers that render individual sections of the .slaspec file as text.
// Space-separated list of parsedData.registers
string getOutputRegisters(PARSED_DATA& parsedData);
// Space-separated list of parsedData.mnemonics (debugging only)
string getOutputMnemonics(PARSED_DATA& parsedData);
// Body of a "define token" statement for one set of token names
string getOutputTokenInstructions(set<string>& tokenInstructions);
// All "attach variables" statements
string getOutputAttachVariables(PARSED_DATA& parsedData);
// Export constructors (<reg>_dupN) for duplicated registers
string getOutputDuplicateRegisters(PARSED_DATA& parsedData);
// SLEIGH constructor header (":<display> is <bit pattern>") for one instruction
string getOutputInstruction(Instruction* instruction, PARSED_DATA& parserData);
// Example disassembly of a combined instruction with all non-literal opcode
// bits set to 0; empty string when it cannot be resolved
string getOriginalOutputString(Instruction* instruction,
                               PARSED_DATA& parsedData);
// Looks zeroizedOpcode up in the loaded .sla files; 0 on success, -1 otherwise
int getOriginalOutputStringFromSla(PARSED_DATA& parsedData,
                                   string zeroizedOpcode,
                                   string& disassembledString);
// Looks zeroizedOpcode up in the parsed instructions first, then in the
// loaded .sla files; 0 on success, -1 otherwise
int disassembleOpcodeFromParsedData(PARSED_DATA& parsedData,
                                    string zeroizedOpcode,
                                    string& disassembledString);
36 | 


--------------------------------------------------------------------------------
/parser.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: parser.cpp
  3 | //
  4 | // Parsing instructions from disassembly text file
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | #include <boost/timer/timer.hpp>
 10 | #include <boost/asio/post.hpp>
 11 | #include <boost/asio/thread_pool.hpp>
 12 | #include <boost/thread/thread.hpp>
 13 | #include <boost/date_time/posix_time/posix_time.hpp>
 14 | #include "parser.h"
 15 | #include "registers.h"
 16 | #include "thread_pool.h"
 17 | 
// Regexes describing a hex opcode ("0x...") and a decimal integer.
// NOTE(review): isOpcode/isInteger only reference these in commented-out
// code ("regex method was too slow"); they may be unused -- confirm before
// removing.
const boost::regex g_opcodeRegex{"0[xX][0-9a-fA-F]+"};
const boost::regex g_integerRegex{"\\d+"};

// used to track if we have a variable length instruction set
// index 0..3 corresponds to opcode sizes of 8, 16, 24 and 32 bits
// (set by updateOpcodeSize, read by hasVariableLengthOpcodes)
static bool g_opcodeSize[4] = {false, false, false, false};

// every known register name; filled by initRegisters/addRegisters and
// queried by isRegister
set<string> g_allRegisters;
extern const char* ALL_REGISTERS[];

// forward declarations of the file-local helpers defined below
static bool splitChar(char ch);
static bool isCharWhiteSpace(char ch);
static int splitDisassemblyLine(vector<string>& lineSplit, const string& line);
static void updateOpcodeSize(unsigned int opcodeSize);
static bool hasVariableLengthOpcodes(void);

static int parseInstructionsWorker(PARSED_DATA& parsedData,
                                   const char* buffer,
                                   unsigned long long start,
                                   unsigned long long end);
static int parseInstructionsParser(PARSED_DATA& parsedData,
                                   unsigned int lineNum,
                                   string& line,
                                   set<string>& registers,
                                   set<string>& mnemonics,
                                   map<string, Instruction*>& allInstructions);
 43 | 
 44 | // helper to convert number of opcode bits to to index into tokenInstructions array
 45 | int convertOpcodeSizeToIndex(unsigned int opcodeSizeInBits)
 46 | {
 47 |     switch(opcodeSizeInBits)
 48 |     {
 49 |         // bits to index into tokenInsructions array
 50 |         case 8:
 51 |             return 0;
 52 |         case 16:
 53 |             return 1;
 54 |         case 24:
 55 |             return 2;
 56 |         case 32:
 57 |             return 3;
 58 |         default:
 59 |             cout << "[-] convertOpcodeSizeToIndex: Invalid opcode size (" << opcodeSizeInBits << ") specified!!" << endl;
 60 |             throw 1;
 61 |     }
 62 | 
 63 |     // never get here, will throw in default case of switch statement
 64 |     return -1;
 65 | }
 66 | 
 67 | // check which opcode sizes we have seen during parsing
 68 | static void updateOpcodeSize(unsigned int opcodeSizeInBits)
 69 | {
 70 |     switch(opcodeSizeInBits)
 71 |     {
 72 |         case 8:
 73 |             g_opcodeSize[0] = true;
 74 |             break;
 75 |         case 16:
 76 |             g_opcodeSize[1] = true;
 77 |             break;
 78 |         case 24:
 79 |             g_opcodeSize[2] = true;
 80 |             break;
 81 |         case 32:
 82 |             g_opcodeSize[3] = true;
 83 |             break;
 84 |         default:
 85 |             cout << "[-] updateOpcodeSize: Invalid opcode size (" << opcodeSizeInBits << ") specified!!" << endl;
 86 |             break;
 87 |     }
 88 | }
 89 | 
 90 | // returns true if the parsed architecture has variable length opcodes
 91 | // supported opcode lengths are 1-4 bytes
 92 | static bool hasVariableLengthOpcodes(void)
 93 | {
 94 |     unsigned int count = 0;
 95 | 
 96 |     for(unsigned int i = 0; i < sizeof(g_opcodeSize)/sizeof(g_opcodeSize[0]); i++)
 97 |     {
 98 |         if(g_opcodeSize[i] == true)
 99 |         {
100 |             count++;
101 |         }
102 |     }
103 | 
104 |     if(count > 1)
105 |     {
106 |         return true;
107 |     }
108 | 
109 |     return false;
110 | }
111 | 
112 | // Load all the registers extracted from Ghidra into a set
113 | // When parsing the instructions this is how we will tell the difference
114 | // between an instruction mnemonic versus a register
115 | int initRegisters(void)
116 | {
117 |     for(unsigned int i = 0;
118 |         i < sizeof(ALL_REGISTERS)/sizeof(ALL_REGISTERS[0]);
119 |         i++)
120 |     {
121 |         g_allRegisters.insert(ALL_REGISTERS[i]);
122 |     }
123 | 
124 |     return 0;
125 | }
126 | 
127 | // additionalRegisters is a list of additional registers specified by the user
128 | // at the command line or queried from the .sla file
129 | int addRegisters(vector<string>& additionalRegisters)
130 | {
131 |     for(auto additionalRegister : additionalRegisters)
132 |     {
133 |         g_allRegisters.insert(additionalRegister);
134 |     }
135 | 
136 |     return 0;
137 | }
138 | 
139 | // Returns true if the passed in string is a register. This is determined 
140 | // seeing if it's in the g_allRegisters set
141 | bool isRegister(const string& str)
142 | {
143 |     set<string>::iterator it;
144 | 
145 |     // workaround when parsing .sla that contain register sets
146 |     if(str == "__register_list__")
147 |     {
148 |         return true;
149 |     }
150 | 
151 |     it = g_allRegisters.find(str);
152 |     if(it == g_allRegisters.end())
153 |     {
154 |         return false;
155 |     }
156 |     return true;
157 | }
158 | 
// Returns true if str looks like an opcode: longer than two characters and
// starting with "0x" or "0X". The characters after the prefix are not
// validated (the regex based check was too slow).
bool isOpcode(const std::string& str)
{
    const bool hasPayload = str.length() > 2;

    return hasPayload &&
           str[0] == '0' &&
           (str[1] == 'x' || str[1] == 'X');
}
176 | 
// returns true if the passed in string starts with a decimal digit
// only the first character is inspected; the rest of the string is not
// validated (the regex based check was too slow)
bool isInteger(const std::string& str)
{
    if(str.empty())
    {
        return false;
    }

    const char leading = str[0];
    return (leading >= '0') && (leading <= '9');
}
193 | 
194 | // an immediate is a hex string or decimal string
195 | bool isImmediate(const string& str)
196 | {
197 |     // workaround when parsing .sla that contain register sets
198 |     if(str == "__immediate_list__")
199 |     {
200 |         return true;
201 |     }
202 | 
203 |     if(isOpcode(str) || isInteger(str))
204 |     {
205 |         return true;
206 |     }
207 | 
208 |     return false;
209 | }
210 | 
// Parses one portion of the input file.
//
// Scans buffer[start] .. buffer[end] (both inclusive). Every time a '\n' is
// found, the bytes from the current line start up to (not including) that
// newline are handed to parseInstructionsParser. Results are collected in
// local containers and merged into parsedData once the whole portion has
// been parsed.
//
// Returns 0 on success, -1 as soon as one line fails to parse. In both cases
// incrementWorkerCompletions() is called; on failure
// incrementWorkerFailures() is called as well and nothing is merged.
//
// NOTE(review): buffer[end] is read, so the caller must pass an end that is
// a valid index. Bytes after the last '\n' of the range are not parsed here;
// presumably the caller aligns portions on newlines -- confirm.
// NOTE(review): on the error path the Instruction objects already stored in
// the local allInstructions map are not deleted.
static int parseInstructionsWorker(PARSED_DATA& parsedData,
                                   const char* buffer,
                                   unsigned long long start,
                                   unsigned long long end)
{
    // to improve performance each thread has its own copy of these data 
    // structures that are merged together later
    set<string> registers;
    set<string> mnemonics;
    map<string, Instruction*> allInstructions;
    const char* bufferStart = NULL;

    // loop through the file portion line by line
    bufferStart = buffer + start;
    for(unsigned long long i = start; i <= end; i++)
    {        
        if(buffer[i] == '\n')
        {
            int result = 0;
            unsigned long long len = 0;

            // length of the current line, excluding the newline at buffer[i]
            len = &buffer[i] - bufferStart;
            string line(bufferStart, len);

            // parse each line
            // the line number is not tracked here, so 0 is always passed and
            // error messages printed by the parser report line 0
            result = parseInstructionsParser(parsedData,
                                             0,
                                             line,
                                             registers,
                                             mnemonics,
                                             allInstructions);
            if(result != 0)
            {
                goto ERROR_EXIT;
            }

            // the next line starts AT this newline, so it carries a leading
            // '\n'; splitDisassemblyLine treats '\n' as whitespace
            bufferStart = &buffer[i];
        }    
    }

    // merge the data back up
    parsedData.mnemonicsMutex.lock();
    parsedData.mnemonics.merge(mnemonics);
    parsedData.mnemonicsMutex.unlock();

    parsedData.registersMutex.lock();
    parsedData.registers.merge(registers);
    parsedData.registersMutex.unlock();

    // NOTE(review): allInstructions is merged under registersMutex, not a
    // dedicated mutex -- confirm this is intended.
    // NOTE(review): assuming standard merge semantics, entries whose opcode
    // already exists in parsedData.allInstructions stay behind in the local
    // map and are neither reported nor freed -- confirm.
    parsedData.registersMutex.lock();
    parsedData.allInstructions.merge(allInstructions);
    parsedData.registersMutex.unlock();

    incrementWorkerCompletions();
    return 0;

ERROR_EXIT:
    incrementWorkerCompletions();
    incrementWorkerFailures();
    return -1;
}
273 | 
// returns true if the character ends the current element when splitting a
// line of disassembly: the punctuation , @ ( ) [ ] { } + - # * ! and the
// whitespace characters space, tab, CR and LF
static bool splitChar(char ch)
{
    const bool isPunctuation =
        ch == ',' || ch == '@' ||
        ch == '(' || ch == ')' ||
        ch == '[' || ch == ']' ||
        ch == '{' || ch == '}' ||
        ch == '+' || ch == '-' ||
        ch == '#' || ch == '*' ||
        ch == '!';

    const bool isWhiteSpace =
        ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';

    return isPunctuation || isWhiteSpace;
}
304 | 
// returns true if the character is a space, tab, carriage return or newline
static bool isCharWhiteSpace(char ch)
{
    return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}
321 | 
322 | // splits a line of disassembly into a vector of strings
323 | static int splitDisassemblyLine(vector<string>& lineSplit, const string& line)
324 | {
325 |     string currSplit = "";
326 | 
327 |     for(unsigned int i = 0; i < line.size(); i++)
328 |     {
329 |         bool shouldSplit = false;
330 |         bool shouldSkip = false;
331 | 
332 |         shouldSplit = splitChar(line[i]);
333 |         if(shouldSplit == true)
334 |         {
335 |             if(currSplit.size() > 0)
336 |             {
337 |                 //cout << "currSplit: " << currSplit << endl;
338 |                 lineSplit.emplace_back(currSplit);
339 |                 currSplit = "";
340 |             }
341 | 
342 |             shouldSkip = isCharWhiteSpace(line[i]);
343 |             if(shouldSkip == false)
344 |             {
345 |                 // non-ws char, append to our vector
346 |                 lineSplit.emplace_back(std::string(1, line[i]));
347 |             }
348 |         }
349 |         else
350 |         {
351 |             currSplit.push_back(line[i]);
352 |         }
353 |     }
354 | 
355 |     if(currSplit.size() > 0)
356 |     {
357 |         lineSplit.emplace_back(currSplit);
358 |     }
359 | 
360 |     return 0;
361 | }
362 | 
363 | 
364 | // tokenizes the input instructions and appends them to the allInstructions set
365 | static int parseInstructionsParser(PARSED_DATA& parsedData, 
366 |                                    unsigned int lineNum,
367 |                                    string& line,
368 |                                    set<string>& registers,
369 |                                    set<string>& mnemonics,
370 |                                    map<string, Instruction*>& allInstructions)
371 | {
372 |     map<string, Instruction*>::iterator itr;
373 |     vector<string> lineSplit;
374 |     string opcode;
375 |     int result = 0;
376 | 
377 |     Instruction* currInstruction = new Instruction();
378 |     if(currInstruction == NULL)
379 |     {
380 |         cout << "[-] Error line " << lineNum << ": Failed to allocate!!" << endl;
381 |         goto ERROR_EXIT;        
382 |     }
383 | 
384 |     // We want to split these fillers from register values
385 |     // TODO: improve performance here
386 |     splitDisassemblyLine(lineSplit, line);
387 |     
388 |     // Our combining algorithm needs to be rewritten to support more than 26 
389 |     // tokens. For the time being bail
390 |     if(lineSplit.size() > MAX_TOKENS)
391 |     {
392 |         cout << "[-] Error line " << lineNum << ": Line has more than MAX_TOKENS!!" << endl;
393 |         cout << line << endl;
394 |         throw 1;
395 |         delete currInstruction;
396 |         goto ERROR_EXIT;
397 |     }
398 | 
399 |     // tokenize each line component and add it to the Instruction
400 |     for(unsigned int i = 0; i < lineSplit.size(); i++)
401 |     {
402 |         if(i == 0)
403 |         {
404 |             unsigned int opcodeBitLength = 0;
405 | 
406 |             // the first element on the line must be the opcode
407 |             result = isOpcode(lineSplit[i]);
408 |             if(result != true)
409 |             {
410 |                 cout << "[-] Error line " << lineNum << ": First field is not an hex opcode!!" << endl;
411 |                 cout << "[-] Got: " << lineSplit[i] << endl;
412 |                 delete currInstruction;
413 |                 goto ERROR_EXIT;
414 |             }
415 | 
416 |             currInstruction->setOpcode(lineSplit[i]);
417 | 
418 |             // we need to keep track of the maximum bit length for the
419 |             // combining stage
420 |             opcodeBitLength = currInstruction->getOpcode().length();
421 |             updateOpcodeSize(opcodeBitLength);
422 | 
423 |             if(opcodeBitLength > parsedData.maxOpcodeBits)
424 |             {
425 |                 parsedData.maxOpcodeBitsMutex.lock();
426 |                 if(opcodeBitLength > parsedData.maxOpcodeBits)
427 |                 {
428 |                     cout << "  [*] Updating bit length from " << parsedData.maxOpcodeBits << " to " << opcodeBitLength << endl;
429 |                     parsedData.maxOpcodeBits = opcodeBitLength;
430 |                 }
431 |                 parsedData.maxOpcodeBitsMutex.unlock();
432 |             }
433 |         }
434 |         else
435 |         {
436 |             InstructionComponentType currType;
437 | 
438 |             // all remaining elements on the line are components of the 
439 |             // instruction
440 |             if(isRegister(lineSplit[i]))
441 |             {
442 |                 currType = TYPE_REGISTER;                
443 |                 registers.insert(lineSplit[i]);
444 |                 
445 |             }
446 |             else if(isImmediate(lineSplit[i]))
447 |             {
448 |                 currType = TYPE_IMMEDIATE;
449 |             }
450 |             else
451 |             {            
452 |                 mnemonics.insert(lineSplit[i]);
453 |                 currType = TYPE_INSTRUCTION;
454 |             }
455 | 
456 |             currInstruction->addComponent(currType, lineSplit[i]);
457 |         }
458 |     } // for (int i = 0; i < lineSplit.size(); i++)
459 | 
460 |     // sanity check the instruction
461 |     result = currInstruction->validateInstruction();
462 |     if(result != true)
463 |     {
464 |         cout << "[-] Error line " << lineNum << ": Instruction is invalid!!" << endl;
465 |         delete currInstruction;
466 |         goto ERROR_EXIT;
467 |     }
468 | 
469 |     opcode = currInstruction->getOpcode();
470 | 
471 |     // check for duplicate instructions before inserting
472 |     itr = allInstructions.find(opcode);
473 |     if(itr != allInstructions.end())
474 |     {
475 |         cout << "[-] Error line " << lineNum << ": Found duplicate opcode!!" << endl;
476 |         delete currInstruction;
477 |         goto ERROR_EXIT;
478 |     }
479 | 
480 |     // everything is good, insert instruction into our set    
481 |     allInstructions.insert({{std::move(opcode), currInstruction}});
482 | 
483 |     return 0;
484 | 
485 | ERROR_EXIT:    
486 |     return -1;
487 | }
488 | 
489 | // tokenizes the input instructions and appends them to the allInstructions set
490 | int parseInstructions(PARSED_DATA& parsedData, unsigned int fileId)
491 | {
492 |     boost::timer::auto_cpu_timer t;
493 |     boost::asio::thread_pool threadPool(parsedData.numThreads);
494 |     unsigned int portion = 0;
495 |     unsigned long long fileSize = 0;
496 |     char* fileBuffer = NULL;
497 |     unsigned long long portionSize = 0;
498 |     unsigned long long start = 0;
499 | 
500 |     // sanity check thread value
501 |     if(parsedData.numThreads == 0)
502 |     {
503 |         cout << "[-] numThreads cannot be 0" << endl;
504 |         return -1;
505 |     }
506 | 
507 |     resetThreadPool();
508 | 
509 |     // TODO: review exit error flow
510 |     // TODO: why pass in fileId?
511 | 
512 |     // open the input file for parsing
513 |     boost::filesystem::path infile{parsedData.inputFilenames[fileId]};
514 |     boost::filesystem::ifstream ifs{infile, std::ios::ate};
515 | 
516 |     if(!ifs)
517 |     {
518 |         cout << "[-] Failed to open input file!!" << endl;
519 |         return -1;
520 |     }
521 | 
522 |     // get the file size
523 |     fileSize = ifs.tellg();    
524 |     ifs.seekg(0, std::ios::beg);
525 | 
526 |     // TODO: this throws
527 |     fileBuffer = new char[fileSize];
528 |     if(!fileBuffer)
529 |     {
530 |         cout << "[-] Failed to allocate buffer!!" << endl;
531 |         return -1;
532 |     }
533 | 
534 |     ifs.read(fileBuffer, fileSize);
535 |     ifs.close();
536 |     
537 |     //
538 |     // split the disassembly into 1/num threads pieces
539 |     //
540 |     portionSize = fileSize/parsedData.numThreads;
541 |     
542 |     for(unsigned int i = 0; i < parsedData.numThreads; i++)
543 |     {        
544 |         unsigned long long end = 0;
545 | 
546 |         if(start >= fileSize)
547 |         {
548 |             cout << "Reached end of file " << endl;
549 |             continue;
550 |         }
551 | 
552 |         if(i == parsedData.numThreads - 1)
553 |         {
554 |             // last thread, always set end to fileSize
555 |             end = fileSize - 1;
556 |         }
557 |         else
558 |         {
559 |             end = start + portionSize;
560 |             for(unsigned long long j = end; j < fileSize; j++)
561 |             {
562 |                 if(fileBuffer[j] == '\n')
563 |                 {
564 |                     end = j;
565 |                     break;
566 |                 }
567 |             }
568 |         }        
569 | 
570 |         // queue a worker to work on 1/n of the disassembly
571 |         boost::asio::post(threadPool, 
572 |                           boost::bind(parseInstructionsWorker,
573 |                                       boost::ref(parsedData),
574 |                                       fileBuffer,
575 |                                       start,
576 |                                       end));
577 |         start = end + 1;
578 |     }   
579 | 
580 |     // TODO: improve poll logic
581 |     while(1)
582 |     {
583 |         boost::this_thread::sleep(boost::posix_time::milliseconds(100));
584 | 
585 |         unsigned int completedCount = getWorkerCompletions();
586 |         unsigned int failCount = getWorkerFailures();
587 | 
588 |         //cout << "Test cases: " << completed_count << "/" << lineNum  << " Fail cases: " << fail_count << endl;
589 |         
590 |         // check if we exceeded our max number of failures
591 |         if(failCount > 0)
592 |         {
593 |             // abort the rest of the threads         
594 |             threadPool.stop();
595 |             break;
596 |         }
597 | 
598 |         // check if we finished our submitted jobs
599 |         if(completedCount >= portion)
600 |         {
601 |             // finished
602 |             break;
603 |         }
604 |     }
605 | 
606 |     threadPool.join();
607 | 
608 |     delete [] fileBuffer;
609 | 
610 |     if(getWorkerFailures() > 0)
611 |     {
612 |         return -1;
613 |     }
614 | 
615 |     // Copy the instructions into the combined instructions set. We need to
616 |     // save the original allInstructions to recreate the registers lists when
617 |     // we print out the instructions
618 |     parsedData.combinedInstructions = parsedData.allInstructions;
619 | 
620 |     // check if we have a variable length opcodes
621 |     parsedData.variableLengthISA = hasVariableLengthOpcodes();
622 |     return 0;
623 | }
624 | 
625 | // Walks through all instructions that have combined registers and figures out
626 | // the register list and register variable name and appends them to 
627 | // registerVariables. Once registerVariables is filled out attachVariables is 
628 | // filled out
629 | void  computeAttachVariables(PARSED_DATA& parsedData)
630 | {    
631 |     boost::timer::auto_cpu_timer t;
632 |     std::set<Instruction*>::iterator it;
633 | 
634 |     // iterate through all combined instructions and update registerVariables
635 |     for(auto& x: parsedData.combinedInstructions)
636 |     {
637 |         x.second->computeAttachVariables(parsedData.allInstructions,
638 |                                          parsedData.registerVariables,
639 |                                          parsedData.slas);
640 |     }
641 | 
642 |     for(auto& y: parsedData.registerVariables)
643 |     {
644 |         // y.second = string consisting all delimited by space
645 |         // y.first = register variable name
646 |         parsedData.attachVariables[y.second].insert(y.first);
647 |     }
648 |     return;
649 | }
650 | 
651 | // TODO: wrong comment
652 | // Walks through all instructions that have combined registers and figures out
653 | // the register list and register variable name and appends them to 
654 | // registerVariables. Once registerVariables is filled out attachVariables is 
655 | // filled out.
656 | void computeTokenInstructions(PARSED_DATA& parsedData)
657 | {    
658 |     boost::timer::auto_cpu_timer t;
659 |     std::set<Instruction*>::iterator it;
660 | 
661 |     // iterate through all combined instructions. getOpcodeOutputString() will
662 |     // append new tokens to the tokenInstructions set
663 |     for(auto& x: parsedData.combinedInstructions)
664 |     {
665 |         int index = convertOpcodeSizeToIndex(x.first.length());
666 |         if(index < 0)
667 |         {
668 |             cout << "Invalid opcode size!!" << endl;
669 |             throw 1;
670 |         }
671 | 
672 |         x.second->getOpcodeOutputString(parsedData.tokenInstructions[index]);
673 |     }
674 | 
675 |     return;
676 | }
677 | 
678 | // worker that deletes the instruction from parsedData.allinstructions
679 | int clearParserWorker(PARSED_DATA& parsedData, 
680 |                       unsigned long long start,
681 |                       unsigned long long end)
682 | {
683 |     map<string, Instruction*>::iterator startItr;
684 |     map<string, Instruction*>::iterator endItr;
685 | 
686 |     startItr = parsedData.allInstructions.begin();
687 | 
688 |     std::advance(startItr, start);
689 |     endItr = startItr;
690 |     std::advance(endItr, end-start + 1);
691 | 
692 |     for(; startItr != endItr; startItr++)
693 |     {
694 |         delete startItr->second;
695 |     }
696 | 
697 |     incrementWorkerCompletions();
698 |     return 0;
699 | }
700 | 
701 | // splits the instructions to be deleted onto a thread pool
702 | int clearParserScheduler(PARSED_DATA& parsedData)
703 | {
704 |     boost::asio::thread_pool threadPool(parsedData.numThreads);
705 |     unsigned int portion = 0;
706 |     unsigned long long numInstructions = 0;
707 |     unsigned long long portionSize = 0;
708 |     unsigned long long start = 0;
709 | 
710 |     // sanity check thread value
711 |     if(parsedData.numThreads == 0)
712 |     {
713 |         cout << "[-] numThreads cannot be 0" << endl;
714 |         return -1;
715 |     }
716 | 
717 |     resetThreadPool();
718 | 
719 |     //
720 |     // split freeing the instructions into 1/num threads pieces
721 |     //
722 |     numInstructions = parsedData.allInstructions.size();
723 |     if(numInstructions < 1024)
724 |     {
725 |         parsedData.numThreads = 1;
726 |         portionSize = numInstructions;
727 |     }
728 |     else
729 |     {
730 |         portionSize = numInstructions/parsedData.numThreads;
731 |     }
732 |     
733 |     for(unsigned int i = 0; i < parsedData.numThreads; i++)
734 |     {        
735 |         unsigned long long end = 0;
736 | 
737 |         if(i == parsedData.numThreads - 1)
738 |         {
739 |             // last thread, always set end to fileSize
740 |             end = numInstructions - 1;
741 |         }
742 |         else
743 |         {
744 |             end = start + portionSize;
745 |         }
746 | 
747 |         // queue a worker to work on 1/n of the disassembly
748 |         boost::asio::post(threadPool, 
749 |                           boost::bind(clearParserWorker,
750 |                                       boost::ref(parsedData),
751 |                                       start,
752 |                                       end));
753 |         start = end + 1;
754 |         portion++;
755 |     }
756 | 
757 |     // TODO: improve poll logic
758 |     while(1)
759 |     {
760 |         boost::this_thread::sleep(boost::posix_time::milliseconds(100));
761 |         unsigned int completedCount = getWorkerCompletions();
762 | 
763 |         // check if we finished our submitted jobs
764 |         if(completedCount >= portion)
765 |         {
766 |             // finished
767 |             break;
768 |         }
769 |     }
770 | 
771 |     threadPool.join();
772 |     return 0;
773 | }
774 | 
775 | // free the parser data structure
776 | void clearParserData(PARSED_DATA& parsedData, bool save_registers)
777 | {
778 |     boost::timer::auto_cpu_timer t;
779 |     cout << "[*] Freeing parser data" << endl;
780 | 
781 |     // free any instructions that we allocated during the combine phase
782 |     for(auto& y: parsedData.combinedInstructions)
783 |     {
784 |         if(y.second->getNeedsFree() == true)
785 |         {
786 |             delete y.second;
787 |         }
788 |     }
789 | 
790 |     // multithreaded free the allInstructions map
791 |     // this is a perf bottleneck, even after multithreading
792 |     if(parsedData.allInstructions.size() > 0)
793 |     {
794 |         clearParserScheduler(parsedData);
795 |         parsedData.allInstructions.clear();
796 |     }
797 | 
798 |     // no other data structures allocated Instructions    
799 |     parsedData.combinedInstructions.clear();
800 |     parsedData.registerVariables.clear();
801 |     parsedData.attachVariables.clear();
802 | 
803 |     for(unsigned int i = 0; i < sizeof(parsedData.tokenInstructions)/sizeof(parsedData.tokenInstructions[0]); i++)
804 |     {
805 |         parsedData.tokenInstructions[i].clear();
806 |     }
807 | 
808 |     if(!save_registers)
809 |     {
810 |         parsedData.registers.clear();
811 |         parsedData.mnemonics.clear();
812 |     }
813 | }
814 | 


--------------------------------------------------------------------------------
/parser.h:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: parser.h
  3 | //
  4 | // Parsing instructions from disassembly text file
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | #pragma once
 10 | 
 11 | #include <iostream>
 12 | #include <set>
 13 | #include <boost/thread/mutex.hpp>
 14 | #include <boost/algorithm/string.hpp>
 15 | #include <boost/filesystem/fstream.hpp>
 16 | #include <boost/regex.hpp>
 17 | #include "instruction.h"
 18 | #include "slautil/slautil.h"
 19 | using namespace std;
 20 | 
 21 | // if there are too many tokens we have to change our bit naming algorithm
 22 | #define MAX_TOKENS 26 
 23 | 
// The different ways two instructions can be merged into one.
// NOTE(review): the code that consumes these values is not part of this
// header (presumably the combine stage in combine.cpp) - confirm there.
enum COMBINE_TYPE
{
    COMBINE_DUPLICATES = 0, // instructions are identical except for a single 
                            // bit in the opcode
    COMBINE_IMMEDIATES = 1, // instructions are identical except for a single
                            // bit in the opcode and a single immediate field
    COMBINE_REGISTERS = 2,  // instructions are identical except for a single
                            // bit in the opcode and a single register field
    COMBINE_MAX = 3,        // one past the last combine type; presumably used
                            // as a count/upper bound - confirm against callers
};
 34 | 
 35 | // data we parsed from the instruction set
 36 | // we need this to create our output
 37 | typedef struct _PARSED_DATA
 38 | {
 39 |     // all instructions parsed. Instruction* was allocated by new and must be 
 40 |     // deleted.
 41 |     // string = the instruction opcode as a text string of 0s and 1s
 42 |     map<string, Instruction*> allInstructions;
 43 | 
 44 |     // synchronize access to allinstructions map
 45 |     boost::mutex allInstructionsMutex;
 46 | 
 47 |     // combined instructions (e.g. merge duplicates, registers, immediates, 
 48 |     // etc).
 49 |     // Shallow copy from allInstructions to start with and for that reason we
 50 |     // should not call delete on instructions.
 51 |     // string = the combined instrution opcode as a text string 0s, 1s, *s, 
 52 |     // capital letters (for registers), and lower case registers (for 
 53 |     // immediates)
 54 |     map<string, Instruction*> combinedInstructions;
 55 | 
 56 |     // all registers seen while parsing the instruction set
 57 |     set<string> registers;
 58 | 
 59 |     // synchronize access to registers set
 60 |     boost::mutex registersMutex;
 61 | 
 62 |     // all instruction mnemonics seen while parsing the instruction set
 63 |     // only used for debugging with --print-registers-only option
 64 |     set<string> mnemonics;
 65 | 
 66 |     // synchronize access to mnemonics set
 67 |     boost::mutex mnemonicsMutex;
 68 | 
 69 |     // number of bits for the biggest instruction opcode parsed
 70 |     unsigned int maxOpcodeBits;
 71 | 
 72 |     // synchronize access tot he maxOpcodeBits
 73 |     boost::mutex maxOpcodeBitsMutex;
 74 | 
 75 |     // set to true if the architecture has variable length instructions
 76 |     bool variableLengthISA;
 77 | 
 78 |     //
 79 |     // Output datas
 80 |     //
 81 | 
 82 |     // registerVariables and attachVariables are used for outputting the 
 83 |     // "attach variables" field in the output
 84 |     // key = register variable name. Ex "regA_04_07"
 85 |     // value = string of space delimited registers. Ex: "r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15"
 86 |     map<string, string> registerVariables;
 87 | 
 88 |     // can be thought of as an inverse registerVariables where we group all 
 89 |     // register variables that have the same list of registers
 90 |     // key = string of space delimted registers. Ex: "r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15"
 91 |     // value = set of all register variable names that have the same list of registers. Ex "regA_10_10", "regC_10_10", "regE_10_10"
 92 |     map<string, set<string>> attachVariables;
 93 | 
 94 |     // used for outputting the "define token instr pieces"
 95 |     // to support variable length architectectures:
 96 |     // - [0] - 1 byte instructions
 97 |     // - [0] - 2 byte instructions
 98 |     // - [0] - 3 byte instructions
 99 |     // - [0] - 4 byte instructions
100 |     set<string> tokenInstructions[4];
101 | 
102 |     // used for outtputing the "duplicated registers" export section
103 |     map<string, unsigned int> duplicatedRegisters;
104 | 
105 |     //
106 |     // command line options, we need some of these for our output
107 |     //
108 | 
109 |     // Path to file(s) for parsing
110 |     vector<string> inputFilenames;
111 |     
112 |     // list of loaded .sla files
113 |     // needed similar to the allInstructions map for generating register attach
114 |     // directives
115 |     vector<Slautil> slas;
116 | 
117 |     // endianess of the instruction set. Can be either "little" or "big".
118 |     // Needed in the output files
119 |     string endianness;
120 | 
121 |     // name of the processor
122 |     string processorName;
123 | 
124 |     // family of the processor
125 |     string processorFamily;
126 | 
127 |     // alignment of the instruction set
128 |     unsigned int alignment;
129 | 
130 |     // bitness of the instruction set
131 |     unsigned int bitness;
132 | 
133 |     // whether or not to display opcodes as comments in the outputted .sla file
134 |     // useful for debugging
135 |     bool omitOpcodes;
136 |     
137 |     // whether or not to display an example combined instruction as comments in
138 |     // the outputted .sla file. useful for debugging
139 |     bool omitExampleInstructions;
140 | 
141 |     // number of threads to use for each thread pool
142 |     // defaults to number of physical CPUs by default
143 |     unsigned int numThreads;
144 | 
145 | } PARSED_DATA, *PPARSED_DATA;
146 | 
// Register table helpers. Callers treat a non-zero result from addRegisters()
// as a failure.
// NOTE(review): the definitions are not visible from this header;
// initRegisters() presumably follows the same 0 == success convention.
int initRegisters(void);
int addRegisters(vector<string>& additionalRegisters);

// Token classification predicates. The parsers use isRegister() and
// isImmediate() to decide whether an instruction component is a register, an
// immediate, or (when neither matches) part of the instruction text.
bool isOpcode(const string& str);
bool isInteger(const string &str);
bool isImmediate(const string& str);
bool isRegister(const string& str);

// Tokenizes the instructions in parsedData.inputFilenames[fileId] and appends
// them to parsedData.allInstructions. Returns 0 on success, -1 on failure.
int parseInstructions(PARSED_DATA& parsedData, unsigned int fileId);

// Fills parsedData.registerVariables from the combined instructions and then
// parsedData.attachVariables from registerVariables.
void computeAttachVariables(PARSED_DATA& parsedData);

// Fills parsedData.tokenInstructions from the combined instructions. Throws
// the int 1 if an opcode size cannot be mapped to a tokenInstructions index.
void computeTokenInstructions(PARSED_DATA& parsedData);

// Frees the parsed instructions and clears the derived tables. The registers
// and mnemonics sets are kept when save_registers is true.
void clearParserData(PARSED_DATA& parsedData, bool save_registers);

// Maps an opcode size in bits to an index into PARSED_DATA::tokenInstructions.
// Callers treat a negative result as an invalid opcode size.
int convertOpcodeSizeToIndex(unsigned int opcodeSizeInBits);
158 | 


--------------------------------------------------------------------------------
/parser_sla.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: parser_sla.cpp
  3 | //
  4 | // Parsing and combining the instructions from .sla
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | #include "slautil/slautil.h"
 10 | #include "parser.h"
 11 | 
 12 | // Tokenizes the input instructions from the .sla and appends them to the
 13 | // allInstructions set
 14 | int parseInstructionsSla(PARSED_DATA& parsedData, unsigned int fileId)
 15 | {   
 16 |     Slautil slautil;
 17 |     vector<string> registers;
 18 |     unsigned int count = 0;
 19 |     int result = 0;
 20 | 
 21 |     result = slautil.loadSla(parsedData.inputFilenames[fileId]);
 22 |     if(result != 0)
 23 |     {
 24 |         return result;
 25 |     }
 26 | 
 27 |     result = slautil.getConstructorCount(count);
 28 |     if(result != 0)
 29 |     {
 30 |         cout << "Failed to get constructor count" << endl;
 31 |         return result;
 32 |     }
 33 | 
 34 |     result = slautil.getRegisters(registers);
 35 |     if(result != 0)
 36 |     {
 37 |         cout << "Failed to get sla registers" << endl;
 38 |         return result;
 39 |     }
 40 | 
 41 |     result = addRegisters(registers);
 42 |     if(result != 0)
 43 |     {
 44 |         cout << "Failed to add sla registers" << endl;
 45 |     }
 46 | 
 47 |     for(unsigned int j = 0; j < registers.size(); j++)
 48 |     {
 49 |         parsedData.registers.insert(registers[j]);
 50 |     }
 51 | 
 52 |     for(unsigned int i = 0; i < count; i++)
 53 |     {
 54 |         string bit_pattern;
 55 |         string constructor_text;
 56 |         string line;
 57 |         vector<string> lineSplit;
 58 |         Instruction* currInstruction = NULL;
 59 |         bool isCombined = false;
 60 |         map<string, Instruction*>::iterator itr;
 61 | 
 62 |         result = slautil.getConstructorBitPattern(i, bit_pattern);
 63 |         if(result != 0)
 64 |         {
 65 |             cout << "Failed to get bit pattern" << endl;
 66 |             return result;
 67 |         }
 68 | 
 69 |         result = slautil.getConstructorText(i, constructor_text);
 70 |         if(result != 0)
 71 |         {
 72 |             cout << "Failed to get constructor text" << endl;
 73 |             return result;
 74 |         }
 75 |         
 76 |         line = bit_pattern + " " + constructor_text;
 77 | 
 78 |         // We want to split these fillers from register values
 79 |         // The simplest way I could come up was to do this but it's slow...
 80 |         // BUGBUG: improve performance here
 81 |         // TODO; replace with other impl
 82 |         boost::replace_all(line, ",", " , ");
 83 |         boost::replace_all(line, "@", " @ ");
 84 |         boost::replace_all(line, "(", " ( ");
 85 |         boost::replace_all(line, ")", " ) ");
 86 |         boost::replace_all(line, "[", " [ ");
 87 |         boost::replace_all(line, "]", " ] ");
 88 |         boost::replace_all(line, "+", " + ");
 89 |         boost::replace_all(line, "-", " - ");
 90 |         boost::replace_all(line, "#", " # ");
 91 |         boost::replace_all(line, "_DUP", ""); // TODO: hack to workaround not being able to have duplicate
 92 |                                               // registers in a single instruction
 93 |         boost::trim(line);
 94 | 
 95 |         // split the line into components
 96 |         boost::split(lineSplit, line, boost::algorithm::is_space(), boost::token_compress_on);
 97 | 
 98 |         // Our combining algorithm needs to be rewritten to support more than 
 99 |         // 26 tokens. For the time being bail
100 |         if(lineSplit.size() > MAX_TOKENS)
101 |         {
102 |             cout << "[-] Error constructor " << i << ": Line has more than MAX_TOKENS!!" << endl;
103 |             return -1;
104 |         }
105 | 
106 |         currInstruction = new Instruction();
107 |         if(currInstruction == NULL)
108 |         {
109 |             cout << "[-] Error constructur " << i << ": Failed to allocate!!" << endl;
110 |             return -1;
111 |         }
112 | 
113 |         // tokenize each line component and add it to the Instruction
114 |         for (unsigned int i = 0; i < lineSplit.size(); i++)
115 |         {
116 |             if(i == 0)
117 |             {
118 |                 unsigned int opcodeBitLength = 0;
119 | 
120 |                 currInstruction->setOpcodeBitString(lineSplit[i]);
121 | 
122 |                 // we need to keep track of the maximum bit length for the 
123 |                 // combining stage
124 |                 opcodeBitLength = currInstruction->getOpcode().length();
125 |                 if(opcodeBitLength > parsedData.maxOpcodeBits)
126 |                 {
127 |                     //cout << "Updating bit length from " << parsedData.maxOpcodeBits << " to " << opcodeBitLength << endl;
128 |                     parsedData.maxOpcodeBits = opcodeBitLength;
129 |                 }
130 |             }
131 |             else
132 |             {
133 |                 InstructionComponentType currType;
134 | 
135 |                 if (lineSplit[i].find("_DUP") != std::string::npos)
136 |                 {
137 |                     std::cout << "found! " << lineSplit[i] << endl;
138 |                     throw 1;
139 |                 }            
140 | 
141 |                 // all remaining elements on the line are components of the 
142 |                 // instruction
143 |                 if(isRegister(lineSplit[i]))
144 |                 {
145 |                     currType = TYPE_REGISTER;
146 | 
147 |                     if(lineSplit[i] == "__register_list__")
148 |                     {
149 |                         currInstruction->setCombined(true);
150 |                         isCombined = true;
151 |                     }
152 |                     else
153 |                     {
154 |                         parsedData.registers.insert(lineSplit[i]);
155 |                     }
156 |                 }
157 |                 else if(isImmediate(lineSplit[i]))
158 |                 {
159 |                     currType = TYPE_IMMEDIATE;
160 | 
161 |                     if(lineSplit[i] == "__immediate_list__")
162 |                     {
163 |                         currInstruction->setCombined(true);
164 |                         isCombined = true;
165 |                     }
166 |                 }
167 |                 else
168 |                 {
169 |                     currType = TYPE_INSTRUCTION;
170 |                 }
171 | 
172 |                 currInstruction->addComponent(currType,
173 |                                               lineSplit[i],
174 |                                               isCombined);
175 |             }
176 |         } // for (int i = 0; i < lineSplit.size(); i++)
177 | 
178 |         // sanity check the instruction
179 |         result = currInstruction->validateInstruction();
180 |         if(result != true)
181 |         {
182 |             cout << "[-] Error line " << i << ": Instruction is invalid!!" << endl;
183 |             delete currInstruction;
184 |             return -1;
185 |         }
186 | 
187 |         // check for duplicate instructions before inserting
188 |         itr = parsedData.allInstructions.find(currInstruction->getOpcode());
189 |         if(itr != parsedData.allInstructions.end())
190 |         {
191 |             cout << "[-] Error line " << i << ": Found duplicate opcode!!" << endl;
192 |             delete currInstruction;
193 |             return -1;
194 |         }
195 | 
196 |         // everything is good, insert instruction into our set
197 |         parsedData.allInstructions.insert({{currInstruction->getOpcode(),
198 |                                             currInstruction}});
199 | 
200 |     } // for(unsigned int i = 0; i < count; i++)
201 | 
202 |     // Copy the instructions into the combined instructions set
203 |     // We need to save the original allInstructions to recreate the registers
204 |     // lists when we print out the instructions
205 |     parsedData.combinedInstructions.merge(parsedData.allInstructions);
206 | 
207 |     parsedData.slas.push_back(slautil);
208 |     return 0;   
209 | }
210 | 


--------------------------------------------------------------------------------
/parser_sla.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: parser_sla.h
 3 | //
 4 | // Parsing and combining the instructions from .sla
 5 | //
 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 7 | // Licensed under the Apache 2.0 License.
 8 | //-----------------------------------------------------------------------------
 9 | #pragma once
10 | 
11 | #include <iostream>
12 | #include <set>
13 | #include <boost/algorithm/string.hpp>
14 | #include <boost/filesystem/fstream.hpp>
15 | #include <boost/regex.hpp>
16 | #include "instruction.h"
17 | using namespace std;
18 | 
// Tokenizes the constructors of the .sla file parsedData.inputFilenames[fileId]
// and adds the resulting instructions to parsedData. Returns 0 on success and
// a non-zero value on failure.
// NOTE(review): PARSED_DATA is declared in parser.h, which this header does
// not include itself - confirm every includer pulls in parser.h first.
int parseInstructionsSla(PARSED_DATA& parsedData, unsigned int fileId);
20 | 


--------------------------------------------------------------------------------
/slautil/slautil.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: slautil.cpp
  3 | //
  4 | // Misc helper functions for working with .sla files
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | #include <boost/property_tree/ptree.hpp>
 10 | #include <boost/property_tree/xml_parser.hpp>
 11 | #include <boost/foreach.hpp>
 12 | #include <string>
 13 | #include <iostream>
 14 | #include <fstream>
 15 | #include "slautil.h"
 16 | 
 17 | using namespace std;
 18 | namespace pt = boost::property_tree;
 19 | 
 20 | // sorting bit_patterns by start_bit
 21 | struct less_than_key
 22 | {
 23 |     inline bool operator() (const BIT_PATTERN& a, const BIT_PATTERN& b)
 24 |     {
 25 |         return (b.start_bit < a.start_bit);
 26 |     }
 27 | };
 28 | 
 29 | // default constructor
 30 | Slautil::Slautil(void)
 31 | {
 32 |     m_initialized = false;
 33 | }
 34 | 
 35 | // load the processor module file
 36 | // currently only supports XML .sla files
 37 | int Slautil::loadSla(const string& filename)
 38 | {
 39 |     int status = 0;
 40 | 
 41 |     status = this->loadSlaXML(filename);
 42 |     if(status != SLA_SUCCESS)
 43 |     {
 44 |         return status;
 45 |     }
 46 | 
 47 |     m_initialized = true;
 48 |     return SLA_SUCCESS;
 49 | }
 50 | 
 51 | // return the registers from the processor module
 52 | int Slautil::getRegisters(vector<string>& registers)
 53 | {
 54 |     if(!m_initialized)
 55 |     {
 56 |         return NOT_INITIALIZED;
 57 |     }
 58 | 
 59 |     registers.reserve(m_registers.size());
 60 |     std::copy(m_registers.begin(),
 61 |               m_registers.end(),
 62 |               std::back_inserter(registers));
 63 | 
 64 |     return SLA_SUCCESS;
 65 | }
 66 | 
 67 | // get the number of instructions (constructors) in the processor module
 68 | int Slautil::getConstructorCount(unsigned int& count)
 69 | {
 70 |     if(!m_initialized)
 71 |     {
 72 |         return NOT_INITIALIZED;
 73 |     }
 74 | 
 75 |     count = m_constructors.size();
 76 |     return SLA_SUCCESS;
 77 | }
 78 | 
 79 | // generate the opcode bit patterns for immediates and registers
 80 | int Slautil::addNonOpcodeBitPatterns(void)
 81 | {
 82 |     for(unsigned int i = 0; i < m_constructors.size(); i++)
 83 |     {
 84 |         unsigned int num_immediates = 0;
 85 |         unsigned int num_registers = 0;
 86 | 
 87 |         PCONSTRUCTOR curr_constructor = &m_constructors[i];
 88 | 
 89 |         // TODO; cleanup function
 90 | 
 91 |         for(unsigned int j = 0;
 92 |             j < curr_constructor->constructor_pieces.size();
 93 |             j++)
 94 |         {
 95 |             PCONSTRUCTOR_PIECE curr_constructor_piece = NULL;
 96 | 
 97 |             curr_constructor_piece = &curr_constructor->constructor_pieces[j];
 98 | 
 99 |             if(curr_constructor_piece->type == "opprint")
100 |             {
101 |                 boost::unordered_map<unsigned int, varlist_sym> ::iterator itr;
102 |                 boost::unordered_map<unsigned int, OPERAND_SYM> ::iterator itr2;
103 | 
104 |                 //cout << curr_constructor_piece->id << endl;
105 | 
106 |                 itr = m_varlist_syms.find(curr_constructor_piece->id);
107 |                 if(itr == m_varlist_syms.end())
108 |                 {
109 |                     itr2 = m_operand_syms.find(curr_constructor_piece->id);
110 |                     if(itr2 == m_operand_syms.end())
111 |                     {
112 |                         boost::unordered_map<unsigned int, string> ::iterator itr3;
113 | 
114 |                         itr3 = m_vars.find(curr_constructor_piece->id);
115 |                         if(itr3 == m_vars.end())
116 |                         {
117 |                             cout << "Failed to find " << curr_constructor_piece->id << endl;
118 |                             throw 1;
119 |                             continue;
120 |                         }
121 | 
122 |                         if(std::find(m_registers.begin(), m_registers.end(), itr3->second) != m_registers.end())
123 |                         {
124 |                             num_registers++;
125 |                             continue;
126 |                         }
127 |                         else
128 |                         {
129 |                             cout << "What the heck should never get here!! " << itr3->second << endl;
130 |                             throw 4;
131 |                         }
132 | 
133 |                         throw 1;
134 |                         continue;
135 |                     }
136 | 
137 |                     // found operand_syms
138 |                     addBitPattern(m_constructors[i],
139 |                                   itr2->second.bitfield,
140 |                                   "imm",
141 |                                   num_immediates);
142 |                     num_immediates++;
143 |                     continue;
144 |                 }
145 | 
146 |                 addBitPattern(m_constructors[i],
147 |                              itr->second.bitfield,
148 |                              "reg",
149 |                              num_registers);
150 |                 num_registers++;
151 |                 continue;
152 |             }
153 |         } // for(unsigned int j = 0; j < curr_constructor->constructor_pieces.size(); j++)
154 | 
155 |         // sort the bit patterns
156 |         std::sort(curr_constructor->bit_patterns.begin(),
157 |                   curr_constructor->bit_patterns.end(),
158 |                   less_than_key());
159 |     }
160 | 
161 |     return SLA_SUCCESS;
162 | }
163 | 
164 | // adds a bit pattern to a constructor
165 | int Slautil::addBitPattern(CONSTRUCTOR& curr_constructor,
166 |                            const TOKENFIELD& bitfield,
167 |                            const string& type,
168 |                            unsigned int count)
169 | {
170 |     BIT_PATTERN curr_bit_pattern;
171 |     unsigned char patternChar = '\x0';
172 | 
173 |     if(count >= 25)
174 |     {
175 |         return -1;
176 |     }
177 | 
178 |     curr_bit_pattern.start_bit = bitfield.startbit;
179 |     curr_bit_pattern.end_bit = bitfield.endbit;
180 |     curr_bit_pattern.pattern_type = type;
181 | 
182 |     if(type == "imm")
183 |     {
184 |         patternChar = 'a' + count;
185 |     }
186 |     else if(type == "reg")
187 |     {
188 |         patternChar = 'A' + count;
189 |     }
190 |     else
191 |     {
192 |         patternChar = '?';
193 |     }
194 | 
195 |     for(unsigned int i = curr_bit_pattern.start_bit;
196 |        i <= curr_bit_pattern.end_bit;
197 |        i++)
198 |     {
199 |         curr_bit_pattern.pattern += patternChar;
200 |     }
201 | 
202 |     curr_constructor.bit_patterns.push_back(curr_bit_pattern);
203 | 
204 |     return SLA_SUCCESS;
205 | }
206 | 
207 | // get the opcode bit pattern given an constructor id
208 | int Slautil::getConstructorBitPattern(unsigned int id, string& bit_pattern)
209 | {
210 |     PCONSTRUCTOR curr_constructor = NULL;
211 |     unsigned int size = 0;
212 | 
213 |     if(id >= m_constructors.size())
214 |     {
215 |         cout << "Bad ID!!" << endl;
216 |         return -2;
217 |     }
218 | 
219 |     curr_constructor = &m_constructors[id];
220 |     bit_pattern = "";
221 | 
222 |     for(unsigned int k = 0; k < curr_constructor->bit_patterns.size(); k++)
223 |     {
224 |         PBIT_PATTERN curr_bit_pattern = &curr_constructor->bit_patterns[k];
225 | 
226 |         //bit_pattern += curr_bit_pattern->pattern_type + "_" + to_string(curr_bit_pattern->start_bit) + "_" + to_string(curr_bit_pattern->end_bit) + "=";
227 | 
228 |         if(curr_bit_pattern->pattern_type == "opcode")
229 |         {
230 |             bit_pattern += curr_constructor->bit_patterns[k].pattern;
231 |         }
232 |         else if(curr_bit_pattern->pattern_type == "reg")
233 |         {
234 |             size = curr_bit_pattern->end_bit -
235 |                    curr_bit_pattern->start_bit + 1;
236 |             bit_pattern += string(size, curr_bit_pattern->pattern[0]);
237 |         }
238 |         else if(curr_bit_pattern->pattern_type == "imm")
239 |         {
240 |             size = curr_bit_pattern->end_bit -
241 |                    curr_bit_pattern->start_bit + 1;
242 |             bit_pattern += string(size, curr_bit_pattern->pattern[0]);
243 |         }
244 |     }
245 | 
246 |     // sanity check the bit pattern size
247 |     if(bit_pattern.size() == 0)
248 |     {
249 |         return -1;
250 |     }
251 | 
252 |     return SLA_SUCCESS;
253 | }
254 | 
255 | // get the instruction mnemonic given a constructor id
256 | int Slautil::getConstructorText(unsigned int id, string& constructor_text)
257 | {
258 |     string unused;
259 |     return getConstructorText(id, constructor_text, false, unused);
260 | }
261 | 
262 | // get the instruction mnemonic given a constructor id
263 | int Slautil::getConstructorText(unsigned int id,
264 |                                 string& constructor_text,
265 |                                 bool use_bit_pattern,
266 |                                 const string& bit_pattern)
267 | {
268 |     PCONSTRUCTOR curr_constructor = NULL;
269 | 
270 |     if(!m_initialized)
271 |     {
272 |         return NOT_INITIALIZED;
273 |     }
274 | 
275 |     if(id >= m_constructors.size())
276 |     {
277 |         cout << "Bad ID!!" << endl;
278 |         return -2;
279 |     }
280 | 
281 |     curr_constructor = &m_constructors[id];
282 |     constructor_text = "";
283 | 
284 |     for(unsigned int j = 0; j < curr_constructor->constructor_pieces.size(); j++)
285 |     {
286 |         PCONSTRUCTOR_PIECE curr_constructor_piece = NULL;
287 | 
288 |         curr_constructor_piece = &curr_constructor->constructor_pieces[j];
289 | 
290 |         if(curr_constructor_piece->type == "print")
291 |         {
292 |             constructor_text += curr_constructor_piece->part;
293 |         }
294 |         else if(curr_constructor_piece->type == "opprint")
295 |         {
296 |             //cout << curr_constructor_piece->type << endl;
297 |             //cout << curr_constructor_piece->id << endl;
298 | 
299 |             // todo change logic
300 |             boost::unordered_map<unsigned int, varlist_sym> ::iterator itr;
301 |             boost::unordered_map<unsigned int, OPERAND_SYM> ::iterator itr2;
302 | 
303 |             itr = m_varlist_syms.find(curr_constructor_piece->id);
304 |             if(itr == m_varlist_syms.end())
305 |             {
306 |                 itr2 = m_operand_syms.find(curr_constructor_piece->id);
307 |                 if(itr2 == m_operand_syms.end())
308 |                 {
309 |                     boost::unordered_map<unsigned int, string> ::iterator itr3;
310 |                     itr3 = m_vars.find(curr_constructor_piece->id);
311 | 
312 |                     if(itr3 == m_vars.end())
313 |                     {
314 |                         cout << "Failed to find " << curr_constructor_piece->id << endl;
315 |                         throw 1;
316 |                         continue;
317 |                     }
318 | 
319 |                     constructor_text += itr3->second;
320 |                     continue;
321 |                 }
322 |                 else
323 |                 {
324 |                     if(use_bit_pattern == false)
325 |                     {
326 |                         constructor_text += "__immediate_list__";
327 |                     }
328 |                     else
329 |                     {
330 |                         unsigned int value = 0;
331 |                         convertBitFieldToValue(itr2->second.bitfield,
332 |                                                bit_pattern,
333 |                                                value);
334 | 
335 |                         stringstream ss;
336 |                         ss << setbase(16) << value;
337 | 
338 |                         constructor_text += "0x" + ss.str();
339 |                     }
340 |                 }
341 | 
342 |                 continue;
343 |             }
344 |             else
345 |             {
346 |                 if(use_bit_pattern == false)
347 |                 {
348 |                     constructor_text += "__register_list__";
349 |                 }
350 |                 else
351 |                 {
352 |                     unsigned int register_index = 0;
353 |                     convertBitFieldToValue(itr->second.bitfield,
354 |                                            bit_pattern,
355 |                                            register_index);
356 | 
357 |                     if(register_index < itr->second.register_ids.size())
358 |                     {
359 |                         boost::unordered_map<unsigned int, string> ::iterator itr3;
360 |                         itr3 = m_vars.find(itr->second.register_ids[register_index]);
361 | 
362 |                         constructor_text += itr3->second;
363 |                     }
364 |                     else
365 |                     {
366 |                         constructor_text += "___ERROR_REGISTER__INDEX__";
367 |                     }
368 |                 }
369 |             }
370 |         }
371 |     } // for(unsigned int j = 0; j < curr_constructor->constructor_pieces.size(); j++)
372 | 
373 |     return SLA_SUCCESS;
374 | }
375 | 
376 | // get the constructor register by id
377 | int Slautil::getConstructorTextRegisterById(unsigned int id,
378 |                                             string& register_name,
379 |                                             unsigned int register_number,
380 |                                             string& bit_pattern)
381 | {
382 |     PCONSTRUCTOR curr_constructor = NULL;
383 |     unsigned int registers_count = 0;
384 | 
385 |     // TODO: sloppy, add error handling
386 | 
387 |     if(!m_initialized)
388 |     {
389 |         return NOT_INITIALIZED;
390 |     }
391 | 
392 |     if(id >= m_constructors.size())
393 |     {
394 |         cout << "Bad ID!!" << endl;
395 |         return -2;
396 |     }
397 | 
398 |     curr_constructor = &m_constructors[id];
399 |     register_name = "";
400 | 
401 |     for(unsigned int j = 0;
402 |         j < curr_constructor->constructor_pieces.size();
403 |         j++)
404 |     {
405 |         PCONSTRUCTOR_PIECE curr_constructor_piece = NULL;
406 |         curr_constructor_piece = &curr_constructor->constructor_pieces[j];
407 | 
408 |         if(curr_constructor_piece->type == "opprint")
409 |         {
410 |             // todo change logic
411 |             boost::unordered_map<unsigned int, varlist_sym> ::iterator itr;
412 |             boost::unordered_map<unsigned int, OPERAND_SYM> ::iterator itr2;
413 | 
414 |             itr = m_varlist_syms.find(curr_constructor_piece->id);
415 |             if(itr != m_varlist_syms.end())
416 |             {
417 |                 if(registers_count != register_number)
418 |                 {
419 |                     registers_count++;
420 |                     continue;
421 |                 }
422 | 
423 |                 unsigned int register_index = 0;
424 | 
425 |                 /*
426 |                 cout << "itr->second.bitfield " << &itr->second.bitfield << endl;
427 |                 cout << "bitpattern " << bit_pattern << endl;
428 |                 cout << "regindex " << register_index << endl;
429 |                 */
430 | 
431 |                 convertBitFieldToValue(itr->second.bitfield,
432 |                                        bit_pattern,
433 |                                        register_index);
434 | 
435 |                 if(register_index < itr->second.register_ids.size())
436 |                 {
437 |                     boost::unordered_map<unsigned int, string> ::iterator itr3;
438 |                     itr3 = m_vars.find(itr->second.register_ids[register_index]);
439 |                     register_name += itr3->second;
440 |                     return 0;
441 |                 }
442 |                 else
443 |                 {
444 |                     cout << "bad bad bad" << endl;
445 |                     register_name += "___ERROR_REGISTER__INDEX__";
446 |                     cout << register_name << endl;
447 |                     throw 1;
448 |                     return 0;
449 |                 }
450 |             }
451 | 
452 |             boost::unordered_map<unsigned int, string> ::iterator itr3;
453 |             itr3 = m_vars.find(curr_constructor_piece->id);
454 |             if(itr3 == m_vars.end())
455 |             {
456 |                 cout << "Failed to find " << curr_constructor_piece->id << endl;
457 |                 throw 1;
458 |                 continue;
459 |             }
460 |             else
461 |             {
462 |                 if(std::find(m_registers.begin(), m_registers.end(), itr3->second) != m_registers.end())
463 |                 {
464 |                     if(registers_count == register_number)
465 |                     {
466 |                         register_name = itr3->second;
467 |                         //cout << "FOUND " << register_name << endl;
468 |                         //throw 1;
469 |                         return 0;
470 |                     }
471 | 
472 |                     registers_count++;
473 |                     continue;
474 |                 }
475 |                 continue;
476 |             }
477 |         }
478 |         else if(curr_constructor_piece->type == "print")
479 |         {
480 |             // TODO:
481 |             // BUGBUG: incorrect hack
482 |             if(curr_constructor_piece->part[0] == 'r' &&
483 |                curr_constructor_piece->part[1] == '0')
484 |             {
485 |                 //cout << "print: " << curr_constructor_piece->part << endl;
486 | 
487 |                 if(registers_count == register_number)
488 |                 {
489 |                     register_name = "r0";
490 |                     //cout << "FOUND " << register_name << endl;
491 |                     return 0;
492 |                 }
493 | 
494 |                 registers_count++;
495 |                 continue;
496 |             }
497 |         }
498 |     }
499 | 
500 |     // TODO; should never get here
501 |     cout << "RC " << registers_count << "  register_number " << register_number << endl;
502 |     cout << "Failing here" << endl;
503 |     return -1;
504 | }
505 | 
506 | // get a constructor mnemonic via opcode bit string
507 | int Slautil::getConstructorTextByBitPattern(const string& bit_pattern,
508 |                                             string& constructor_text)
509 | {
510 |     unsigned int id = 0;
511 |     int result = 0;
512 | 
513 |     result = getConstructorIdByBitPattern(bit_pattern, id);
514 |     if(result != 0)
515 |     {
516 |         //cout << "Failed to find bit pattern" << endl;
517 |         return result;
518 |     }
519 | 
520 |     result = getConstructorText(id, constructor_text, true, bit_pattern);
521 |     if(result != 0)
522 |     {
523 |         //cout << "Failed to get constructor text" << endl;
524 |         return result;
525 |     }
526 | 
527 |     return SLA_SUCCESS;
528 | }
529 | 
530 | // get a constructor ID by opcode bit string
531 | int Slautil::getConstructorIdByBitPattern(const string& bit_pattern,
532 |                                           unsigned int& id)
533 | {
534 |     unsigned int count;
535 |     int result = 0;
536 |     id = 0xffffffff;
537 | 
538 |     if(!m_initialized)
539 |     {
540 |         return NOT_INITIALIZED;
541 |     }
542 | 
543 |     result = this->getConstructorCount(count);
544 |     if(result != 0)
545 |     {
546 |         cout << "Failed to get constructor count" << endl;
547 |         return result;
548 |     }
549 | 
550 |     for(unsigned int i = 0; i < count; i++)
551 |     {
552 |         string bit_pattern2;
553 |         result = this->getConstructorBitPattern(i, bit_pattern2);
554 |         if(result != 0)
555 |         {
556 |             cout << "Failed to get bit pattern" << endl;
557 |             return result;
558 |         }
559 | 
560 |         result = this->compareBitPatterns(bit_pattern, bit_pattern2);
561 |         if(result == 0)
562 |         {
563 |             if(id != 0xffffffff)
564 |             {
565 |                 //cout << "Found duplicate!!" << endl;
566 |                 //return -1;
567 |             }
568 |             id = i;
569 |         }
570 |     }
571 | 
572 |     if(id != 0xffffffff)
573 |     {
574 |         return SLA_SUCCESS;
575 |     }
576 | 
577 |     return -1;
578 | }
579 | 
580 | // compare two opcode bit patterns
581 | // has fuzzy logic for combined fields
582 | int Slautil::compareBitPatterns(const string& a, const string& b)
583 | {
584 |     bool a_is_digit = true;
585 |     bool b_is_digit = true;
586 | 
587 |     if(a.size() != b.size())
588 |     {
589 |         return -1;
590 |     }
591 | 
592 |     for(unsigned int i = 0; i < a.size(); i++)
593 |     {
594 |         a_is_digit = ((a[i] == '0') || (a[i] == '1'));
595 |         b_is_digit = ((b[i] == '0') || (b[i] == '1'));
596 | 
597 |         if(a_is_digit != b_is_digit)
598 |         {
599 |             // one is a digit, the other isn't
600 |             // this is fine
601 |             continue;
602 |         }
603 | 
604 |         // both are digits or non-digits
605 |         // must be the same
606 |         if(a[i] != b[i])
607 |         {
608 |             return -1;
609 |         }
610 |     }
611 | 
612 |     return 0;
613 | }
614 | 
615 | // converts a bit field into a value
616 | int Slautil::convertBitFieldToValue(TOKENFIELD& bitfield,
617 |                                     const string& bit_pattern,
618 |                                     unsigned int& value)
619 | {
620 |     value = 0;
621 | 
622 |     unsigned int bit_pattern_end = bit_pattern.length();
623 | 
624 |     if(bitfield.startbit >= bit_pattern_end ||
625 |        bitfield.endbit >= bit_pattern_end)
626 |     {
627 |         cout << "Invalid bit field\bit pattern combination!!" << endl;
628 |         throw 2;
629 |     }
630 | 
631 |     for(unsigned int i = bitfield.startbit; i <= bitfield.endbit; i++)
632 |     {
633 |         unsigned int bit_pos = i - bitfield.startbit;
634 | 
635 |         if(bit_pattern[bit_pattern_end - i - 1] == '1')
636 |         {
637 |             value += (1 << bit_pos);
638 |         }
639 |         else if(bit_pattern[bit_pattern_end - i - 1] == '0')
640 |         {
641 |             // don't do anything for zero
642 |         }
643 |         else
644 |         {
645 |             // TODO fix
646 |             cout << "Unexpected bit string val!!" << endl;
647 |             throw 1;
648 |         }
649 |     }
650 | 
651 |     return 0;
652 | }
653 | 


--------------------------------------------------------------------------------
/slautil/slautil.h:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: slautil.h
  3 | //
  4 | // Misc helper functions for working with .sla files
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | #pragma once
 10 | 
 11 | #include <boost/property_tree/ptree.hpp>
 12 | #include <boost/property_tree/xml_parser.hpp>
 13 | #include <boost/foreach.hpp>
 14 | #include <boost/unordered_map.hpp>
 15 | #include <string>
 16 | #include <iostream>
 17 | #include <fstream>
 18 | 
 19 | using namespace std;
 20 | namespace pt = boost::property_tree;
 21 | 
 22 | #define SLEIGH_VERSION 4
 23 | #define SLA_SUCCESS (0)
 24 | #define NOT_INITIALIZED (-1)
 25 | 
// Element type of Slautil::m_decision_pairs.
// NOTE(review): the code that fills and consumes these fields is not part of
// this header; the field notes below are inferred from the names — confirm.
typedef struct _DECISION_PAIR
{
    unsigned int id;      // presumably the constructor the pair belongs to
    unsigned int off;     // presumably an offset into the instruction
    unsigned int nonzero; // presumably a count of significant bytes
    unsigned int mask;    // presumably selects the bits that are compared
    unsigned int val;     // presumably the expected value of the masked bits
} DECISION_PAIR, *PDECISION_PAIR;
 34 | 
// One contiguous run of bits inside a constructor's encoding.
typedef struct _BIT_PATTERN
{
    unsigned int start_bit; // first bit of the run (inclusive)
    unsigned int end_bit;   // last bit of the run (inclusive)
    string pattern_type;    // "opcode", "reg" or "imm" are the types read back
                            // by Slautil::getConstructorBitPattern
    string pattern;         // textual pattern; for "reg"/"imm" entries created
                            // by Slautil::addBitPattern this is one label
                            // letter repeated once per bit

} BIT_PATTERN, *PBIT_PATTERN;
 43 | 
// Location of an operand field within an instruction.
// Slautil::parseOperandSyms fills startbit, endbit, startbyte, endbyte and
// shift from the attributes of the same name on the XML "tokenfield" element
// (0 when an attribute is absent); it does not assign bigendian or signbit.
typedef struct _TOKENFIELD
{
    bool bigendian;
    bool signbit;
    unsigned int startbit;  // first bit of the field (inclusive)
    unsigned int endbit;    // last bit of the field (inclusive)
    unsigned int startbyte;
    unsigned int endbyte;
    unsigned int shift;

} TOKENFIELD, *PTOKENFIELD;
 55 | 
// A register operand that is selected by the value of a bit field.
// Stored in Slautil::m_varlist_syms, keyed by symbol id.
typedef struct _varlist_sym
{
    unsigned int id;                   // symbol id
    TOKENFIELD bitfield;               // bits whose decoded value indexes register_ids
    vector<unsigned int> register_ids; // ids of the selectable registers; each
                                       // is resolved to a name via Slautil::m_vars
} varlist_sym, *pvarlist_sym;
 62 | 
// An operand whose value is encoded directly in a bit field; Slautil treats
// these as immediates ("imm"). Stored in Slautil::m_operand_syms, keyed by id.
typedef struct _OPERAND_SYM
{
    unsigned int id;     // symbol id
    TOKENFIELD bitfield; // bits that hold the operand value
} OPERAND_SYM, *POPERAND_SYM;
 68 | 
// One piece of a constructor's display text.
typedef struct _CONSTRUCTOR_PIECE
{
    string type;     // "print" (literal text) or "opprint" (operand reference)
    unsigned int id; // operand symbol id; only meaningful for "opprint"
                     // (Slautil::parseConstructors stores -1 for "print")
    string part;     // literal text for "print"; for "opprint" the variable
                     // name looked up in Slautil::m_vars at parse time
} CONSTRUCTOR_PIECE, *PCONSTRUCTOR_PIECE;
 75 | 
// One instruction constructor read from the processor module.
typedef struct _CONSTRUCTOR
{
    unsigned int id;
    unsigned int constructor_length; // length of the instruction in bytes
    unsigned int source_file;        // value of the constructor's "source" XML attribute
    unsigned int line_number;        // value of the constructor's "line" XML attribute
    vector<CONSTRUCTOR_PIECE> constructor_pieces; // display text pieces, in order
    vector<BIT_PATTERN> bit_patterns;             // encoding pieces; sorted by
                                                  // Slautil::addNonOpcodeBitPatterns
} CONSTRUCTOR, *PCONSTRUCTOR;
 85 | 
// Loads a .sla processor module and answers queries about its instruction
// constructors: their count, display text, bit patterns and register
// operands. The lookup methods shown in slautil.cpp return SLA_SUCCESS (0) on
// success and a negative status otherwise; several of them also throw int
// values on internal lookup failures.
class Slautil
{
    public:
        Slautil();

        int loadSla(const string& filename);
        int getRegisters(vector<string>& registers);

        // various ways to look up instructions

        // Number of loaded constructors; NOT_INITIALIZED before loading.
        int getConstructorCount(unsigned int& count);
        // Display text of a constructor with operand fields shown as the
        // placeholders "__register_list__" / "__immediate_list__".
        int getConstructorText(unsigned int id, string& constructor_text);
        // Concatenated bit pattern string of a constructor; -2 for a bad id,
        // -1 when the pattern is empty.
        int getConstructorBitPattern(unsigned int id, string& bit_pattern);
        // Display text of the constructor matching bit_pattern, with operand
        // fields decoded from the bits.
        int getConstructorTextByBitPattern(const string& bit_pattern,
                                           string& constructor_text);
        // Id of the constructor matching bit_pattern (the last one when
        // several match); -1 when none matches.
        int getConstructorIdByBitPattern(const string& bit_pattern,
                                         unsigned int& id);
        // Name of the register_number-th (0-based) register operand of a
        // constructor, decoded from bit_pattern where necessary.
        int getConstructorTextRegisterById(unsigned int id,
                                           string& register_name,
                                           unsigned int register_number,
                                           string& bit_pattern);

    private:
        // Reads the XML file into m_tree, checks the sleigh version and runs
        // the parse steps below.
        int loadSlaXML(const string& filename);

        // parsing fields within the xml
        int parseRegisters(void);
        int parseVars(void);
        int parseSubtableSymHeads(void);
        int parseConstructors(void);
        int parseVarlistSym(void);
        int parseOperandSyms(void);
        int parseDecisionPairs(void);
        int convertDecisionPairsToBitPatterns(void);
        int recursiveParseDecisionPairs(const boost::property_tree::ptree & subtree);
        int parseDecisionPair(const boost::property_tree::ptree& subtree);
        int addNonOpcodeBitPatterns(void);

        // various helper routines

        // Full form of getConstructorText; when use_bitpattern is true the
        // operand fields are decoded from bit_pattern.
        int getConstructorText(unsigned int id,
                               string& constructor_text,
                               bool use_bitpattern,
                               const string& bit_pattern);
        int checkSubsym(unsigned int& id);
        int countAdjacentOnes(unsigned int id,
                              unsigned int mask,
                              unsigned int value);
        string extractBits(unsigned int start_bit,
                           unsigned int number_of_bits,
                           unsigned int value);
        // Appends a "reg"/"imm" bit pattern for one operand to a constructor.
        int addBitPattern(CONSTRUCTOR& curr_constructor,
                          const TOKENFIELD& bitfield,
                          const string& type,
                          unsigned int count);
        // Returns 0 when two bit patterns match, -1 otherwise.
        int compareBitPatterns(const string& a, const string& b);
        // Decodes the bits of a field from a '0'/'1' string into value.
        int convertBitFieldToValue(TOKENFIELD& bitfield,
                                   const string& bit_pattern,
                                   unsigned int& value);

        // member vars
        boost::unordered_map<unsigned int, varlist_sym> m_varlist_syms;  // symbol id -> register list operand
        boost::unordered_map<unsigned int, OPERAND_SYM> m_operand_syms;  // symbol id -> immediate operand
        boost::unordered_map<unsigned int, unsigned int> m_subsyms;      // operand id -> its "subsym" id
        boost::unordered_map<unsigned int, string> m_vars;               // symbol id -> name
        vector<CONSTRUCTOR> m_constructors;                              // indexed by constructor id
        vector<DECISION_PAIR> m_decision_pairs;
        vector<string> m_registers;                                      // known register names
        unsigned int m_constructor_count;                                // "numct" attribute of the subtable
        unsigned int m_sleigh_version;                                   // version attribute of the loaded file
        pt::ptree m_tree;                                                // parsed XML document
        bool m_initialized;                                              // gates the public lookup methods
};
157 | 


--------------------------------------------------------------------------------
/slautil/slaxml.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
// File: slaxml.cpp
  3 | //
  4 | // Parsing XML SLA files
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | #include <boost/property_tree/ptree.hpp>
 10 | #include <boost/property_tree/xml_parser.hpp>
 11 | #include <boost/foreach.hpp>
 12 | #include <string>
 13 | #include <iostream>
 14 | #include <fstream>
 15 | #include "slautil.h"
 16 | 
 17 | using namespace std;
 18 | namespace pt = boost::property_tree;
 19 | 
 20 | // load the XML SLA processor module
 21 | int Slautil::loadSlaXML(const string& filename)
 22 | {
 23 |     // Parse the XML into the property tree.
 24 |     try
 25 |     {
 26 |         pt::read_xml(filename, m_tree);
 27 |     }
 28 |     catch(...)
 29 |     {
 30 |         cout << "[-] Exception when opening sla (" << filename << ")!" << endl;
 31 |         return -1;
 32 |     }
 33 | 
 34 |     m_sleigh_version = m_tree.get("sleigh.<xmlattr>.version", 0);
 35 |     if(m_sleigh_version != SLEIGH_VERSION)
 36 |     {
 37 |         cout << "[-] Invalid sleigh version (" << m_sleigh_version << ")!" << endl;
 38 |         cout << "[-] Is the .sla file correct?" << endl;
 39 |         return -1;
 40 |     }
 41 | 
 42 |     this->parseVars();
 43 |     this->parseSubtableSymHeads();
 44 |     this->parseOperandSyms();
 45 |     this->parseConstructors();
 46 |     this->parseDecisionPairs();
 47 |     this->convertDecisionPairsToBitPatterns();
 48 |     this->parseVarlistSym();
 49 |     this->parseRegisters(); // TODO: needs to happen before add_non_opcode_bit_patterns()
 50 |     this->addNonOpcodeBitPatterns();
 51 | 
 52 |     return SLA_SUCCESS;
 53 | }
 54 | 
 55 | // read the variables from the processor module
 56 | int Slautil::parseVars(void)
 57 | {
 58 |     for(auto &v : m_tree.get_child("sleigh.symbol_table"))
 59 |     {
 60 |         if(v.first != "varnode_sym_head" &&
 61 |            v.first != "value_sym_head" &&
 62 |            v.first != "operand_sym_head")
 63 |         {
 64 |             continue;
 65 |         }
 66 | 
 67 |         // todo can throw
 68 |         std::string name = v.second.get_child("<xmlattr>.name").data();
 69 |         std::string id_str = v.second.get_child("<xmlattr>.id").data();
 70 | 
 71 |         unsigned int id = stoi(id_str, 0, 0x10);
 72 | 
 73 |         m_vars.emplace(id, name);
 74 |     }
 75 | 
 76 |     return SLA_SUCCESS;
 77 | }
 78 | 
 79 | // read the subtable sym heads from the processor module
 80 | int Slautil::parseSubtableSymHeads(void)
 81 | {
 82 |     for(auto &v : m_tree.get_child("sleigh.symbol_table"))
 83 |     {
 84 |         size_t pos = 0;
 85 | 
 86 |         if(v.first != "subtable_sym_head")
 87 |         {
 88 |             continue;
 89 |         }
 90 | 
 91 |         // todo can throw
 92 |         std::string name = v.second.get_child("<xmlattr>.name").data();
 93 |         std::string id_str = v.second.get_child("<xmlattr>.id").data();
 94 | 
 95 |         // silly workaround to support instructions that reference the same reg more than once
 96 |         pos = name.find("_dup");
 97 |         if (pos == std::string::npos)
 98 |         {
 99 |             continue;
100 |         }
101 | 
102 |         name.resize(pos);
103 | 
104 |         unsigned int id = stoi(id_str, 0, 0x10);
105 |         m_vars.emplace(id, name);
106 |     }
107 | 
108 |     return SLA_SUCCESS;
109 | }
110 | 
111 | // read the operand syms from the processor module
112 | int Slautil::parseOperandSyms(void)
113 | {
114 |     for(auto &operand_sym_node : m_tree.get_child("sleigh.symbol_table"))
115 |     {
116 |         OPERAND_SYM curr_operand_sym;
117 |         std::string var_subsym_id_str;
118 |         std::string var_id_str;
119 | 
120 |         if(operand_sym_node.first != "operand_sym")
121 |         {
122 |             continue;
123 |         }
124 | 
125 |         var_id_str = operand_sym_node.second.get("<xmlattr>.id", "");
126 |         if(var_id_str == "")
127 |         {
128 |             continue;
129 |         }
130 |         unsigned int var_id = stoi(var_id_str, 0, 0x10);
131 | 
132 |         var_subsym_id_str = operand_sym_node.second.get("<xmlattr>.subsym", "");
133 |         if(var_subsym_id_str != "")
134 |         {
135 |             unsigned int var_subsym_id = stoi(var_subsym_id_str, 0, 0x10);
136 |             m_subsyms[var_id] = var_subsym_id;
137 |             continue;
138 |         }
139 | 
140 |         curr_operand_sym.id = var_id;
141 |         curr_operand_sym.bitfield.startbit = operand_sym_node.second.get("tokenfield.<xmlattr>.startbit", 0);
142 |         curr_operand_sym.bitfield.endbit = operand_sym_node.second.get("tokenfield.<xmlattr>.endbit", 0);
143 |         curr_operand_sym.bitfield.startbyte = operand_sym_node.second.get("tokenfield.<xmlattr>.startbyte", 0);
144 |         curr_operand_sym.bitfield.endbyte = operand_sym_node.second.get("tokenfield.<xmlattr>.endbyte", 0);
145 |         curr_operand_sym.bitfield.shift = operand_sym_node.second.get("tokenfield.<xmlattr>.shift", 0);
146 | 
147 |         m_operand_syms[curr_operand_sym.id] = curr_operand_sym;
148 |     }
149 | 
150 |     return SLA_SUCCESS;
151 | }
152 | 
153 | // read the instruction constructors from the processor module
154 | int Slautil::parseConstructors(void)
155 | {
156 |     m_constructor_count = m_tree.get("sleigh.symbol_table.subtable_sym.<xmlattr>.numct", 0);
157 | 
158 |     for(auto &constructor_node : m_tree.get_child("sleigh.symbol_table.subtable_sym"))
159 |     {
160 |         CONSTRUCTOR temp_constructor = {};
161 |         vector<unsigned int> ids;
162 | 
163 |         if(constructor_node.first != "constructor")
164 |         {
165 |             continue;
166 |         }
167 | 
168 |         temp_constructor.constructor_length = constructor_node.second.get("<xmlattr>.length", 0);
169 |         temp_constructor.source_file = constructor_node.second.get("<xmlattr>.source", 0);
170 |         temp_constructor.line_number = constructor_node.second.get("<xmlattr>.line", 0);
171 | 
172 |         // todo: should we check constructor.parent = 0?
173 | 
174 |         for(auto &constructor_node_child : constructor_node.second)
175 |         {
176 |             if(constructor_node_child.first == "<xmlattr>")
177 |             {
178 |                 continue;
179 |             }
180 |             else if(constructor_node_child.first == "construct_tpl")
181 |             {
182 |                 continue;
183 |             }
184 |             else if(constructor_node_child.first == "oper")
185 |             {
186 |                 string id_str;
187 |                 unsigned int id = 0;
188 | 
189 |                 id_str = constructor_node_child.second.get("<xmlattr>.id", "");
190 |                 id = stoi(id_str, NULL, 0x10);
191 |                 ids.push_back(id);
192 |             }
193 |             else if(constructor_node_child.first == "print")
194 |             {
195 |                 CONSTRUCTOR_PIECE temp_constructor_piece;
196 | 
197 |                 temp_constructor_piece.type = "print";
198 |                 temp_constructor_piece.id = -1;
199 |                 temp_constructor_piece.part = constructor_node_child.second.get("<xmlattr>.piece", "");
200 | 
201 |                 temp_constructor.constructor_pieces.push_back(temp_constructor_piece);
202 |             }
203 |             else if(constructor_node_child.first == "opprint")
204 |             {
205 |                 string id_str;
206 |                 unsigned int id = 0;
207 |                 unsigned int id2 = 0;
208 | 
209 |                 id_str = constructor_node_child.second.get("<xmlattr>.id", "");
210 |                 id = stoi(id_str);
211 | 
212 |                 CONSTRUCTOR_PIECE temp_constructor_piece;
213 | 
214 |                 id2 = ids[id];
215 | 
216 |                 checkSubsym(id2);
217 | 
218 |                 string var = m_vars[id2];
219 | 
220 |                 temp_constructor_piece.type = "opprint";
221 |                 temp_constructor_piece.id = id2;
222 |                 temp_constructor_piece.part = var;
223 | 
224 |                 // part??
225 | 
226 |                 temp_constructor.constructor_pieces.push_back(temp_constructor_piece);
227 |             }
228 |             else
229 |             {
230 |                 cout << "Unknown constructor node child: " << constructor_node_child.first << endl;
231 |                 return -2;
232 |             }
233 |         }
234 |         m_constructors.push_back(temp_constructor);
235 |     }
236 | 
237 |     if(m_constructor_count != m_constructors.size())
238 |     {
239 |         cout << "Invalid constructors: " << m_constructor_count << " " << m_constructors.size() << endl;
240 |         return -2;
241 |     }
242 | 
243 |     return SLA_SUCCESS;
244 | }
245 | 
246 | // parse the decision pairs from the processor module
247 | // decision pairs are used to differentiate instructions via their opcode
248 | int Slautil::parseDecisionPairs(void)
249 | {
250 |     m_decision_pairs.resize(m_constructor_count);
251 | 
252 |     const boost::property_tree::ptree & subtree = m_tree.get_child("sleigh.symbol_table.subtable_sym.decision");
253 |     this->recursiveParseDecisionPairs(subtree);
254 | 
255 |     return SLA_SUCCESS;
256 | }
257 | 
258 | // decision pairs can be recursively defined
259 | int Slautil::recursiveParseDecisionPairs(const boost::property_tree::ptree& subtree)
260 | {
261 |     //TODO: why use boost foreach??
262 |     for(auto &v : subtree)
263 |     {
264 |         if(v.first == "decision")
265 |         {
266 |             this->recursiveParseDecisionPairs(v.second);
267 |         }
268 |         else if(v.first == "pair")
269 |         {
270 |             this->parseDecisionPair(v.second);
271 |         }
272 |         else if(v.first == "<xmlattr>")
273 |         {
274 |             continue;
275 |         }
276 |         else
277 |         {
278 |             cout << "Unknown value!!" << v.first << endl;
279 |             return -1;
280 |         }
281 |     }
282 | 
283 |     return 0;
284 | }
285 | 
286 | // parse an individual decision pair
287 | int Slautil::parseDecisionPair(const boost::property_tree::ptree& subtree)
288 | {
289 |     DECISION_PAIR decision_pair = {};
290 | 
291 |     // todo error checking
292 |     decision_pair.id = subtree.get("<xmlattr>.id", 0);
293 |     decision_pair.off = subtree.get("instruct_pat.pat_block.<xmlattr>.off", 0);
294 |     decision_pair.nonzero = subtree.get("instruct_pat.pat_block.<xmlattr>.nonzero", 0);
295 |     string mask = subtree.get("instruct_pat.pat_block.mask_word.<xmlattr>.mask", "");
296 |     string val = subtree.get("instruct_pat.pat_block.mask_word.<xmlattr>.val", "");
297 | 
298 |     decision_pair.mask = stol(mask, NULL, 0x10);
299 |     decision_pair.val = stol(val, NULL, 0x10);
300 | 
301 |     m_decision_pairs[decision_pair.id] = decision_pair;
302 | 
303 |     return 0;
304 | }
305 | 
306 | // convert the decision pairs into opcode bit patterns
307 | int Slautil::convertDecisionPairsToBitPatterns(void)
308 | {
309 |     for(unsigned int i = 0; i < m_constructors.size(); i++)
310 |     {
311 |         //cout << i << ")" << endl;
312 |         unsigned int constructor_length = 0;
313 |         PDECISION_PAIR curr_decision_pair = NULL;
314 |         unsigned int shift_value = 0;
315 |         unsigned int mask = 0;
316 |         unsigned int value = 0;
317 | 
318 |         curr_decision_pair = &m_decision_pairs[i];
319 |         constructor_length = m_constructors[i].constructor_length;
320 | 
321 |         //cout << curr_decision_pair->id << " " << curr_decision_pair->mask << " " << curr_decision_pair->val << endl;
322 | 
323 |         if(curr_decision_pair->nonzero > 4)
324 |         {
325 |             cout << "Invalid decision nonzero amount!!" << endl;
326 |             return -3;
327 |         }
328 | 
329 |         if(constructor_length <= curr_decision_pair->off)
330 |         {
331 |             cout << "Invalid decision offset amount!!" << endl;
332 |             return -4;
333 |         }
334 | 
335 |         shift_value = curr_decision_pair->off * 8;
336 |         mask = curr_decision_pair->mask >> shift_value;
337 |         value = curr_decision_pair->val >> shift_value;
338 | 
339 |         countAdjacentOnes(i, mask, (value & mask));
340 |     }
341 | 
342 |     return SLA_SUCCESS;
343 | }
344 | 
345 | // read the varlist syms from the processor module
346 | int Slautil::parseVarlistSym(void)
347 | {
348 |     for(auto &varlist_sym_node : m_tree.get_child("sleigh.symbol_table"))
349 |     {
350 |         varlist_sym curr_varlist_sym;
351 | 
352 |         if(varlist_sym_node.first != "varlist_sym")
353 |         {
354 |             continue;
355 |         }
356 | 
357 |         //cout << varlist_sym_node.first  << endl;
358 | 
359 |         std::string id_str = varlist_sym_node.second.get("<xmlattr>.id", "");
360 |         curr_varlist_sym.id = stoi(id_str, 0, 0x10);
361 | 
362 |         curr_varlist_sym.bitfield.startbit = varlist_sym_node.second.get("tokenfield.<xmlattr>.startbit", 0);
363 |         curr_varlist_sym.bitfield.endbit = varlist_sym_node.second.get("tokenfield.<xmlattr>.endbit", 0);
364 |         curr_varlist_sym.bitfield.startbyte = varlist_sym_node.second.get("tokenfield.<xmlattr>.startbyte", 0);
365 |         curr_varlist_sym.bitfield.endbyte = varlist_sym_node.second.get("tokenfield.<xmlattr>.endbyte", 0);
366 |         curr_varlist_sym.bitfield.shift = varlist_sym_node.second.get("tokenfield.<xmlattr>.shift", 0);
367 | 
368 |         //cout << curr_varlist_sym.bitfield.startbit  << " " <<  curr_varlist_sym.bitfield.endbit << endl;
369 | 
370 |         for(auto &var_node : varlist_sym_node.second)
371 |         {
372 |             if(var_node.first != "var")
373 |             {
374 |                 continue;
375 |             }
376 | 
377 |             std::string var_id_str = var_node.second.get("<xmlattr>.id", "");
378 |             unsigned int var_id = stoi(var_id_str, 0, 0x10);
379 |             curr_varlist_sym.register_ids.push_back(var_id);
380 | 
381 |         }
382 | 
383 |         m_varlist_syms[curr_varlist_sym.id] = curr_varlist_sym;
384 |     }
385 | 
386 |     return SLA_SUCCESS;
387 | }
388 | 
389 | // read the registers from the processor module
390 | int Slautil::parseRegisters(void)
391 | {
392 |     for(auto &v : m_tree.get_child("sleigh.symbol_table"))
393 |     {
394 |         if(v.first != "varnode_sym")
395 |         {
396 |             continue;
397 |         }
398 | 
399 |         // TODO: throws if missing
400 |         std::string space = v.second.get_child("<xmlattr>.space").data();
401 |         if(space != "register")
402 |         {
403 |             continue;
404 |         }
405 |         //cout << space << endl;
406 | 
407 |         std::string id_str = v.second.get_child("<xmlattr>.id").data();
408 | 
409 |         unsigned int id = stoi(id_str, 0, 0x10);
410 | 
411 |         boost::unordered_map<unsigned int, std::string> ::iterator itr;
412 | 
413 |         itr = m_vars.find(id);
414 |         if(itr == m_vars.end())
415 |         {
416 |             cout << "Failed to find " << id << "!!" << endl;
417 |             return -1;
418 |         }
419 | 
420 |         m_registers.push_back(itr->second);
421 |     }
422 | 
423 |     return 0;
424 | }
425 | 
426 | // helper function to count the number of adjacent ones in a bitmask
427 | int Slautil::countAdjacentOnes(unsigned int id,
428 |                                unsigned int mask,
429 |                                unsigned int value)
430 | {
431 |     unsigned int count = 0;
432 | 
433 |     for(unsigned int i = 0; i < 32; i++)
434 |     {
435 |         bool bit_on = (mask & (1 << i));
436 | 
437 |         if(bit_on)
438 |         {
439 |             count += 1;
440 |         }
441 | 
442 |         if(!bit_on)
443 |         {
444 |             if(count != 0)
445 |             {
446 |                 BIT_PATTERN temp_bit_pattern;
447 | 
448 |                 /*
449 |                 cout << "opcode_";
450 |                 cout << (i - count);
451 |                 cout << "_";
452 |                 cout << (i - 1);
453 |                 cout << "= " << endl;
454 |                 */
455 | 
456 |                 // TODO: make this a func
457 |                 temp_bit_pattern.pattern_type = "opcode";
458 |                 temp_bit_pattern.start_bit = i - count;
459 |                 temp_bit_pattern.end_bit = i - 1;
460 |                 temp_bit_pattern.pattern = extractBits(temp_bit_pattern.start_bit,
461 |                                                        temp_bit_pattern.end_bit,
462 |                                                        value);
463 | 
464 |                 m_constructors[id].bit_patterns.push_back(temp_bit_pattern);
465 |             }
466 |             count = 0;
467 |         }
468 |     }
469 | 
470 |     if(count != 0)
471 |     {
472 |         BIT_PATTERN temp_bit_pattern;
473 | 
474 |         /*
475 |         cout << "opcode_";
476 |         cout << (32 - count);
477 |         cout << "_";
478 |         cout << (32 - 1);
479 |         cout << "= " << endl;
480 |         */
481 | 
482 |         // TODO: make this a func
483 |         temp_bit_pattern.pattern_type = "opcode";
484 |         temp_bit_pattern.start_bit = 33 - count - 1;
485 |         temp_bit_pattern.end_bit = 32 - 1;
486 |         temp_bit_pattern.pattern = extractBits(temp_bit_pattern.start_bit,
487 |                                                temp_bit_pattern.end_bit,
488 |                                                 value);
489 | 
490 |         m_constructors[id].bit_patterns.push_back(temp_bit_pattern);
491 |     }
492 | 
493 |     return SLA_SUCCESS;
494 | }
495 | 
496 | // convert a number into a bit string
497 | string Slautil::extractBits(unsigned int start_bit,
498 |                             unsigned int end_bit,
499 |                             unsigned int value)
500 | {
501 |     string bit_string = "";
502 | 
503 |     for(unsigned int i = start_bit; i <= end_bit; i++)
504 |     {
505 |         if(value & (1 << i))
506 |         {
507 |             bit_string.insert(0, 1, '1');
508 |         }
509 |         else
510 |         {
511 |             bit_string.insert(0, 1, '0');
512 |         }
513 |     }
514 | 
515 |     if(bit_string.length() == 0)
516 |     {
517 |         cout << "Invalid extract bits!!" << endl;
518 |         cout << start_bit << " " << end_bit << endl;
519 |         throw 1;
520 |     }
521 | 
522 |     return bit_string;
523 | }
524 | 
525 | // remap a subsym if necessary
526 | int Slautil::checkSubsym(unsigned int& id)
527 | {
528 |     boost::unordered_map<unsigned int, unsigned int> ::iterator itr;
529 |     itr = m_subsyms.find(id);
530 |     if(itr == m_subsyms.end())
531 |     {
532 |         return -3;
533 |     }
534 | 
535 |     //cout << "replace " << id << " with " << itr->second << endl;
536 |     id = itr->second;
537 | 
538 |     return SLA_SUCCESS;
539 | }
540 | 


--------------------------------------------------------------------------------
/thread_pool.cpp:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: thread_pool.cpp
 3 | //
 4 | // Thread pool helpers
 5 | //
 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 7 | // Licensed under the Apache 2.0 License.
 8 | //------------------------------------------------------------------------------
 9 | #include "thread_pool.h"
10 | #include <boost/asio/thread_pool.hpp>
11 | #include <boost/atomic.hpp>
12 | 
13 | // count of successful and failed worker jobs
14 | // use atomic for thread-safety
15 | static boost::atomic<unsigned int> g_CompletedCount = 0;
16 | static boost::atomic<unsigned int> g_FailureCount = 0;
17 | 
18 | // reset thread pool counters
19 | void resetThreadPool(void)
20 | {
21 |     g_FailureCount = 0;
22 |     g_CompletedCount = 0;
23 | }
24 | 
25 | // increment the number of worker completions
26 | void incrementWorkerCompletions(void)
27 | {
28 |     g_CompletedCount++;
29 | }
30 | 
31 | // get the number of completed workers
32 | unsigned int getWorkerCompletions(void)
33 | {
34 |     return g_CompletedCount;
35 | }
36 | 
37 | // increment the number of failures
38 | void incrementWorkerFailures(void)
39 | {
40 |     g_FailureCount++;
41 | }
42 | 
43 | // get the number of failures
44 | unsigned int getWorkerFailures(void)
45 | {
46 |     return g_FailureCount;
47 | }
48 | 


--------------------------------------------------------------------------------
/thread_pool.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // File: thread_pool.h
 3 | //
 4 | // Thread pool helpers
 5 | //
 6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
 7 | // Licensed under the Apache 2.0 License.
 8 | //------------------------------------------------------------------------------
 9 | #pragma once
10 | 
11 | #include <boost/asio/thread_pool.hpp>
12 | #include <boost/atomic.hpp>
13 | 
// Sets both the failure counter and the completion counter back to zero.
void resetThreadPool(void);
// Adds one to the failure counter.
void incrementWorkerFailures(void);
// Returns the current value of the failure counter.
unsigned int getWorkerFailures(void);
// Adds one to the completion counter.
void incrementWorkerCompletions(void);
// Returns the current value of the completion counter.
unsigned int getWorkerCompletions(void);
19 | 


--------------------------------------------------------------------------------
/validator.cpp:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // File: validator.cpp
  3 | //
  4 | // Handles command line argument parsing and invokes the disassembly routine.
  5 | //
  6 | // Copyright (c) Oberoi Security Solutions. All rights reserved.
  7 | // Licensed under the Apache 2.0 License.
  8 | //-----------------------------------------------------------------------------
  9 | 
 10 | #include <iostream>
 11 | #include <boost/algorithm/string.hpp>
 12 | #include <boost/program_options.hpp>
 13 | #include <boost/filesystem/fstream.hpp>
 14 | #include <loadimage.hh>
 15 | #include <sleigh.hh>
 16 | using namespace std;
 17 | 
 18 | // This is a tiny LoadImage class which feeds the executable bytes to the translator
 19 | // Taken straight from sleighexample.cc
 20 | class MyLoadImage : public LoadImage {
 21 |   uintb baseaddr;
 22 |   int4 length;
 23 |   uint1 *data;
 24 | public:
 25 |   MyLoadImage(uintb ad,uint1 *ptr,int4 sz) : LoadImage("nofile") { baseaddr = ad; data = ptr; length = sz; }
 26 |   virtual void loadFill(uint1 *ptr,int4 size,const Address &addr);
 27 |   virtual string getArchType(void) const { return "myload"; }
 28 |   virtual void adjustVma(long adjust) { }
 29 | };
 30 | 
 31 | // This is the only important method for the LoadImage. It returns bytes from the static array
 32 | // depending on the address range requested
 33 | void MyLoadImage::loadFill(uint1 *ptr,int4 size,const Address &addr)
 34 | 
 35 | {
 36 |   uintb start = addr.getOffset();
 37 |   uintb max = baseaddr + (length-1);
 38 |   for(int4 i=0;i<size;++i) {	// For every byte requestes
 39 |     uintb curoff = start + i; // Calculate offset of byte
 40 |     if ((curoff < baseaddr)||(curoff>max)) {	// If byte does not fall in window
 41 |       ptr[i] = 0;		// return 0
 42 |       continue;
 43 |     }
 44 |     uintb diff = curoff - baseaddr;
 45 |     ptr[i] = data[(int4)diff];	// Otherwise return data from our window
 46 |   }
 47 | }
 48 | 
 49 | // Here is a simple class for emitting assembly.  In this case, we send the strings straight
 50 | // to standard out.
 51 | class AssemblyRaw : public AssemblyEmit {
 52 | public:
 53 |   virtual void dump(const Address &addr,const string &mnem,const string &body) {
 54 |         disassembly = mnem + " " + body;
 55 |         boost::trim(disassembly);
 56 |   }
 57 |   string disassembly;
 58 | };
 59 | 
 // Streams x as a zero-padded, two-digit, upper-case hex value.
 // The argument is parenthesized so that a compound expression (for example a
 // conditional) is converted as a whole rather than only its first operand.
 // Note: setfill, uppercase and hex are sticky and stay set on the stream.
 #define CHAR2HEX( x ) setw(2) << setfill('0') << uppercase << hex << (unsigned int)(x)
 62 | 
 // Reads opcodes from inputFilename line by line, disassembles each one with
 // the SLA at slaFilename and writes the results to outputFilename.
 // Returns 0 on success and a non-zero code on failure.
 int parseInputAndDisassemble(string& inputFilename, string& outputFilename, string& slaFilename);
 // Decodes a textual opcode (0x... hex or 0b... binary) into raw bytes that
 // are appended to opcodeBytes. Returns 0 on success, negative on bad input.
 int convertOpcodeToBinary(string& opcode, vector<unsigned char>& opcodeBytes);
 // Maps one ASCII hex digit to its value; any other character yields 0.
 int convertHexNibbletoInteger(unsigned char x);
 // Disassembles opcodeBytes with the SLA at slaFilename and stores the text in
 // disassembly. Returns 0 on success (bytes that cannot be decoded are
 // reported as the text "Error"), negative otherwise.
 int sleighDisassemble(string& slaFilename, vector<unsigned char>& opcodeBytes, string& disassembly);
 67 | 
 68 | int main(int argc, char *argv[])
 69 | {
 70 |     boost::program_options::options_description desc{"Ghidra Processor Module Generator Validator"};
 71 |     boost::program_options::variables_map args;
 72 |     string inputFilename;
 73 |     string outputFilename;
 74 |     string slaFilename;
 75 |     int result = 0;
 76 | 
 77 |     cout << "Ghidra Processor Module Generator Validator" << endl;
 78 | 
 79 |     //
 80 |     // command line arg parsing
 81 |     //
 82 | 
 83 |     try
 84 |     {
 85 |         desc.add_options()
 86 |             ("input-file,i", boost::program_options::value<string>(&inputFilename), "Path to a newline delimited text file containing all opcodes and instructions for the processor module. Required.")
 87 |             ("output-file,o",boost::program_options::value<string>(&outputFilename)->default_value("output.txt"), "Output file. Defaults to output.txt if not specified.")
 88 |             ("sla-file,s",boost::program_options::value<string>(&slaFilename), "Path to the compiled processor .sla.")
 89 |             ("help,h", "Help screen");
 90 | 
 91 |         store(parse_command_line(argc, argv, desc), args);
 92 |         notify(args);
 93 | 
 94 |         if(args.count("help") || argc == 1)
 95 |         {
 96 |             cout << desc << endl;
 97 |             return 0;
 98 |         }
 99 | 
100 |         if(args.count("input-file") == 0)
101 |         {
102 |             cout << "Input file name is required!!" << endl;
103 |             return -1;
104 |         }
105 | 
106 |         if(args.count("sla-file") == 0)
107 |         {
108 |             cout << "Sla file name is required!!" << endl;
109 |             return -1;
110 |         }
111 |     }
112 |     catch (const boost::program_options::error &ex)
113 |     {
114 |         cout << "[-] Error parsing command line: " << ex.what() << endl;
115 |         return -1;
116 |     }
117 | 
118 |     cout << "[*] Input file: " << inputFilename << endl;
119 |     cout << "[*] Compiled SLA file: " << slaFilename << endl;
120 |     cout << "[*] Outputting (might take a while) to: " << outputFilename << endl;
121 | 
122 |     result = parseInputAndDisassemble(inputFilename, outputFilename, slaFilename);
123 |     if(result != 0)
124 |     {
125 |         return result;
126 |     }
127 | 
128 |     cout << "[*] Successfully created output disassembly file. Diff input and output files to find errors in the SLA." << endl;
129 |     return 0;
130 | }
131 | 
132 | // Parses the input file for addresses and passes it to the SLEIGH disassembler for output
133 | int parseInputAndDisassemble(string& inputFilename, string& outputFilename, string& slaFilename)
134 | {
135 |     unsigned int lineNum = 0;
136 |     int result = 0;
137 |     std::string line;
138 | 
139 |     // open the input file for parsing
140 |     boost::filesystem::path infile{inputFilename};
141 |     boost::filesystem::ifstream ifs{infile};
142 | 
143 |     boost::filesystem::path outfile{outputFilename};
144 |     boost::filesystem::ofstream ofs{outfile};
145 | 
146 |     if(!ifs)
147 |     {
148 |         cout << "[-] Failed to open input file!!" << endl;
149 |         return -1;
150 |     }
151 | 
152 |     if(!ofs)
153 |     {
154 |         cout << "[-] Failed to open output file!!" << endl;
155 |         return -1;
156 |     }
157 | 
158 |     //
159 |     // parse the input file line by line
160 |     //
161 |     while (std::getline(ifs, line))
162 |     {
163 |         vector<string> lineSplit;
164 |         vector<unsigned char> opcodeBytes;
165 |         string disassembly;
166 | 
167 |         lineNum++;
168 | 
169 |         // split the line into components
170 |         boost::split(lineSplit, line, boost::algorithm::is_space(), boost::token_compress_on);
171 | 
172 |         if(lineSplit.size() < 1)
173 |         {
174 |             continue;
175 |         }
176 | 
177 |         result = convertOpcodeToBinary(lineSplit[0], opcodeBytes);
178 |         if(result != 0)
179 |         {
180 |             cout << "Failed to covert opcode!!" << endl;
181 |             goto exit;
182 |         }
183 | 
184 |         result = sleighDisassemble(slaFilename, opcodeBytes, disassembly);
185 |         if(result != 0)
186 |         {
187 |             goto exit;
188 |         }
189 | 
190 |         ofs << "0x";
191 |         for (auto& x: opcodeBytes)
192 |         {
193 |             ofs << CHAR2HEX(x);
194 |         }
195 |         ofs << " " << disassembly;
196 |         ofs << endl;
197 |     }
198 | 
199 |     result = 0;
200 | 
201 | exit:
202 |     ifs.close();
203 |     ofs.close();
204 |     return result;
205 | }
206 | 
207 | // disassembles opcode bytes using the passed in SLA file
208 | int sleighDisassemble(string& slaFilename, vector<unsigned char>& opcodeBytes, string& disassembly)
209 | {
210 |     unsigned char buffer[4096] = {0};
211 | 
212 |     // initialize instruction to disassemble
213 |     for(unsigned int i = 0; i < opcodeBytes.size(); i++)
214 |     {
215 |         buffer[i] = opcodeBytes[i];
216 |     }
217 | 
218 |     // instantiate sleigh
219 |     try
220 |     {
221 |         MyLoadImage loader(0, (uint1*)buffer, sizeof(buffer));
222 | 
223 |         // Set up the context object
224 |         ContextInternal context;
225 | 
226 |         // Set up the disassembler
227 |         Sleigh trans(&loader, &context);
228 | 
229 |         // Read sleigh file into DOM
230 |         DocumentStorage docstorage;
231 |         Element *sleighroot = docstorage.openDocument(slaFilename)->getRoot();
232 |         docstorage.registerTag(sleighroot);
233 |         trans.initialize(docstorage); // Initialize the translator
234 | 
235 |         AssemblyRaw assememit;	// Set up the disassembly dumper
236 |         Address addr(trans.getDefaultCodeSpace(), 0); // First disassembly address
237 | 
238 |         // dump the disassembly now
239 |         trans.printAssembly(assememit, addr);
240 |         disassembly = assememit.disassembly;
241 |     }
242 |     catch(XmlError e)
243 |     {
244 |         cout << "Failed to instantiate SLEIGH. Is processor SLA invalid?" << endl;
245 |         return -1;
246 |     }
247 |     catch(BadDataError e)
248 |     {
249 |         // disassembly error, just report it as a success so it appears in the output
250 |         disassembly = "Error";
251 |         return 0;
252 |     }
253 |     catch(...)
254 |     {
255 |         cout << "Unknown error during disassembly!!\n";
256 |         return -3;
257 |     }
258 | 
259 |     return 0;
260 | }
261 | 
262 | // converts an opcode in the of 0xaabb... or 0b0011... to a an array of raw bytes
263 | int convertOpcodeToBinary(string& opcode, vector<unsigned char>& opcodeBytes)
264 | {
265 |     int opcodeLength = 0;
266 | 
267 |     // opcode must begin with 0x or 0b
268 |     if(opcode[0] != '0')
269 |     {
270 |         cout << "Opcode must begin with 0x or 0b!!" << endl;
271 |         return -1;
272 |     }
273 | 
274 |     if(opcode[1] == 'x' || opcode[1] == 'X')
275 |     {
276 |         opcodeLength = opcode.length() - 2;
277 |         if((opcodeLength % 2) != 0)
278 |         {
279 |             cout << "Hex opcode length must be divisble by 2!!" << endl;
280 |             return -2;
281 |         }
282 | 
283 |          // loop through the hex string, converting each byte
284 |         for(unsigned int i = 2; i < opcode.length(); i += 2)
285 |         {
286 |             unsigned char value;
287 |             unsigned char high;
288 |             unsigned char low;
289 | 
290 |             // convert the hex string to a byte
291 |             high = convertHexNibbletoInteger(opcode[i]);
292 |             low = convertHexNibbletoInteger(opcode[i+1]);
293 | 
294 |             value = (high << 4) | low;
295 | 
296 |             opcodeBytes.push_back(value);
297 |         }
298 | 
299 |         return 0;
300 |     }
301 |     else if(opcode[1] == 'b' || opcode[1] == 'B')
302 |     {
303 |         opcodeLength = opcode.length() - 2;
304 |         if((opcodeLength % 8) != 0)
305 |         {
306 |             cout << "Binary opcode length must be divisble by 8!!" << endl;
307 |             return -2;
308 |         }
309 | 
310 |         // loop through the bit string, converting each byte
311 |         for(unsigned int i = 2; i < opcode.length(); i += 8)
312 |         {
313 |             unsigned char value = 0;
314 | 
315 |             for(unsigned int j = 0; j < 8; j++)
316 |             {
317 |                 value = value << 1;
318 |                 if(opcode[i + j] == '1')
319 |                 {
320 |                     value = value | 1;
321 |                 }
322 |             }
323 |             opcodeBytes.push_back(value);
324 |         }
325 | 
326 |         return 0;
327 |     }
328 |     else
329 |     {
330 |         cout << "Opcode must begin with 0x or 0b!!" << endl;
331 |         return -1;
332 |     }
333 | 
334 |     return 0;
335 | }
336 | 
// Simple utility to convert an ASCII hex character to its numeric value.
// Accepts '0'-'9', 'A'-'F' and 'a'-'f'; every other character yields 0.
int convertHexNibbletoInteger(unsigned char x)
{
    const bool isDecimal = ('0' <= x) && (x <= '9');
    const bool isUpperHex = ('A' <= x) && (x <= 'F');
    const bool isLowerHex = ('a' <= x) && (x <= 'f');

    int nibble = 0;

    if(isDecimal)
    {
        nibble = x - '0';
    }
    else if(isUpperHex)
    {
        nibble = x - 'A' + 0xa;
    }
    else if(isLowerHex)
    {
        nibble = x - 'a' + 0xa;
    }

    return nibble;
}
357 | 


--------------------------------------------------------------------------------