├── mem.txt ├── LICENSE ├── README.md ├── VM_Disassembler.py ├── .gitignore └── example.py /mem.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeffli678/VM_Disassembler/HEAD/mem.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Xusheng Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VM_Disassembler 2 | A recursive disassembler written in Python. Best suitable for VMs in CTFs. 3 | 4 | See example.py for a disassembler for https://crackmes.one/crackme/5bc0fe0033c5d4110a29b296. 
5 | 6 | The VM_Disassembler class should be initialized with four parameters: 7 | 8 | ```Python 9 | vm_dis = VM_Disassembler(vm_code, disassembler, entry_point, look_ahead_len) 10 | ``` 11 | 12 | Where ```vm_code``` is the code to disassemble (a list of integer bytes); ```disassembler``` is the disassembler function (see below); ```entry_point``` is the offset of the first instruction; ```look_ahead_len``` specifies how many bytes to consider when disassembling the current instruction. This can be set to the max length of an instruction. Leave it at 0 if unsure; the disassembler function then receives every byte from the current address to the end of the code. 13 | 14 | The user only needs to write a disassembler() function which disassembles one instruction. The VM_Disassembler will handle everything else. 15 | 16 | ```Python 17 | def disassembler(addr, data): 18 | # hard work here 19 | return instr_len, instr_text, possible_next_addrs 20 | ``` 21 | 22 | disassembler() takes two parameters: 23 | 24 | ```addr```: the current address 25 | 26 | ```data```: the data to disassemble 27 | 28 | It should return three things: 29 | 30 | ```instr_len```: the length of the current instruction 31 | 32 | ```instr_text```: the disassembly text of the current instruction 33 | 34 | ```possible_next_addrs```: a list of possible next addresses after the current instruction 35 | 36 | I hope this will be helpful for your next CTF. Issues and PRs are welcome! 
# VM_Disassembler.py
class VM_Disassembler:
    """Driver for a recursive-traversal disassembler of custom VM bytecode.

    The caller supplies a ``disassembler(addr, data)`` callback that decodes
    exactly one instruction and returns a 3-tuple
    ``(instr_len, instr_text, possible_next_addrs)``.  This class keeps the
    work list of addresses, skips addresses it has already visited, slices the
    bytes handed to the callback, and prints one line per decoded instruction.
    """

    def __init__(self, code, disassembler, entry_point, look_ahead_len = 0):
        """Store the inputs and seed the work list with the entry point.

        code            -- the bytecode, as a list of ints
        disassembler    -- user callback decoding one instruction (see class
                           docstring for its contract)
        entry_point     -- offset of the first instruction to decode
        look_ahead_len  -- how many bytes to hand to the callback per
                           instruction (e.g. the maximum instruction length);
                           0 means "everything from the current address to
                           the end of the code"
        """
        self.code = code
        self.disassembler = disassembler
        self.entry_point = entry_point
        self.look_ahead_len = look_ahead_len

        # addresses that have already been taken off the work list
        self.disassembled_addr = set()
        # addresses still waiting to be processed
        self.disassemble_queue = [self.entry_point]

    def format_bytes(self, data):
        """Return the ints in ``data`` as one lowercase hex string, two digits each."""
        return ''.join('%02x' % c for c in data)

    def disassemble(self):
        """Decode every reachable instruction and print the listing.

        Addresses are taken from the end of the work list, so the most
        recently queued address is processed first.  Each printed line has
        the form ``0x<addr> <raw bytes in hex> <instruction text>``.
        """
        while self.disassemble_queue:
            addr = self.disassemble_queue.pop()
            if addr in self.disassembled_addr:
                continue
            self.disassembled_addr.add(addr)

            # An address outside the code most likely means the user's
            # disassembler computed a bad target; ignore it.  Negative
            # addresses are rejected too: Python slicing would otherwise
            # silently wrap around and decode bytes from the end of the code.
            if not 0 <= addr < len(self.code):
                continue

            # select the bytes handed to the user's callback
            if self.look_ahead_len == 0:
                data_to_parse = self.code[addr:]
            else:
                data_to_parse = self.code[addr:addr + self.look_ahead_len]
            instr_len, instr_text, possible_next_instrs = \
                self.disassembler(addr, data_to_parse)

            # queue every possible successor; duplicates are filtered when
            # they are popped
            self.disassemble_queue.extend(possible_next_instrs)

            # print the address, raw bytes, and text of the instruction
            instr_bytes = self.code[addr:addr + instr_len]
            print('0x%x %s %s' % (addr, self.format_bytes(instr_bytes), instr_text))

        print('Parsing done! Good luck with reversing')
# example.py
import struct
from io import open


def bytes_to_word(val_bytes):
    """Return the 16-bit little-endian value of the first two ints in ``val_bytes``."""
    return val_bytes[0] + 0x100 * val_bytes[1]


def bytes_to_int(val_bytes):
    """Return the little-endian integer encoded by the ints in ``val_bytes``.

    An empty sequence yields 0.
    """
    val = 0
    for byte_val in reversed(val_bytes):
        val = val * 0x100 + byte_val
    return val


def offset_to_addr(offset, vm_start=0x1190):
    """Convert an offset embedded in an instruction to a VM-code address.

    ``vm_start`` is the base that is subtracted; it defaults to 0x1190, the
    value this example has always used.
    """
    return offset - vm_start


regs = ['r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15',
        'rdi', 'rsi', 'rbp', 'rbx', 'rdx', 'rax', 'rcx', 'rsp', 'rip', 'eflags']


def _reg_name(index):
    """Return the register name for ``index``, or a placeholder if unknown."""
    if 0 <= index < len(regs):
        return regs[index]
    return 'unk_reg_0x%x' % index


def disassembler(addr, data):
    """Decode one fixed-size (0xe byte) VM instruction.

    addr -- address of the instruction
    data -- ints starting at ``addr``; at least 0xe of them are needed

    Returns ``(instr_len, instr_text, possible_next_addrs)``.  If fewer than
    0xe bytes are available the bytes are reported as a truncated instruction
    with no successors instead of raising IndexError.

    Layout after the two leading bytes are skipped: bytes 0-7 hold a
    little-endian immediate, byte 8 the opcode, bytes 9, 0xa and 0xb the
    register operands.
    """
    instr_len = 0xe
    if len(data) < instr_len:
        text = 'truncated instruction (%d of %d bytes)' % (len(data), instr_len)
        return len(data), text, []

    # NOTE(review): only the fall-through address is reported; the targets of
    # jmp/call/jne/je are decoded below but never queued, and ret still falls
    # through.  Confirm whether this linear walk is intended.
    possible_next_addrs = [addr + instr_len]

    # skip the invalid ud2 instruction in the beginning
    data = data[2:]
    imm = bytes_to_int(data[0:8])
    op_code = data[8]
    b9 = data[9]
    ba = data[0xa]
    bb = data[0xb]
    # branch target, kept separate from the ``addr`` parameter
    target = offset_to_addr(imm)

    if op_code == 9:
        if b9 == 0x10:
            instr_text = 'jmp 0x%x' % (target)
        else:
            instr_text = '%s = 0x%x' % (_reg_name(b9), imm)
    elif op_code == 0x24:
        instr_text = 'cmp %s, 0x%x' % (_reg_name(ba), imm)
        instr_text += '; setne %s' % (_reg_name(b9))
    elif op_code == 0x2a:
        instr_text = 'cmp %s, 0x0; ifne ret' % (_reg_name(ba))
    elif op_code == 0x2c:
        instr_text = 'add %s, %s, 0x%x' % (_reg_name(b9), _reg_name(ba), imm)
    elif op_code == 0x10:
        instr_text = 'mov %s, *(%s + 0x%x)' % (_reg_name(b9), _reg_name(ba), imm)
    elif op_code == 0x15:
        instr_text = 'push %s' % (_reg_name(ba))
    elif op_code == 0x18:
        instr_text = 'mov %s, %s' % (_reg_name(b9), _reg_name(ba))
    elif op_code == 0x14:
        instr_text = '*(%s + 0x%x) = %s' % (_reg_name(ba), imm, _reg_name(bb))
    elif op_code == 0x28:
        instr_text = 'call 0x%x' % (target)
    elif op_code == 0x27:
        instr_text = 'cmp %s, 0; jne 0x%x' % (_reg_name(ba), target)
    elif op_code == 0xa:
        instr_text = 'mov %s, byte* (%s + 0x%x)' % (_reg_name(b9), _reg_name(ba), imm)
    elif op_code == 0x1:
        instr_text = 'add %s, %s, %s' % (_reg_name(b9), _reg_name(bb), _reg_name(ba))
    elif op_code == 0x8:
        instr_text = '%s = neg %s' % (_reg_name(b9), _reg_name(ba))
    elif op_code == 0x17:
        instr_text = 'pop %s' % (_reg_name(b9))
    elif op_code == 0x26:
        instr_text = 'cmp %s, 0x0; je 0x%x modified' % (_reg_name(ba), target)
    elif op_code == 0x29:
        instr_text = 'ret'
    elif op_code == 0x2b:
        instr_text = 'cmp %s, 0x0; if equal ret' % (_reg_name(ba))
    elif op_code == 0x2e:
        instr_text = 'shl %s, %s, 0x%x' % (_reg_name(b9), _reg_name(ba), imm)
    elif op_code == 2:
        instr_text = 'sub %s, %s, %s' % (_reg_name(b9), _reg_name(ba), _reg_name(bb))
    elif op_code == 3:
        instr_text = 'imul %s, %s, %s' % (_reg_name(b9), _reg_name(bb), _reg_name(ba))
    elif op_code == 0x1b:
        instr_text = 'xor %s, %s, %s' % (_reg_name(b9), _reg_name(ba), _reg_name(bb))
    elif op_code == 0x19:
        instr_text = 'or %s, %s, %s' % (_reg_name(b9), _reg_name(ba), _reg_name(bb))
    elif op_code == 0x21:
        instr_text = 'cmp %s, %s; sete %s' % (_reg_name(bb), _reg_name(ba), _reg_name(b9))
    else:
        instr_text = 'opcode used: 0x%x' % op_code

    return instr_len, instr_text, possible_next_addrs


def main():
    """Read mem.txt and print its disassembly starting at offset 0."""
    # Imported here so the decoding helpers above can be imported (and
    # tested) without the driver module being on the path.
    from VM_Disassembler import VM_Disassembler

    with open('mem.txt', 'rb') as mem_file:
        raw = mem_file.read()
    # bytearray yields ints on both Python 2 and Python 3
    vm_code = list(bytearray(raw))

    # disassembler(addr, data) decodes the data at addr and returns:
    #   1) the length of the current instruction;
    #   2) the disassembly text of the current instruction;
    #   3) the list of possible next addresses.
    # If the current instruction is not a branch, the only possible next
    # address is probably the one right after it; if it is a branch, there
    # might be two possible next addresses.
    vm_dis = VM_Disassembler(vm_code, disassembler, 0, 0xe)
    vm_dis.disassemble()


if __name__ == '__main__':
    main()