├── mem.txt
├── LICENSE
├── README.md
├── VM_Disassembler.py
├── .gitignore
└── example.py


/mem.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeffli678/VM_Disassembler/HEAD/mem.txt


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Xusheng Li
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # VM_Disassembler
 2 | A recursive disassembler written in Python. Best suited for VMs in CTFs. 
 3 | 
 4 | See example.py for a disassembler for https://crackmes.one/crackme/5bc0fe0033c5d4110a29b296. 
 5 | 
 6 | The VM_Disassembler class should be initialized with four parameters:
 7 | 
 8 | ```Python
 9 | vm_dis = VM_Disassembler(vm_code, disassembler, entry_point, look_ahead_len)
10 | ```
11 | 
12 | Where ```vm_code``` is the code to disassemble (a list of integer bytes); ```disassembler``` is the disassembler function (see below); ```entry_point``` is the offset of the first instruction; ```look_ahead_len``` specifies how many bytes to consider when disassembling the current instruction. This can be set to the max length of an instruction. Leave it at 0 if unsure. 
13 | 
14 | The user only needs to write a disassembler() function which disassembles one instruction. The VM_Disassembler will handle everything else.
15 | 
16 | ```Python
17 | def disassembler(addr, data):
18 |     # hard work here
19 |     return instr_len, instr_text, possible_next_addrs
20 | ```
21 | 
22 | disassembler() takes two parameters:
23 | 
24 | ```addr```: the current address
25 | 
26 | ```data```: the data to disassemble
27 | 
28 | It should return three things:
29 | 
30 | ```instr_len```: the length of the current instruction
31 | 
32 | ```instr_text```: the disassembly text of the current instruction
33 | 
34 | ```possible_next_addrs```: a list of possible next addresses after the current instruction
35 | 
36 | I hope this will be helpful for your next CTF. Issues and PRs are welcome! 


--------------------------------------------------------------------------------
/VM_Disassembler.py:
--------------------------------------------------------------------------------
 1 | class VM_Disassembler:
 2 |     
 3 |     def __init__(self, code, disassembler, entry_point, look_ahead_len = 0):
 4 |         # code is a list of ints 
 5 |         self.code = code
 6 |         # this the core disassembler function
 7 |         # the user needs to write it
 8 |         self.disassembler = disassembler
 9 |         self.entry_point = entry_point
10 | 
11 | 		# how many bytes to check when disassembling the current instruction
12 | 		# this can be set to the max length of an instruction
13 | 		# leave it to 0 if unsure
14 |         self.look_ahead_len = look_ahead_len
15 |         
16 |         # the set of addresses that we have already processed
17 |         self.disassembled_addr = set()
18 |         # the list of addresses to process
19 |         self.disassemble_queue = [self.entry_point]
20 | 
21 | 	# utility 
22 |     def format_bytes(self, data):
23 |         s = ''
24 |         for c in data:
25 |             s += '%02x' % c
26 |         return s
27 | 
28 | 	# the core function for the recursive disassembly
29 |     def disassemble(self):
30 | 		# check whethe we have adress to disassemble
31 |         while len(self.disassemble_queue) > 0:
32 | 
33 |             addr = self.disassemble_queue.pop()
34 |             if addr in self.disassembled_addr:
35 |                 continue
36 |             else:
37 |                 self.disassembled_addr.add(addr)
38 | 
39 |             if addr >= len(self.code):
40 | 				# there is probably an error in the disassembler
41 | 				# but for now we just ignore it
42 |                 continue
43 |             
44 | 			# prepare the data and send it to self.disassembler()
45 |             if self.look_ahead_len == 0:
46 |                 data_to_parse = self.code[addr : ]
47 |             else:
48 |                 data_to_parse = self.code[addr : addr + self.look_ahead_len]
49 |             instr_len, instr_text, possible_next_instrs = \
50 |                 self.disassembler(addr, data_to_parse)
51 |             
52 | 			# put every next possible addresses into the queue
53 |             for next_addr in possible_next_instrs:
54 |                 self.disassemble_queue.append(next_addr)
55 | 
56 | 			# print the address, raw bytes, and the disassembly text of the current instruction
57 |             instr_bytes = self.code[addr : addr + instr_len]
58 |             print('0x%x %s %s' % (addr, self.format_bytes(instr_bytes), instr_text))
59 |         
60 |         print('Pasring done! Good luck with reversing')
61 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
  1 | from VM_Disassembler import VM_Disassembler
  2 | import struct
  3 | from io import open
  4 | 
  5 | def bytes_to_word(val_bytes):
  6 |     return val_bytes[0] + 0x100 * val_bytes[1]
  7 | 
  8 | def bytes_to_int(val_bytes):
  9 | 	val = 0
 10 | 	for byte_val in val_bytes[::-1]:
 11 | 		val *= 0x100
 12 | 		val += byte_val
 13 | 	return val
 14 | 
 15 | def offset_to_addr(offset):
 16 | 	vm_start = 0x1190
 17 | 	return offset - vm_start
 18 | 
 19 | regs = ['r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 
 20 | 		'rdi', 'rsi', 'rbp', 'rbx', 'rdx', 'rax', 'rcx', 'rsp', 'rip', 'eflags']
 21 | 
 22 | def disassembler(addr, data):
 23 | 
 24 | 	instr_len = 0xe
 25 | 	possible_next_addrs = [addr + instr_len]
 26 | 	# skip the invalid ud2 instruciton in the beginning
 27 | 	data = data[2 : ]
 28 | 	op_code = data[8]
 29 | 
 30 | 	if op_code == 9:
 31 | 		op1_data = data[0 : 8]
 32 | 		op1 = bytes_to_int(op1_data)
 33 | 		addr = offset_to_addr(op1)
 34 | 		op2 = data[9]
 35 | 		if op2 == 0x10:
 36 | 			instr_text = 'jmp 0x%x' % (addr)
 37 | 		else:
 38 | 			instr_text = '%s = 0x%x' % (regs[op2], op1)
 39 | 	elif op_code == 0x24:
 40 | 		op1_data = data[0 : 8]
 41 | 		op1 = bytes_to_int(op1_data)
 42 | 		op2 = data[9]
 43 | 		op3 = data[0xa]
 44 | 		instr_text = 'cmp %s, 0x%x' % (regs[op3], op1)
 45 | 		instr_text += '; setne %s' % (regs[op2])
 46 | 	elif op_code == 0x2a:
 47 | 		op1 = data[0xa]
 48 | 		instr_text = 'cmp %s, 0x0; ifne ret' % (regs[op1])
 49 | 	elif op_code == 0x2c:
 50 | 		op1_data = data[0 : 8]
 51 | 		op1 = bytes_to_int(op1_data)
 52 | 		op2 = data[9]
 53 | 		op3 = data[0xa]
 54 | 		instr_text = 'add %s, %s, 0x%x' % (regs[op2], regs[op3], op1)
 55 | 	elif op_code == 0x10:
 56 | 		op1_data = data[0 : 8]
 57 | 		op1 = bytes_to_int(op1_data)
 58 | 		op2 = data[9]
 59 | 		op3 = data[0xa]
 60 | 		instr_text = 'mov %s, *(%s + 0x%x)' % (regs[op2], regs[op3], op1)
 61 | 	elif op_code == 0x15:
 62 | 		# op2 = data[9]
 63 | 		op3 = data[0xa]
 64 | 		instr_text = 'push %s' % (regs[op3])
 65 | 	elif op_code == 0x18:
 66 | 		op2 = data[9]
 67 | 		op3 = data[0xa]
 68 | 		instr_text = 'mov %s, %s' % (regs[op2], regs[op3])
 69 | 	elif op_code == 0x14:
 70 | 		op1_data = data[0 : 8]
 71 | 		op1 = bytes_to_int(op1_data)
 72 | 		op2 = data[0xa]
 73 | 		op3 = data[0xb]
 74 | 		instr_text = '*(%s + 0x%x) = %s' % (regs[op2], op1, regs[op3])
 75 | 	elif op_code == 0x28:
 76 | 		op1_data = data[0 : 8]
 77 | 		op1 = bytes_to_int(op1_data)		
 78 | 		addr = offset_to_addr(op1)
 79 | 		instr_text = 'call 0x%x' % (addr)
 80 | 	elif op_code == 0x27:
 81 | 		op1_data = data[0 : 8]
 82 | 		op1 = bytes_to_int(op1_data)
 83 | 		op2 = data[0xa]
 84 | 		addr = offset_to_addr(op1)
 85 | 		instr_text = 'cmp %s, 0; jne 0x%x' % (regs[op2], addr)
 86 | 	elif op_code == 0xa:
 87 | 		op1_data = data[0 : 8]
 88 | 		op1 = bytes_to_int(op1_data)
 89 | 		op2 = data[0xa]
 90 | 		op3 = data[0x9]
 91 | 		instr_text = 'mov %s, byte* (%s + 0x%x)' % (regs[op3], regs[op2], op1)
 92 | 	elif op_code == 0x1:
 93 | 		op1 = data[0xa]
 94 | 		op2 = data[0xb]
 95 | 		op3 = data[0x9]
 96 | 		instr_text = 'add %s, %s, %s' % (regs[op3], regs[op2], regs[op1])
 97 | 	elif op_code == 0x8:
 98 | 		op1 = data[0xa]
 99 | 		op2 = data[0x9]
100 | 		instr_text = '%s = neg %s' % (regs[op2], regs[op1])
101 | 	elif op_code == 0x17:
102 | 		op1 = data[0x9]
103 | 		instr_text = 'pop %s' % (regs[op1])
104 | 	elif op_code == 0x26:
105 | 		op1 = data[0xa]
106 | 		op2_data = data[0 : 8]
107 | 		op2 = bytes_to_int(op2_data)
108 | 		addr = offset_to_addr(op2)
109 | 		instr_text = 'cmp %s, 0x0; je 0x%x modified' % (regs[op1], addr)
110 | 	elif op_code == 0x29:
111 | 		instr_text = 'ret'
112 | 	elif op_code == 0x2b:
113 | 		op1 = data[0xa]
114 | 		instr_text = 'cmp %s, 0x0; if equal ret' % (regs[op1])
115 | 	elif op_code == 0x2e:
116 | 		op1 = data[0xa]
117 | 		op3 = data[0x9]
118 | 		op2_data = data[0 : 8]
119 | 		op2 = bytes_to_int(op2_data)
120 | 		instr_text = 'shl %s, %s, 0x%x' % (regs[op3], regs[op1], op2)
121 | 	elif op_code == 2:
122 | 		op1 = data[0xa]
123 | 		op2 = data[0xb]
124 | 		op3 = data[0x9]
125 | 		instr_text = 'sub %s, %s, %s' % (regs[op3], regs[op1], regs[op2])
126 | 	elif op_code == 3:
127 | 		op1 = data[0xa]
128 | 		op2 = data[0xb]
129 | 		op3 = data[0x9]
130 | 		instr_text = 'imul %s, %s, %s' % (regs[op3], regs[op2], regs[op1])
131 | 	elif op_code == 0x1b:
132 | 		op1 = data[0xa]
133 | 		op2 = data[0xb]
134 | 		op3 = data[0x9]
135 | 		instr_text = 'xor %s, %s, %s' % (regs[op3], regs[op1], regs[op2])
136 | 	elif op_code == 0x19:
137 | 		op1 = data[0xa]
138 | 		op2 = data[0xb]
139 | 		op3 = data[0x9]
140 | 		instr_text = 'or %s, %s, %s' % (regs[op3], regs[op1], regs[op2])
141 | 	elif op_code == 0x21:
142 | 		op1 = data[0xa]
143 | 		op2 = data[0xb]
144 | 		op3 = data[0x9]
145 | 		instr_text = 'cmp %s, %s; sete %s' % (regs[op2], regs[op1], regs[op3])
146 | 	else:
147 | 		instr_text = 'opcode used: 0x%x' % op_code
148 | 
149 | 	return instr_len, instr_text, possible_next_addrs
150 | 
def main():
    """Disassemble the VM code stored in ``mem.txt`` and print the listing.

    Reads the file as raw bytes, converts it to a list of ints, and runs
    ``VM_Disassembler`` over it from address 0 with a 0xe-byte look-ahead.
    """
    # Use a context manager so the file handle is closed even on error.
    with open('mem.txt', 'rb') as code_file:
        vm_code = list(code_file.read())

    try:
        # Python 2-3 compatibility: on Python 2 the list holds one-character
        # strings that must be converted with ord(); on Python 3 it already
        # holds ints, for which ord() raises TypeError.
        vm_code = [ord(c) for c in vm_code]
    except TypeError:
        pass

    # We need to write a disassembler(addr, data) that disassembles the data
    # at addr and returns: 1) the length of the current instruction; 2) the
    # disassembly text of the current instruction; 3) the list of possible
    # next addresses.
    # If the current instruction is not a branch, the possible next address
    # is probably just the address after this instruction; if it is a
    # branch, there might be two possible next addresses.
    vm_dis = VM_Disassembler(vm_code, disassembler, 0, 0xe)
    vm_dis.disassemble()
164 | 
# Run the example disassembly only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------