├── .gitignore ├── README.rst ├── decompile.py └── test_decompile.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Python Decompiler 2 | ================= 3 | 4 | This project aims to create a comprehensive decompiler for CPython bytecode 5 | (likely works with PyPy as well, and any other Python implementation that uses 6 | CPython's bytecode). At the moment it is relatively incomplete, with many 7 | things not supported, including, but certainly not limited to: 8 | 9 | * Unpacking 10 | * try/except/finally 11 | * else clauses on try/for loops 12 | * any sort of arithmetic 13 | * keyword arguments and *args, **kwargs to functions 14 | 15 | I'm taking patches, but I suspect at least some of those will require 16 | refactorings and I've grown a tad too bored to do it myself. 
# /decompile.py
import contextlib
import opcode


# Sentinel for "this instruction has no argument".  None cannot be used for
# that purpose because None is itself a legal decoded argument (for example
# the constant carried by LOAD_CONST).
NO_ARG = object()


class Instruction(object):
    """A single decoded bytecode instruction.

    op       -- numeric opcode
    arg      -- decoded argument, or NO_ARG when the opcode takes none
    real_idx -- byte offset of the instruction inside ``co_code``
    new_idx  -- position of the instruction in the parsed instruction list
    """

    def __init__(self, op, arg=NO_ARG, real_idx=None, new_idx=None):
        self.op = op
        self.arg = arg
        self.real_idx = real_idx
        self.new_idx = new_idx

    @property
    def opname(self):
        """Symbolic name of ``op`` as listed in ``opcode.opname``."""
        return opcode.opname[self.op]

    def __repr__(self):
        if self.arg is NO_ARG:
            return self.opname
        # Wrap the argument in an explicit tuple so that a tuple-valued
        # argument is formatted as one value instead of being unpacked by %.
        return "%s(%s)" % (self.opname, self.arg)

    def __eq__(self, other):
        """Compare by opcode name and argument only; indices are ignored."""
        if not isinstance(other, Instruction):
            return NotImplemented
        return self.opname == other.opname and self.arg == other.arg

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    def finish(self, instructions):
        """Turn a jump target from a byte offset into an instruction index.

        Does nothing for opcodes that are not jumps.  Raises AssertionError
        when no instruction in ``instructions`` starts at the target offset.
        """
        if self.op not in opcode.hasjabs and self.op not in opcode.hasjrel:
            return
        for idx, instr in enumerate(instructions):
            if instr.real_idx == self.arg:
                self.arg = idx
                return
        # Raised explicitly (not via ``assert``) so the check survives -O.
        raise AssertionError(
            "%s targets byte offset %r, which starts no instruction"
            % (self.opname, self.arg)
        )


OPCODE_WITH_CONST = frozenset(opcode.hasconst)
OPCODE_WITH_NAME = frozenset(opcode.hasname)
OPCODE_WITH_LOCAL = frozenset(opcode.haslocal)
OPCODE_WITH_JREL = frozenset(opcode.hasjrel)
# Opcodes whose argument is kept as a plain number.  Names that the running
# interpreter's opcode table does not define are skipped so that importing
# this module never fails with KeyError.
OPCODE_WITH_NUM = frozenset(opcode.hasjabs) | frozenset(
    opcode.opmap[name]
    for name in ("BUILD_LIST", "CALL_FUNCTION")
    if name in opcode.opmap
)


def parse_bytecode(code):
    """Decode ``code.co_code`` into a list of Instruction objects.

    The byte string is read as one opcode byte, followed (for opcodes at or
    above ``opcode.HAVE_ARGUMENT``) by a two-byte little-endian argument.
    Arguments are resolved against ``co_consts``, ``co_names`` and
    ``co_varnames``; jump targets end up as indices into the returned list.

    Raises NotImplementedError for an argument-carrying opcode that belongs
    to none of the known categories.
    """
    # bytearray yields integers on indexing for both str (Python 2) and
    # bytes (Python 3) input and supports len(), unlike a lazy map().
    bytecodes = bytearray(code.co_code)
    instructions = []
    i = 0
    while i < len(bytecodes):
        opidx = i
        op = bytecodes[i]
        i += 1
        arg = NO_ARG
        if op >= opcode.HAVE_ARGUMENT:
            oparg = bytecodes[i] + (bytecodes[i + 1] << 8)
            i += 2
            if op in OPCODE_WITH_CONST:
                arg = code.co_consts[oparg]
            elif op in OPCODE_WITH_NAME:
                arg = code.co_names[oparg]
            elif op in OPCODE_WITH_NUM:
                arg = oparg
            elif op in OPCODE_WITH_JREL:
                # Relative to the following instruction; make it absolute.
                arg = i + oparg
            elif op in OPCODE_WITH_LOCAL:
                arg = code.co_varnames[oparg]
            else:
                raise NotImplementedError(
                    "cannot decode the argument of %s" % opcode.opname[op]
                )
        instructions.append(Instruction(op, arg, opidx, len(instructions)))
    # Jump targets can only be remapped once every instruction is known.
    for instr in instructions:
        instr.finish(instructions)
    return instructions


class BasicBlock(object):
    """A run of instructions collected by BasicBlockFinder."""

    def __init__(self):
        self.instructions = []


class BasicBlockFinder(object):
    """Splits a parsed instruction list into linked BasicBlock objects.

    Blocks are linked by attributes stored on the branching instructions
    themselves (``true_block``, ``false_block``, ``fallthrough``, ``loop``).
    """

    def __init__(self, instructions):
        self.instructions = instructions
        self.starting_basic_block = BasicBlock()
        # Maps an instruction index to the block that must start there.
        self.pending_basic_blocks = {0: self.starting_basic_block}

    def find_basic_blocks(self):
        """Assign every instruction to a block and return the first block."""
        current_basic_block = None
        for instr in self.instructions:
            if instr.new_idx in self.pending_basic_blocks:
                current_basic_block = self.pending_basic_blocks.pop(
                    instr.new_idx
                )
            # Looked up before appending, so an unsupported opcode fails with
            # AttributeError without being added to the block.
            handler = getattr(self, "handle_%s" % instr.opname)
            current_basic_block.instructions.append(instr)
            handler(instr)
        return self.starting_basic_block

    def get_basic_block(self, idx):
        """Return the pending block starting at ``idx``, creating it if new."""
        # Could you try to get a block that was already completed?
        return self.pending_basic_blocks.setdefault(idx, BasicBlock())

    def handle_simple_op(self, instr):
        """Opcodes that neither start nor link a block."""
        pass

    handle_LOAD_CONST = handle_simple_op
    handle_LOAD_FAST = handle_simple_op
    handle_LOAD_ATTR = handle_simple_op
    handle_LOAD_GLOBAL = handle_simple_op
    handle_STORE_FAST = handle_simple_op
    handle_STORE_SUBSCR = handle_simple_op
    handle_BUILD_LIST = handle_simple_op
    handle_CALL_FUNCTION = handle_simple_op
    handle_SETUP_LOOP = handle_simple_op
    handle_JUMP_ABSOLUTE = handle_simple_op
    handle_POP_BLOCK = handle_simple_op
    handle_NOP = handle_simple_op
    handle_RETURN_VALUE = handle_simple_op
    handle_POP_TOP = handle_simple_op

    def handle_POP_JUMP_IF_FALSE(self, instr):
        instr.true_block = self.get_basic_block(instr.new_idx + 1)
        instr.false_block = self.get_basic_block(instr.arg)

    def handle_JUMP_FORWARD(self, instr):
        instr.fallthrough = self.get_basic_block(instr.arg)

    def handle_GET_ITER(self, instr):
        # TODO: broken, in oh so many ways
        # Takes the loop variable from the instruction two slots ahead and
        # neutralises that instruction by rewriting its opcode to NOP.
        target = self.instructions[instr.new_idx + 2]
        instr.loop_var = target.arg
        target.op = opcode.opmap["NOP"]
        instr.loop = self.get_basic_block(instr.new_idx + 1)

    def handle_FOR_ITER(self, instr):
        instr.fallthrough = self.get_basic_block(instr.arg)


class AddBasicBlock(Exception):
    """Exception carrying a basic block in its ``block`` attribute."""

    def __init__(self, block):
        self.block = block


class Interpreter(object):
    """Walks basic blocks and renders them as source lines.

    ``buf`` holds the rendered expressions that have been pushed but not yet
    consumed; ``ops`` holds the finished, indented statement lines.
    """

    def __init__(self, basic_block, indent_level=1):
        self.basic_blocks = [basic_block]
        self.ops = []
        self.buf = []
        self.indent_level = indent_level

    def get_and_clear_buf(self, expected_len):
        """Return all pending expressions and empty the buffer.

        Raises AssertionError unless exactly ``expected_len`` are pending.
        """
        if len(self.buf) != expected_len:
            # Explicit raise instead of ``assert`` so the check survives -O.
            raise AssertionError(
                "expected %d pending expression(s), found %d"
                % (expected_len, len(self.buf))
            )
        pending = self.buf[:]
        del self.buf[:]
        return pending

    def emit(self, op):
        """Append a statement line, or the rendered output of a sub-interpreter."""
        if isinstance(op, Interpreter):
            self.ops.append(op.evaluate())
        else:
            # NOTE(review): the indent unit is a single space exactly as it
            # appears in the reviewed text; whitespace there looked collapsed,
            # so confirm the intended width against the upstream file.
            self.ops.append(" " * self.indent_level + op)

    @contextlib.contextmanager
    def indent(self):
        """Context manager that indents emitted lines one level deeper."""
        self.indent_level += 1
        try:
            yield
        finally:
            self.indent_level -= 1

    def evaluate(self):
        """Render every queued basic block and return the joined lines."""
        while self.basic_blocks:
            self.handle_basic_block(self.basic_blocks.pop())
        return "\n".join(self.ops)

    def handle_basic_block(self, basic_block):
        for instr in basic_block.instructions:
            getattr(self, "handle_%s" % instr.opname)(instr)

    def handle_NOP(self, instr):
        pass

    handle_SETUP_LOOP = handle_NOP
    handle_POP_BLOCK = handle_NOP

    def handle_literal(self, instr):
        """Push a name verbatim."""
        self.buf.append(str(instr.arg))

    handle_LOAD_FAST = handle_literal
    handle_LOAD_GLOBAL = handle_literal

    def handle_LOAD_CONST(self, instr):
        # repr(), not str(): a string constant must keep its quotes to be
        # valid source.  Numbers and None render the same either way.
        self.buf.append(repr(instr.arg))

    def handle_LOAD_ATTR(self, instr):
        [obj] = self.get_and_clear_buf(1)
        self.buf.append("%s.%s" % (obj, instr.arg))

    def handle_BUILD_LIST(self, instr):
        # The argument is the element count; the elements are the most
        # recently pushed expressions.  A count of 0 renders "[]".
        split = len(self.buf) - instr.arg
        if split < 0:
            raise AssertionError(
                "BUILD_LIST needs %d element(s), only %d pending"
                % (instr.arg, len(self.buf))
            )
        items = self.buf[split:]
        del self.buf[split:]
        self.buf.append("[%s]" % ", ".join(items))

    def handle_STORE_FAST(self, instr):
        [obj] = self.get_and_clear_buf(1)
        self.emit("%s = %s" % (instr.arg, obj))

    def handle_STORE_SUBSCR(self, instr):
        # Push order is value, container, index.
        value, obj, idx = self.get_and_clear_buf(3)
        self.emit("%s[%s] = %s" % (obj, idx, value))

    def handle_GET_ITER(self, instr):
        [obj] = self.get_and_clear_buf(1)
        self.emit("for %s in %s:" % (instr.loop_var, obj))
        with self.indent():
            self.handle_basic_block(instr.loop)

    def handle_FOR_ITER(self, instr):
        # Queue the code after the loop; it is rendered once the body is done.
        self.basic_blocks.append(instr.fallthrough)

    def handle_CALL_FUNCTION(self, instr):
        # The callee is pushed first, followed by instr.arg arguments.
        pending = self.get_and_clear_buf(instr.arg + 1)
        func, args = pending[0], pending[1:]
        self.buf.append("%s(%s)" % (func, ", ".join(args)))

    def handle_RETURN_VALUE(self, instr):
        [obj] = self.get_and_clear_buf(1)
        self.emit("return %s" % obj)

    def handle_POP_TOP(self, instr):
        # An expression whose value is discarded is an expression statement.
        [obj] = self.get_and_clear_buf(1)
        self.emit(obj)

    def handle_POP_JUMP_IF_FALSE(self, instr):
        [obj] = self.get_and_clear_buf(1)
        self.emit("if %s:" % obj)
        with self.indent():
            self.handle_basic_block(instr.true_block)
        self.emit("else:")
        with self.indent():
            self.handle_basic_block(instr.false_block)

    def handle_JUMP_FORWARD(self, instr):
        self.basic_blocks.append(instr.fallthrough)

    def handle_JUMP_ABSOLUTE(self, instr):
        self.emit("continue")


def decompile(function):
    """Return source text for ``function`` reconstructed from its bytecode.

    The result is a ``def`` header built from ``__name__`` and the first
    ``co_argcount`` entries of ``co_varnames``, followed by the rendered body.
    """
    code = function.__code__
    instructions = parse_bytecode(code)
    start_bblock = BasicBlockFinder(instructions).find_basic_blocks()
    body = Interpreter(start_bblock).evaluate()
    header = "def %(name)s(%(args)s):\n" % {
        "name": function.__name__,
        "args": ", ".join(code.co_varnames[:code.co_argcount]),
    }
    return header + body
# /test_decompile.py
import opcode
import textwrap

from decompile import decompile, parse_bytecode, Instruction


# NOTE(review): indentation inside the expected-output literals below was
# reconstructed with conventional four-space nesting; the reviewed text had
# its whitespace collapsed, so confirm it against the decompiler's indent unit.
class TestDecompilation(object):
    """End-to-end checks: decompile a function and compare the source text."""

    def assert_decompiles(self, func, expected):
        """Assert that decompile(func) equals the dedented ``expected`` text."""
        result = decompile(func)
        # Dedent and trim the surrounding newlines so the expected source can
        # be written as an indented triple-quoted block.
        expected = textwrap.dedent(expected).strip("\n")
        assert result == expected

    # The nested ``f`` functions are the fixtures themselves: their compiled
    # bytecode is what gets decompiled, so their bodies must stay as written.
    def test_simple(self):
        def f():
            return 1

        self.assert_decompiles(f, """
            def f():
                return 1
        """)

    def test_branch(self):
        def f():
            if z:
                return 1
            else:
                return 2

        # This needs some dead code elimination applied to it.
        self.assert_decompiles(f, """
            def f():
                if z:
                    return 1
                else:
                    return 2
                return None
        """)

    def test_more_branchy_stuff(self):
        def f():
            if z:
                x
            else:
                y
            return 2

        self.assert_decompiles(f, """
            def f():
                if z:
                    x
                else:
                    y
                return 2
        """)

    def test_simple_parameters(self):
        def f(a):
            return a

        self.assert_decompiles(f, """
            def f(a):
                return a
        """)

    def test_list_ops(self):
        def f():
            x = []
            x.append(1)
            x[0] = 3

        self.assert_decompiles(f, """
            def f():
                x = []
                x.append(1)
                x[0] = 3
                return None
        """)

    def test_simple_for_loop(self):
        def f(x):
            for i in x:
                pass

        # TODO: continue should become pass where possible
        self.assert_decompiles(f, """
            def f(x):
                for i in x:
                    continue
                return None
        """)

class TestBytecodeParser(object):
    """Checks the instruction list produced by parse_bytecode."""

    def assert_bytecode(self, func, expected):
        """Assert that func's bytecode parses to the ``expected`` tuples.

        Each tuple is ``(opname,)`` or ``(opname, arg)``; it is turned into an
        Instruction, whose equality compares opcode name and argument only.
        """
        instructions = parse_bytecode(func.__code__)
        expected = [
            Instruction(opcode.opmap[args[0]], *args[1:])
            for args in expected
        ]
        assert instructions == expected

    def test_simple(self):
        def f():
            return 1

        self.assert_bytecode(f, [
            ("LOAD_CONST", 1),
            ("RETURN_VALUE",),
        ])

    def test_load_global(self):
        def f():
            return z

        self.assert_bytecode(f, [
            ("LOAD_GLOBAL", "z"),
            ("RETURN_VALUE",),
        ])

    def test_simple_branch(self):
        def f():
            if z:
                return 1
            else:
                return 2

        # Jump arguments are indices into the instruction list (here 4),
        # not byte offsets.
        self.assert_bytecode(f, [
            ("LOAD_GLOBAL", "z"),
            ("POP_JUMP_IF_FALSE", 4),
            ("LOAD_CONST", 1),
            ("RETURN_VALUE",),
            ("LOAD_CONST", 2),
            ("RETURN_VALUE",),
            ("LOAD_CONST", None),
            ("RETURN_VALUE",)
        ])

    def test_jump_forward(self):
        def f():
            if z:
                x
            else:
                y
            return 1

        self.assert_bytecode(f, [
            ("LOAD_GLOBAL", "z"),
            ("POP_JUMP_IF_FALSE", 5),
            ("LOAD_GLOBAL", "x"),
            ("POP_TOP",),
            ("JUMP_FORWARD", 7),
            ("LOAD_GLOBAL", "y"),
            ("POP_TOP",),
            ("LOAD_CONST", 1),
            ("RETURN_VALUE",)
        ])

    def test_parameter_name(self):
        def f(a):
            return a

        self.assert_bytecode(f, [
            ("LOAD_FAST", "a"),
            ("RETURN_VALUE",)
        ])