├── tox.ini ├── tests ├── test_low_level_interface.py └── test_noeval_parser.py ├── conftest.py ├── setup.py ├── LICENSE ├── .gitignore ├── README.md └── src └── bap ├── asm.py ├── bap.py ├── __init__.py ├── bil.py ├── arm.py ├── bir.py ├── rpc.py ├── noeval_parser.py └── adt.py /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py3 3 | 4 | [testenv] 5 | changedir=tests 6 | deps=pytest 7 | commands= 8 | py.test --basetemp={envtmpdir} {posargs} 9 | -------------------------------------------------------------------------------- /tests/test_low_level_interface.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import bap 3 | 4 | class TestLowLevelInterface(unittest.TestCase): 5 | 6 | def test_low_level_interface(self): 7 | asm_str = '\n'.join(insn.asm for insn in bap.disasm(b"\x48\x83\xec\x08")) 8 | self.assertIsNotNone(asm_str) 9 | self.assertIn("\tdecl\t%eax", asm_str) 10 | self.assertIn("\tsubl\t$0x8, %esp", asm_str) 11 | 12 | if __name__ == "__main__": 13 | unittest.main() -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | '''pytest configuration module''' 2 | import pytest # pylint: disable=import-error 3 | 4 | # configure setup to skip slow tests by default (without --slow flag) 5 | def pytest_runtest_setup(item): 6 | """Skip tests if they are marked as slow and --slow is not given""" 7 | if getattr(item.obj, 'slow', None) and not item.config.getvalue('slow'): 8 | pytest.skip('slow tests not requested') 9 | 10 | # add '--slow' flag to enable the slow tests, but default to False/disabled 11 | def pytest_addoption(parser): 12 | '''Add --slow option''' 13 | parser.addoption('--slow', action='store_true', default=False, 14 | help='Also run slow tests') 15 | 16 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | from setuptools import setup 4 | 5 | setup ( 6 | name = 'bap', 7 | version = '1.1.0', 8 | description = 'Python bindings to Binary Analysis Platform (BAP)', 9 | author = 'BAP Team', 10 | url = 'https://github.com/BinaryAnalysisPlatform/bap-python', 11 | maintainer = 'Ivan Gotovchits', 12 | maintainer_email = 'ivg@ieee.org', 13 | license = 'MIT', 14 | package_dir = {'' : 'src'}, 15 | packages = ['bap'], 16 | extras_require = { 17 | 'rpc' : ['requests'] 18 | }, 19 | 20 | classifiers=[ 21 | 'Development Status :: 3 - Alpha', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Topic :: Software Development :: Disassemblers', 24 | 'Topic :: Security' 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Carnegie Mellon University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #### joe made this: http://goel.io/joe 2 | #### Python #### 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv/ 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | BAP python bindings 2 | 3 | # Installing 4 | 5 | Install python bindings with pip (after you have installed `bap`): 6 | 7 | ```bash 8 | $ pip install bap 9 | ``` 10 | 11 | Alternatively you can just copy paste files into your project, or clone it 12 | with git-subtree. 13 | 14 | 15 | ## Installing low-level bindings 16 | 17 | An optional low-level interface, called [rpc] depends on the requests 18 | library and the bap-server package. 
To use it, you need to install 19 | them from pip and opam respectively: 20 | 21 | ```bash 22 | $ pip install bap[rpc] 23 | $ opam install bap-server 24 | ``` 25 | 26 | ## Installing development version 27 | 28 | You can also install directly from GitHub: 29 | 30 | ```bash 31 | pip install git+https://github.com/BinaryAnalysisPlatform/bap-python.git 32 | ``` 33 | 34 | # Using 35 | 36 | ```python 37 | >>> import bap 38 | >>> proj = bap.run('/bin/true') 39 | >>> main = proj.program.subs.find('main') 40 | >>> entry = main.blks[0] 41 | >>> next = main.blks.find(entry.jmps[0].target.arg) 42 | ``` 43 | 44 | For more information, read the built-in documentation, for example with 45 | `ipython`: 46 | 47 | ```python 48 | >>> bap? 49 | ``` 50 | 51 | 52 | # Using low-level interface 53 | 54 | The low-level interface provides access to the disassembler and image 55 | loader. It uses an RPC interface to make calls to the library. So make 56 | sure that you have installed `requests` and `bap-server` (see 57 | Installation section). 58 | 59 | 60 | ```python 61 | >>> import bap 62 | >>> print '\n'.join(insn.asm for insn in bap.disasm(b"\x48\x83\xec\x08")) 63 | decl %eax 64 | subl $0x8, %esp 65 | ``` 66 | 67 | A more complex example: 68 | 69 | ```python 70 | >>> img = bap.image('coreutils_O0_ls') 71 | >>> sym = img.get_symbol('main') 72 | >>> print '\n'.join(insn.asm for insn in bap.disasm(sym)) 73 | push {r11, lr} 74 | add r11, sp, #0x4 75 | sub sp, sp, #0xc8 76 | ... ... 
def exists(cont, f):
    """Return True if ``f(x)`` is truthy for at least one ``x`` in ``cont``.

    ``cont`` may be any iterable; it is consumed lazily and iteration
    stops at the first element that satisfies the predicate.  An empty
    iterable yields False.

    The previous implementation called ``.next()`` on a generator, which
    exists only on Python 2 and raises AttributeError on Python 3.
    ``any`` has the same short-circuit behaviour on both versions.
    """
    return any(f(x) for x in cont)
68 | 69 | Example: 70 | 71 | >>> proj = run('/bin/true') 72 | 73 | To specify extra command line arguments, pass them as a list: 74 | 75 | >>> proj = run('/bin/true', ['--no-cache', '--symbolizer=ida']) 76 | 77 | To specify an explicit path to the `bap` executable use the `bap` keyword 78 | argument: 79 | 80 | >>> proj = run('/bin/true', bap='/usr/bin/bap') 81 | 82 | 83 | By default a project data structure is dumped in ADT format and 84 | loaded into the `bir.Project` data structure. To parse other formats, 85 | a parser argument can be specified. It must be a dictionary that 86 | may contain the following two fields: 87 | 88 | - `format` - a format name as accepted by bap's `--dump` option, 89 | it will be passed to bap. 90 | - `load` - a function that parses the output. 91 | 92 | 93 | In case of errors, the `load` function must raise a `SyntaxError` 94 | exception. Example: 95 | 96 | >>> version = run('/bin/true', parser={'load' : str.strip}) 97 | 98 | If `parser` is `None` or if it doesn't provide a `load` function, 99 | then the program output is returned as is. 100 | 101 | 102 | Exceptions 103 | ---------- 104 | 105 | Will pass through exceptions from the underlying subprocess module, 106 | with OSError being the most common one. If everything went fine on 107 | the system level, then may raise MalformedOutput (wrapping the SyntaxError raised by `load`) at the parsing step. 108 | Also may raise Failed or Killed exceptions if the return code 109 | wasn't zero. 
110 | 111 | 112 | """ 113 | opts = [bap, path] + args 114 | 115 | if parser and 'format' in parser: 116 | opts += ['-d{format}'.format(**parser)] 117 | 118 | bap = Popen(opts, stdout=PIPE, stderr=PIPE) 119 | out,err = bap.communicate() 120 | 121 | if bap.returncode == 0: 122 | try: 123 | if parser and 'load' in parser: 124 | return parser['load'](out) 125 | else: 126 | return out 127 | except SyntaxError as exn: 128 | raise MalformedOutput(exn, opts, out, err) 129 | elif bap.returncode < 0: 130 | raise Killed(-bap.returncode, opts, out, err) 131 | else: 132 | raise Failed(bap.returncode, opts, out, err) 133 | -------------------------------------------------------------------------------- /src/bap/__init__.py: -------------------------------------------------------------------------------- 1 | r"""Python inteface to BAP. 2 | 3 | 4 | Porcelain Interace 5 | ================== 6 | 7 | The high level interface allows to run ``bap`` and get back the information 8 | that we were able to infer from the file. It consists only from one function, 9 | ``bap.run``, that will drive ``bap`` for you. It is quite versatile, so read the 10 | documentation for the further information. 11 | 12 | 13 | Example 14 | ------- 15 | 16 | >>> import bap 17 | >>> proj = bap.run('/bin/true', ['--symbolizer=ida']) 18 | >>> text = proj.sections['.text'] 19 | >>> main = proj.program.subs.find('main') 20 | >>> entry = main.blks[0] 21 | >>> next = main.blks.find(entry.jmps[0].target.arg) 22 | 23 | It is recommended to explore the interface using ipython or similiar 24 | interactive toplevels. 25 | 26 | We use ADT syntax to communicate with python. It is a syntactical 27 | subset of Python grammar, so in fact, bap just returns a valid Python 28 | program, that is then evaluated. The ADT stands for Algebraic Data 29 | Type, and is described in ``adt`` module. For non-trivial tasks one 30 | should consider using ``adt.Visitor`` class. 
31 | 32 | 33 | 34 | Plumbing interface [rpc] 35 | ======================== 36 | 37 | The low level interface provides access to internal services. It 38 | uses ``bap-server``, and talks with bap using an RPC protocol. It is in 39 | the extras section and must be installed explicitly with the ``[rpc]`` tag. 40 | 41 | In a few keystrokes: 42 | 43 | >>> import bap 44 | >>> print '\n'.join(insn.asm for insn in bap.disasm("\x48\x83\xec\x08")) 45 | decl %eax 46 | subl $0x8, %esp 47 | 48 | A more complex example: 49 | 50 | >>> img = bap.image('coreutils_O0_ls') 51 | >>> sym = img.get_symbol('main') 52 | >>> print '\n'.join(insn.asm for insn in bap.disasm(sym)) 53 | push {r11, lr} 54 | add r11, sp, #0x4 55 | sub sp, sp, #0xc8 56 | ... ... 57 | 58 | The bap package exposes two functions: 59 | 60 | #. ``disasm`` returns a disassembly of the given object 61 | #. ``image`` loads the given file 62 | 63 | Disassembling things 64 | -------------------- 65 | 66 | ``disasm`` is a Swiss Army knife for disassembling things. It takes either a 67 | string object, or something returned by an ``image`` function, e.g., 68 | images, segments and symbols. 69 | 70 | The ``disasm`` function returns a generator yielding instances of class 71 | ``Insn`` defined in module :mod:`asm`. It has the following attributes: 72 | 73 | * name - instruction name, as the underlying backend names it 74 | * addr - address of the first byte of instruction 75 | * size - overall size of the instruction 76 | * operands - list of instances of class ``Op`` 77 | * asm - assembler string, in native assembler 78 | * kinds - instruction meta properties, see :mod:`asm` 79 | * target - instruction lifted to a target platform, e.g., see :mod:`arm` 80 | * bil - a list of BIL statements, describing instruction semantics. 
81 | 82 | The ``disasm`` function also accepts a bunch of keyword arguments, to name a few: 83 | 84 | * server - either a URL to a bap server or a dictionary containing port 85 | and/or executable name 86 | * arch 87 | * endian (instance of ``bil.Endian``) 88 | * addr (should be an instance of type ``bil.Int``) 89 | * backend 90 | * stop_conditions 91 | 92 | All attributes are self-describing I hope. ``stop_conditions`` is a list of 93 | ``Kind`` instances defined in :mod:`asm`. If the disassembler meets an instruction 94 | that is an instance of one of these kinds, it will stop. 95 | 96 | Reading files 97 | ------------- 98 | 99 | To read and analyze a file one should load it with the ``image`` 100 | function. This function returns an instance of class ``Image`` that 101 | allows one to discover information about the file, and perform different 102 | queries. It has a ``get_symbol`` function to look up a symbol in 103 | the file by name, and the following set of attributes (self describing): 104 | 105 | * arch 106 | * entry_point 107 | * addr_size 108 | * endian 109 | * file (file name) 110 | * segments 111 | 112 | Segments is a list of instances of the ``Segment`` class, that also has a 113 | ``get_symbol`` function and the following attributes: 114 | 115 | * name 116 | * perm (a list of ['r', 'w', 'x']) 117 | * addr 118 | * size 119 | * memory 120 | * symbols 121 | 122 | Symbols is a list of instances of, you guessed it, the ``Symbol`` class, each having the 123 | following attributes: 124 | 125 | * name 126 | * is_function 127 | * is_debug 128 | * addr 129 | * chunks 130 | 131 | Where chunks is a list of instances of the ``Memory`` class, each having the 132 | following attributes: 133 | 134 | * addr 135 | * size 136 | * data 137 | 138 | Where data is the actual string of bytes. 
139 | """ 140 | 141 | from .bap import run 142 | 143 | try : 144 | from .rpc import disasm, image 145 | except ImportError: 146 | pass 147 | -------------------------------------------------------------------------------- /src/bap/bil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """BAP BIL Python representation""" 4 | 5 | from .adt import * 6 | 7 | 8 | class Exp(ADT) : pass # Abstract base for all expressions 9 | class Load(Exp): 10 | "Load(mem,idx,endian,size)" 11 | @property 12 | def mem(self) : return self.arg[0] 13 | @property 14 | def idx(self) : return self.arg[1] 15 | @property 16 | def endian(self): return self.arg[2] 17 | @property 18 | def size(self): return self.arg[3] 19 | 20 | 21 | class Store(Exp): 22 | "Store(mem,idx,val,endian,size)" 23 | @property 24 | def mem(self) : return self.arg[0] 25 | @property 26 | def idx(self) : return self.arg[1] 27 | @property 28 | def value(self): return self.arg[2] 29 | @property 30 | def endian(self): return self.arg[3] 31 | @property 32 | def size(self): return self.arg[4] 33 | 34 | class BinOp(Exp): 35 | "Abstract base for all binary operators" 36 | @property 37 | def lhs(self): return self.arg[0] 38 | @property 39 | def rhs(self): return self.arg[1] 40 | 41 | class UnOp(Exp) : pass # Abstract base for all unary operators 42 | 43 | class Var(Exp) : 44 | "Var(name,type)" 45 | @property 46 | def name(self): return self.arg[0] 47 | @property 48 | def type(self): return self.arg[1] 49 | 50 | class Int(Exp): 51 | "Int(int,size)" 52 | @property 53 | def value(self): return self.arg[0] 54 | @property 55 | def size(self): 56 | "word size in bits" 57 | return self.arg[1] 58 | 59 | class Cast(Exp) : 60 | "Abstract base for all cast operations" 61 | @property 62 | def size(self): return self.arg[0] 63 | @property 64 | def expr(self): return self.arg[1] 65 | 66 | class Let(Exp) : 67 | "Let(var,val,expr)" 68 | @property 69 | def var(self): return 
self.arg[0] 70 | @property 71 | def value(self): return self.arg[1] 72 | @property 73 | def expr(self): return self.arg[2] 74 | 75 | class Unknown(Exp): 76 | "Unknown(string,type)" 77 | @property 78 | def desc(self): return self.arg[0] 79 | @property 80 | def type(self): return self.arg[1] 81 | 82 | class Ite(Exp): 83 | "Ite (cond,if_true,if_false)" 84 | @property 85 | def cond(self): return self.arg[0] 86 | @property 87 | def true(self): return self.arg[1] 88 | @property 89 | def false(self): return self.arg[2] 90 | 91 | class Extract(Exp): 92 | "Extract(hb,lb, exp)" 93 | @property 94 | def high_bit(self): return self.arg[0] 95 | @property 96 | def low_bit(self): return self.arg[1] 97 | @property 98 | def expr(self): return self.arg[2] 99 | 100 | class Concat(Exp): 101 | @property 102 | def lhs(self): return self.arg[0] 103 | @property 104 | def rhs(self): return self.arg[1] 105 | 106 | class Stmt(ADT) : pass # Abstract base for all statements 107 | 108 | class Move(Stmt) : 109 | "Move(var,exp)" 110 | @property 111 | def var(self): return self.arg[0] 112 | @property 113 | def expr(self): return self.arg[1] 114 | 115 | class Jmp(Stmt) : pass # Jmp(exp) 116 | class Special(Stmt): pass # Special (string) 117 | class While(Stmt) : 118 | "While (cond, stmts)" 119 | @property 120 | def cond(self): return self.arg[0] 121 | 122 | @property 123 | def stmts(self): return self.arg[1] 124 | 125 | class If(Stmt) : 126 | "If(cond, yes-exprs, no-exprs)" 127 | @property 128 | def cond(self): return self.arg[0] 129 | @property 130 | def true(self): return self.arg[1] 131 | @property 132 | def false(self): return self.arg[2] 133 | 134 | class CpuExn(Stmt) : pass # CpuExn(n) 135 | 136 | # All BinOps have two operands of type exp 137 | class PLUS (BinOp) : pass 138 | class MINUS (BinOp) : pass 139 | class TIMES (BinOp) : pass 140 | class DIVIDE (BinOp) : pass 141 | class SDIVIDE (BinOp) : pass 142 | class MOD (BinOp) : pass 143 | class SMOD (BinOp) : pass 144 | class LSHIFT (BinOp) : 
pass 145 | class RSHIFT (BinOp) : pass 146 | class ARSHIFT (BinOp) : pass 147 | class AND (BinOp) : pass 148 | class OR (BinOp) : pass 149 | class XOR (BinOp) : pass 150 | class EQ (BinOp) : pass 151 | class NEQ (BinOp) : pass 152 | class LT (BinOp) : pass 153 | class LE (BinOp) : pass 154 | class SLT (BinOp) : pass 155 | class SLE (BinOp) : pass 156 | 157 | # All UnOps have one operand of type exp 158 | class NEG (UnOp) : pass 159 | class NOT (UnOp) : pass 160 | 161 | # All Casts have two operands: (Int(size),exp) 162 | class UNSIGNED(Cast) : pass 163 | class SIGNED(Cast) : pass 164 | class HIGH(Cast) : pass 165 | class LOW(Cast) : pass 166 | 167 | # Endians doesn't have values 168 | class Endian(ADT) : pass 169 | class LittleEndian(Endian) : pass 170 | class BigEndian(Endian) : pass 171 | 172 | class Type(ADT) : pass # Abstract base for expression type 173 | class Imm(Type) : 174 | "Imm(size) - immediate value" 175 | @property 176 | def size(self): return self.arg 177 | 178 | class Mem(Type) : 179 | "Mem(addr_size, value_size)" 180 | @property 181 | def addr_size(self): return self.arg[0] 182 | 183 | @property 184 | def value_size(self): return self.arg[1] 185 | 186 | def loads(s): 187 | return eval(s) 188 | -------------------------------------------------------------------------------- /src/bap/arm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Lifted ARM instruction""" 4 | 5 | from .adt import * 6 | from .asm import * 7 | from .bil import * 8 | 9 | class Reg(ADT) : pass 10 | class Nil(Reg) : pass 11 | class GPR(Reg) : pass 12 | class CCR(Reg) : pass 13 | 14 | class R0(GPR) : pass 15 | class R1(GPR) : pass 16 | class R2(GPR) : pass 17 | class R3(GPR) : pass 18 | class R4(GPR) : pass 19 | class R5(GPR) : pass 20 | class R6(GPR) : pass 21 | class R7(GPR) : pass 22 | class R8(GPR) : pass 23 | class R9(GPR) : pass 24 | class R10(GPR) : pass 25 | class R11(GPR) : pass 26 | class R12(GPR) : 
pass 27 | class LR(GPR) : pass 28 | class PC(GPR) : pass 29 | class SP(GPR) : pass 30 | 31 | class CPSR(CCR) : pass 32 | class SPSR(CCR) : pass 33 | class ITSTATE(CCR) : pass 34 | 35 | class Insn(ADT) : pass 36 | class Move(Insn) : pass 37 | class Bits(Insn) : pass 38 | class Mult(Insn) : pass 39 | class Mem(Insn) : pass 40 | class Branch(Insn) : pass 41 | class Special(Insn) : pass 42 | 43 | class ADCri(Move) : pass 44 | class ADCrr(Move) : pass 45 | class ADCrsi(Move) : pass 46 | class ADCrsr(Move) : pass 47 | class ADDri(Move) : pass 48 | class ADDrr(Move) : pass 49 | class ADDrsi(Move) : pass 50 | class ADDrsr(Move) : pass 51 | class ANDri(Move) : pass 52 | class ANDrr(Move) : pass 53 | class ANDrsi(Move) : pass 54 | class ANDrsr(Move) : pass 55 | class BICri(Move) : pass 56 | class BICrr(Move) : pass 57 | class BICrsi(Move) : pass 58 | class BICrsr(Move) : pass 59 | class CMNri(Move) : pass 60 | class CMNzrr(Move) : pass 61 | class CMNzrsi(Move) : pass 62 | class CMNzrsr(Move) : pass 63 | class CMPri(Move) : pass 64 | class CMPrr(Move) : pass 65 | class CMPrsi(Move) : pass 66 | class CMPrsr(Move) : pass 67 | class EORri(Move) : pass 68 | class EORrr(Move) : pass 69 | class EORrsi(Move) : pass 70 | class EORrsr(Move) : pass 71 | class MOVTi16(Move) : pass 72 | class MOVi(Move) : pass 73 | class MOVi16(Move) : pass 74 | class MOVr(Move) : pass 75 | class MOVsi(Move) : pass 76 | class MOVsr(Move) : pass 77 | class MOVPCLR(Move) : pass 78 | class MVNi(Move) : pass 79 | class MVNr(Move) : pass 80 | class MVNsi(Move) : pass 81 | class MVNsr(Move) : pass 82 | class ORRri(Move) : pass 83 | class ORRrr(Move) : pass 84 | class ORRrsi(Move) : pass 85 | class ORRrsr(Move) : pass 86 | class RSBri(Move) : pass 87 | class RSBrr(Move) : pass 88 | class RSBrsi(Move) : pass 89 | class RSBrsr(Move) : pass 90 | class RSCri(Move) : pass 91 | class RSCrr(Move) : pass 92 | class RSCrsi(Move) : pass 93 | class RSCrsr(Move) : pass 94 | class SBCri(Move) : pass 95 | class SBCrr(Move) : 
pass 96 | class SBCrsi(Move) : pass 97 | class SBCrsr(Move) : pass 98 | class SUBri(Move) : pass 99 | class SUBrr(Move) : pass 100 | class SUBrsi(Move) : pass 101 | class SUBrsr(Move) : pass 102 | class TEQri(Move) : pass 103 | class TEQrr(Move) : pass 104 | class TEQrsi(Move) : pass 105 | class TEQrsr(Move) : pass 106 | class TSTri(Move) : pass 107 | class TSTrr(Move) : pass 108 | class TSTrsi(Move) : pass 109 | class TSTrsr(Move) : pass 110 | 111 | class BFC(Bits) : pass 112 | class BFI(Bits) : pass 113 | class PKHTB(Bits) : pass 114 | class RBIT(Bits) : pass 115 | class SBFX(Bits) : pass 116 | class SWPB(Bits) : pass 117 | class SXTAB(Bits) : pass 118 | class SXTAH(Bits) : pass 119 | class SXTB(Bits) : pass 120 | class SXTH(Bits) : pass 121 | class UBFX(Bits) : pass 122 | class UXTAB(Bits) : pass 123 | class UXTAH(Bits) : pass 124 | class UXTB(Bits) : pass 125 | class UXTH(Bits) : pass 126 | class REV(Bits) : pass 127 | class REV16(Bits) : pass 128 | class CLZ(Bits) : pass 129 | 130 | 131 | class MLA(Mult) : pass 132 | class MLS(Mult) : pass 133 | class MUL(Mult) : pass 134 | class SMLABB(Mult) : pass 135 | class SMLAD(Mult) : pass 136 | class SMLAL(Mult) : pass 137 | class SMLALBT(Mult) : pass 138 | class SMLAWB(Mult) : pass 139 | class SMUAD(Mult) : pass 140 | class SMULBB(Mult) : pass 141 | class SMULL(Mult) : pass 142 | class SMULTB(Mult) : pass 143 | class UMLAL(Mult) : pass 144 | class UMULL(Mult) : pass 145 | 146 | class LDMDA(Mem) : pass 147 | class LDMDA_UPD(Mem) : pass 148 | class LDMDB(Mem) : pass 149 | class LDMDB_UPD(Mem) : pass 150 | class LDMIA(Mem) : pass 151 | class LDMIA_UPD(Mem) : pass 152 | class LDMIB(Mem) : pass 153 | class LDMIB_UPD(Mem) : pass 154 | class STMDA(Mem) : pass 155 | class STMDA_UPD(Mem) : pass 156 | class STMDB(Mem) : pass 157 | class STMDB_UPD(Mem) : pass 158 | class STMIA(Mem) : pass 159 | class STMIA_UPD(Mem) : pass 160 | class STMIB(Mem) : pass 161 | class STMIB_UPD(Mem) : pass 162 | class LDRBT_POST_IMM(Mem) : pass 163 | 
# Marker subclasses of Mem; each body is empty, only the class name differs.
class LDRBT_POST_REG(Mem) : pass
class LDRB_POST_IMM(Mem) : pass
class LDRB_POST_REG(Mem) : pass
class LDRB_PRE_IMM(Mem) : pass
class LDRB_PRE_REG(Mem) : pass
class LDRBi12(Mem) : pass
class LDRBrs(Mem) : pass
class LDRD(Mem) : pass
class LDRD_POST(Mem) : pass
class LDRD_PRE(Mem) : pass
class LDREX(Mem) : pass
class LDREXB(Mem) : pass
class LDREXD(Mem) : pass
class LDREXH(Mem) : pass
class LDRH(Mem) : pass
class LDRHTr(Mem) : pass
class LDRH_POST(Mem) : pass
class LDRH_PRE(Mem) : pass
class LDRSB(Mem) : pass
class LDRSBTr(Mem) : pass
class LDRSB_POST(Mem) : pass
class LDRSB_PRE(Mem) : pass
class LDRSH(Mem) : pass
class LDRSHTi(Mem) : pass
class LDRSHTr(Mem) : pass
class LDRSH_POST(Mem) : pass
class LDRSH_PRE(Mem) : pass
class LDRT_POST_REG(Mem) : pass
class LDR_POST_IMM(Mem) : pass
class LDR_POST_REG(Mem) : pass
class LDR_PRE_IMM(Mem) : pass
class LDR_PRE_REG(Mem) : pass
class LDRi12(Mem) : pass
class LDRrs(Mem) : pass
class STRBT_POST_IMM(Mem) : pass
class STRBT_POST_REG(Mem) : pass
class STRB_POST_IMM(Mem) : pass
class STRB_POST_REG(Mem) : pass
class STRB_PRE_IMM(Mem) : pass
class STRB_PRE_REG(Mem) : pass
class STRBi12(Mem) : pass
class STRBrs(Mem) : pass
class STRD(Mem) : pass
class STRD_POST(Mem) : pass
class STRD_PRE(Mem) : pass
class STREX(Mem) : pass
class STREXB(Mem) : pass
class STREXD(Mem) : pass
class STREXH(Mem) : pass
class STRH(Mem) : pass
class STRHTr(Mem) : pass
class STRH_POST(Mem) : pass
class STRH_PRE(Mem) : pass
class STRT_POST_REG(Mem) : pass
class STR_POST_IMM(Mem) : pass
class STR_POST_REG(Mem) : pass
class STR_PRE_IMM(Mem) : pass
class STR_PRE_REG(Mem) : pass
class STRi12(Mem) : pass
def loads(s):
    """Evaluate the string ``s`` as a Python expression and return the result.

    The expression is evaluated in this module's namespace, so the class
    names defined in this module are resolvable inside ``s``.
    """
    # NOTE(review): eval() executes arbitrary code contained in ``s``.
    # Only pass strings from a trusted producer; consider an eval-free
    # parser if ``s`` can ever come from an untrusted source.
    return eval(s)
class Arg(Term) :
    """Arg(id,attrs,lhs,rhs,intent=None) - a subroutine argument"""

    @property
    def var(self) :
        """a variable associated with the argument, e.g.,

        >>> main = proj.subs.find('main')
        >>> main.args[0].var.name
        'main_argc'

        """
        return self.arg[2]

    @property
    def exp(self) :
        "a BIL expression associated with the argument"
        return self.arg[3]

    @property
    def intent(self) :
        "an instance of Intent class or None if unknown"
        # The expression used to be evaluated and discarded (no `return`),
        # so the property always produced None even when an intent was given.
        return None if len(self.arg) == 4 else self.arg[4]
class Call(Jmp) :
    """Call(id,attrs,(calee,returns))
    a transfer of control flow to another subroutine

    The jump target is a tuple whose first element is the call
    destination and whose optional second element is the return label.
    """

    @property
    def calee(self) :
        "call destination"
        return self.target[0]

    @property
    def returns(self) :
        "a basic block to which a call will return if ever"
        # Check the arity of the target tuple itself. The previous code
        # measured len(self.target[1]), which raises IndexError when no
        # return label exists and inspects the wrong object when one does.
        return self.target[1] if len(self.target) == 2 else None
class Tid(ADT) :
    """Tid(id,name=None) term unique identifier.

    name is an optional human readable identifier, that
    doesn't affect the identity.

    Identity, ordering and hashing are all based on ``number`` only.
    """

    def __init__(self,*args):
        super(Tid,self).__init__(*args)
        # presumably ADT stores a lone constructor argument unwrapped and
        # several arguments as a tuple -- verify against adt.ADT
        noname = not isinstance(self.arg, tuple)
        self.number = self.arg if noname else self.arg[0]
        self.name = None if noname else self.arg[1]

    def __cmp__(self, other):
        # Python 2 three-way comparison, written without the `cmp` builtin
        # (which does not exist on Python 3).
        return (self.number > other.number) - (self.number < other.number)

    # Python 3 never calls __cmp__, so the rich comparisons are spelled out
    # to keep equality consistent with __hash__ on both interpreters.
    def __eq__(self, other):
        if not isinstance(other, Tid):
            return NotImplemented
        return self.number == other.number

    def __ne__(self, other):
        if not isinstance(other, Tid):
            return NotImplemented
        return self.number != other.number

    def __lt__(self, other):
        if not isinstance(other, Tid):
            return NotImplemented
        return self.number < other.number

    def __le__(self, other):
        if not isinstance(other, Tid):
            return NotImplemented
        return self.number <= other.number

    def __gt__(self, other):
        if not isinstance(other, Tid):
            return NotImplemented
        return self.number > other.number

    def __ge__(self, other):
        if not isinstance(other, Tid):
            return NotImplemented
        return self.number >= other.number

    def __hash__(self):
        return hash(self.number)
def parse_addr(str):
    """Return the integer value of the hexadecimal text before the first ':'.

    Everything from the first colon onwards is ignored; a string without
    a colon is converted as a whole.
    """
    hex_digits, _colon, _ignored = str.partition(':')
    return int(hex_digits, 16)
# Close the per-thread Bap instance when the interpreter exits normally.
atexit.register(del_instance)
# Also close it on SIGTERM. The handler must *call* del_instance; the
# previous lambda only returned the function object, so nothing was cleaned up.
# NOTE(review): installing a handler replaces the default "terminate" action,
# so the process keeps running after cleanup -- confirm this is intended.
signal(SIGTERM, lambda signum, frame: del_instance())
class Resource(object):
    """Lazily loaded server-side resource.

    name  -- key under which the payload is expected in the server reply
    ident -- resource identifier, wrapped into an Id
    bap   -- connection object providing get_resource(ident)
    """

    def __init__(self, name, ident, bap):
        self.ident = Id(ident)
        self.bap = bap
        self.msg = None
        self._name = name

    def load(self):
        """Fetch the resource message once and cache it in self.msg.

        Raises ServerError if the reply carries an 'error' entry and
        RuntimeError if the reply is of an unexpected kind.
        """
        if self.msg is not None:
            return
        msg = self.bap.get_resource(self.ident)
        if self._name not in msg:
            if 'error' in msg:
                # was ServerError(response): `response` is not defined here
                raise ServerError(msg)
            raise RuntimeError("Expected {0} msg but got {1}".format(
                self._name, msg))
        # cache only a validated reply, so a failed load can be retried
        self.msg = msg

    def get(self, child):
        """Return field `child` of the resource payload, or None if absent."""
        self.load()
        return self.msg[self._name].get(child)
class Memory(object):
    """A chunk of memory described by a server message.

    mem is a mapping with 'size', 'addr' and 'links' entries; the bytes
    themselves are read lazily, on first access to the `data` attribute.
    """

    def __init__(self, mem, parent):
        self.parent = parent
        self.size = int(mem['size'])
        self.addr = int(mem['addr'])
        self.links = mem['links']

    def load_data(self):
        """Read self.size bytes through the first 'mmap' link.

        Sets self.data to the bytes read, or to None when none of the
        links uses the 'mmap' scheme.
        """
        from mmap import ACCESS_READ  # the module only imports the mmap class

        for link in self.links:
            url = urlparse(link)
            if url.scheme == 'mmap':
                break
        else:
            self.data = None
            return
        offset = int(parse_qs(url.query)['offset'][0])
        # "rw+b" is rejected by Python 3's open(); the data is only read,
        # so map the file read-only (this also works for read-only files).
        with open(url.path, "rb") as f:
            mm = mmap(f.fileno(), 0, access=ACCESS_READ)
            try:
                mm.seek(offset)
                self.data = mm.read(self.size)
            finally:
                mm.close()

    def __getattr__(self, name):
        # only reached when `name` is not yet an instance attribute
        if name == 'data':
            self.load_data()
            return self.data
        raise AttributeError(name)
def jsons(r, p=0):
    """Yield each JSON value found back to back in ``r.text``.

    r -- an object with a ``text`` attribute holding the response body
    p -- index in the text at which decoding starts

    The generator finishes when no further value can be decoded.
    """
    dec = json.JSONDecoder()
    text = r.text  # read once instead of on every iteration
    end = len(text)
    while True:
        # tolerate whitespace between consecutive documents
        while p < end and text[p].isspace():
            p += 1
        try:
            obj, p = dec.scan_once(text, p)
        except StopIteration:
            # scan_once signals "nothing more to decode" with StopIteration;
            # letting it escape a generator is a RuntimeError under PEP 479.
            return
        yield obj
def setup_progress(totalitems):
    '''
    Generate functions to help track execution progress

    Returns a pair (interval, progress):
      interval()          -> seconds elapsed since the last progress() call
      progress(itemsdone) -> ((percent, done, total), (hours, minutes, seconds))
    '''
    # one-element lists act as mutable cells shared by the closures below
    last_itemsdone = [0]
    last_timedone = [time.time()]
    def s_to_hms(remain_s):
        '''
        Convert seconds to (hours, minutes, seconds)
        '''
        # divmod keeps everything integral; plain `/` is true division on
        # Python 3 and produced fractional hours and a zero minute field.
        remain_m, remain_s = divmod(remain_s, 60)
        remain_h, remain_m = divmod(remain_m, 60)
        return remain_h, remain_m, remain_s
    def progress(itemsdone):
        '''
        Convert itemsdone of totalitems into tuple with elements:
        1. tuple describing progress in units: (done/total, done, total)
        2. remaining time from s_to_hms()
        '''
        itemprogress = (100.0*itemsdone/totalitems, itemsdone, totalitems)
        itemsleft = totalitems - itemsdone
        idelta = itemsdone - last_itemsdone[0]
        last_itemsdone[0] = itemsdone
        timedone = time.time()
        tdelta = timedone - last_timedone[0]
        last_timedone[0] = timedone
        if idelta > 0:
            # extrapolate from the rate observed since the previous call
            s_per = tdelta / idelta
            remain_s = int(itemsleft * s_per)
            return itemprogress, s_to_hms(remain_s)
        # no items processed since the last call: remaining time unknown
        return itemprogress, (-1, -1, -1)
    def interval():
        '''
        Return time since last progress() call
        '''
        return time.time() - last_timedone[0]
    return interval, progress
108 | # or could be a real quote after escaped slash 109 | # count slashes going back 110 | k = endpos - 2 111 | while k >= 0 and in_s[k] == '\\': 112 | k -= 1 113 | slashes = (endpos - 1) - k 114 | if slashes % 2 == 0: # this is really an ending double quote 115 | break 116 | # otherwise it's not 117 | continue 118 | break 119 | k = stk[-1] 120 | assert all((in_s[_k] in (' ', '\t', '\n') for _k in range(k, i))), \ 121 | 'pre quote is not whitespace at [%d..%d)' % (k, i) 122 | if sys.version_info > (3,): 123 | # need to use unicode_escape of a bytes, but have a str 124 | parent = objs[k] = (in_s[i+1:endpos]).encode('utf-8').decode('unicode_escape') 125 | else: 126 | parent = objs[k] = in_s[i+1:endpos].decode('string_escape') 127 | ## try added new item to parent 128 | _try_update_parent(parent, objs, stk) 129 | # next obj 130 | i = endpos+1 131 | stk.append(i) 132 | objs[i] = {} 133 | return i 134 | 135 | def _parse_finished(in_c, in_s, i, objs, stk): 136 | del in_c # unused 137 | # close an int, or make sure top object is empty and pop/return 138 | k = stk.pop() 139 | top = objs[k] 140 | del objs[k] # remove from hash 141 | if top: # must be an int 142 | assert isinstance(top, dict) 143 | if top.get('typ', None) != 'd': 144 | raise ParserInputError('Incomplete input stream') 145 | try: 146 | objs[k] = toint(in_s, k, i) 147 | except ValueError: 148 | raise ParserInputError("Integer expected between [%d..%d)" % (k, i)) 149 | # push it back 150 | stk.append(k) # this is unlikely so put the extra work here 151 | return 152 | 153 | def _parse_end(in_c, in_s, i, objs, stk): 154 | if 'typedb' not in globals(): # first time through this function 155 | # Need access to bap.bir namespace, but avoid circular import 156 | global bir # pylint: disable=global-variable-not-assigned,invalid-name 157 | from .bap import bir 158 | # potential optimization 159 | # define the typedb to optimize 160 | # global typedb # pylint: disable=global-variable-undefined,invalid-name 161 | # 
typedb = {} 162 | # pop last object 163 | k = stk.pop() 164 | top = objs[k] 165 | del objs[k] # remove from hash 166 | # look at parent 167 | if not stk: 168 | raise ParserInputError('Mismatched input stream') 169 | j = stk[-1] 170 | parent = objs[j] 171 | ptyp = parent['typ'] 172 | assert isinstance(parent, dict) 173 | assert parent, 'parent is empty' 174 | assert ptyp != 'int', 'parent wrong type: %r' % (parent['typ']) 175 | assert 'children' in parent 176 | if top: # add to parent if non empty 177 | # make real int before appending 178 | if top['typ'] == 'd': # int 179 | try: 180 | base = 16 if ptyp in BROKEN_TYPES else 10 181 | top = toint(in_s, k, i, base) 182 | except ValueError: 183 | raise ParserInputError("Integer expected between [%d..%d)" % (k, i)) 184 | parent['children'].append(top) 185 | if in_c == ',': # add blank object and move on 186 | # next obj 187 | i = i+1 188 | stk.append(i) 189 | objs[i] = {} 190 | return i 191 | else: # we are ending a tuple/list/app do it 192 | # maybe handle apply (num and seq are earlier) 193 | if ptyp == '[': 194 | if in_c != ']': 195 | raise ParserInputError('close %r and open %r mismatch' % (in_c, ptyp)) 196 | parent = objs[j] = parent.get('children', []) # pylint: disable=redefined-variable-type 197 | elif ptyp == '(': 198 | if in_c != ')': 199 | raise ParserInputError('close %r and open %r mismatch' % (in_c, ptyp)) 200 | parent = objs[j] = tuple(parent.get('children', ())) # pylint: disable=redefined-variable-type 201 | else: 202 | name = ptyp 203 | # potential optimization 204 | # if name not in typedb: 205 | # typedb[name] = getattr(bir, name) 206 | # parent = objs[j] = typedb[name](*parent.get('children', ())) # pylint: disable=redefined-variable-type 207 | parent = objs[j] = getattr(bir, name)(*parent.get('children', ())) # pylint: disable=redefined-variable-type 208 | # now add to parent if exists 209 | _try_update_parent(parent, objs, stk) 210 | # next obj 211 | i = i+1 212 | stk.append(i) 213 | objs[i] = {} 
214 | return i 215 | 216 | def _parse_start(in_c, in_s, i, objs, stk): 217 | k = stk[-1] 218 | top = objs[k] 219 | if top: # not empty means app 220 | name_start = top['start'] # avoids whitespace issue 221 | name = in_s[name_start:i] # could just strip? 222 | top['typ'] = name 223 | else: 224 | top['typ'] = in_c # list or tuple 225 | top['children'] = [] 226 | # next obj 227 | i = i+1 228 | stk.append(i) 229 | objs[i] = {} 230 | return i 231 | 232 | def _parse_any(in_c, in_s, i, objs, stk): 233 | del in_s # unused 234 | # look at top to determine type 235 | top = objs[stk[-1]] 236 | if not top: # empty, so need to make type choice between int and app 237 | if in_c.isdigit(): 238 | top['typ'] = 'd' 239 | elif in_c in (' ', "\t", "\n"): # ignore whitespace 240 | pass # no setting, skipping whitespace 241 | else: 242 | top['typ'] = 'a' 243 | top['start'] = i # needed since whitespace might make the stack index off 244 | else: 245 | pass # type choice is already made and this char is not interesting 246 | i = i + 1 # keep going! 
def _parser(in_s, logger=None):
    '''
    Main no-eval parser implementation

    Walks in_s one character at a time. Each character is handed to the
    handler registered for it in _parse_functions (falling back to
    _parse_any), and the handler returns the next position to examine.
    A character always belongs to the object on top of the stack; when it
    ends that object, the handler pushes a new empty one.
    '''
    pos = 0
    total = len(in_s)
    stk = [0] # start with 'top' position in stack
    objs = {0: {}} # start with blank object
    interval_check, get_progress = setup_progress(total)
    while pos <= total:
        if logger is not None and interval_check() > 5:
            progress, remaining = get_progress(pos)
            logger.info("progress: %0.2f%% : %10d of %d" % progress)
            logger.info("remaining: %02d:%02d:%02d" % remaining)
        if pos >= total:
            # one step past the last character: close whatever is pending
            _parse_finished(in_c, in_s, pos, objs, stk)
            break
        in_c = in_s[pos]
        handler = _parse_functions.get(in_c, _parse_any)
        pos = handler(in_c, in_s, pos, objs, stk)
    assert len(stk) == 1
    assert stk[0] == 0
    assert 0 in objs
    result = objs[0]
    if isinstance(result, dict):
        # a dict is still an unfinished placeholder, not a parsed value
        raise ParserInputError('Incomplete input string')
    return result
306 | def parser(input_str, disable_gc=False, logger=None): 307 | ''' 308 | Entrypoint to optimized adt parser. 309 | Input: string (non-empty) 310 | Output: Python object equivalent to eval(input_str) in the context bap.bir 311 | 312 | Options: disable_gc: if true, no garbage collection is done while parsing 313 | 314 | Notes: Expects a well formatted (ie. balanced) string with caveats: 315 | Only contains string representations of tuples, lists, integers, and 316 | function calls with name such that bap.bir.hasattr(name) is true. 317 | Integers may start with '0x' for base 16, otherwise base 10 is assumed. 318 | Strings must start and end with double-quote and not contain a 319 | double-quote, not even an escaped one 320 | ''' 321 | # _parser expects a str 322 | if not isinstance(input_str, str): 323 | input_str = input_str.decode('utf-8') 324 | if input_str == '': 325 | raise ParserInputError("ADT Parser called on empty string") 326 | if disable_gc: 327 | gc.disable() # disable for better timing consistency during testing 328 | result = _parser(input_str, logger=logger) 329 | if disable_gc: 330 | gc.enable() 331 | gc.collect() # force garbage collection to reclaim memory before we leave 332 | return result 333 | 334 | EVALFREE_ADT_PARSER = { 335 | 'format': 'adt', 336 | 'load': parser 337 | } 338 | -------------------------------------------------------------------------------- /tests/test_noeval_parser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Test module for bap.noeval_parser 3 | ''' 4 | # pylint: disable=import-error 5 | import sys 6 | import logging 7 | import bap 8 | from bap.noeval_parser import parser, EVALFREE_ADT_PARSER, ParserInputError, ParserError 9 | 10 | logging.basicConfig(level=logging.DEBUG) 11 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 12 | 13 | def lparser(input_str): 14 | ''' 15 | wrapper for lparser under test so we can pass a logger in just one 16 | spot 17 | 
''' 18 | return parser(input_str, logger=logger) 19 | 20 | EVALFREE_ADT_PARSER['load'] = lparser # override with wrapper so we have logging 21 | 22 | def test_parser_1(): 23 | # pylint: disable=missing-docstring,invalid-name 24 | s = '()' 25 | tok = lparser(s) 26 | assert tok == () 27 | 28 | def test_parser_2(): 29 | # pylint: disable=missing-docstring,invalid-name 30 | s = '(())' 31 | tok = lparser(s) 32 | assert tok == ((),) 33 | 34 | def test_parser_3(): 35 | # pylint: disable=missing-docstring,invalid-name 36 | s = '((),)' 37 | tok = lparser(s) 38 | assert tok == ((),) 39 | 40 | def test_parser_4(): 41 | # pylint: disable=missing-docstring,invalid-name 42 | s = '([],)' 43 | tok = lparser(s) 44 | assert tok == ([],) 45 | 46 | def test_parser_5(): 47 | # pylint: disable=missing-docstring,invalid-name 48 | s = '([1],)' 49 | tok = lparser(s) 50 | assert tok == ([1],) 51 | 52 | def test_parser_6(): 53 | # pylint: disable=missing-docstring,invalid-name 54 | def hello(x): 55 | assert x == [1] 56 | return 'hi' 57 | bap.bir.hello = hello # hack to test function applications 58 | try: 59 | s = 'hello([1],)' 60 | tok = lparser(s) 61 | assert tok == 'hi' 62 | finally: 63 | del bap.bir.hello 64 | 65 | def test_parser_7(): 66 | # pylint: disable=missing-docstring,invalid-name 67 | s = '("abc")' 68 | tok = lparser(s) 69 | assert tok == ("abc",) 70 | 71 | def test_parser_8(): 72 | # pylint: disable=missing-docstring,invalid-name 73 | def hello(x): 74 | assert x == [1] 75 | return 'hi' 76 | bap.bir.hello = hello 77 | s = '( "abc")' 78 | tok = lparser(s) 79 | assert tok == ("abc",) 80 | 81 | def test_parser_9(): 82 | # pylint: disable=missing-docstring,invalid-name 83 | s = r'"\""' 84 | tok = lparser(s) 85 | assert tok == '"' 86 | 87 | def test_parser_10(): 88 | # pylint: disable=missing-docstring,invalid-name 89 | s = '"\\\\"' 90 | assert eval(s) == '\\' # pylint: disable=eval-used 91 | tok = lparser(s) 92 | assert tok == '\\' 93 | 94 | def test_parser_12(): 95 | # pylint: 
disable=missing-docstring,invalid-name 96 | s = r'"\\\""' 97 | assert eval(s) == '\\"' # pylint: disable=eval-used 98 | tok = lparser(s) 99 | assert tok == '\\"' 100 | 101 | def test_parser_11(): 102 | # pylint: disable=missing-docstring,invalid-name 103 | s = r'"\'"' 104 | tok = lparser(s) 105 | assert tok == "'" 106 | 107 | def test_compare_to_old_escapes_1(tmpdir): 108 | # pylint: disable=missing-docstring,invalid-name 109 | import os 110 | tmpdir.join('test.c').write('int main() { return 0; }') 111 | with tmpdir.as_cwd(): 112 | assert os.system('gcc -o test.out test.c') == 0 113 | comment = r'a slash: \\' 114 | main([None, 'test.out'], extras=([ 115 | '--map-terms-with', 116 | '((true) (comment "{}"))'.format(comment), 117 | '--map-terms'],)) 118 | main([None, 'test.out', 'skip'], extras=([ 119 | '--map-terms-with', 120 | '((true) (comment "{}"))'.format(comment), 121 | '--map-terms'],)) 122 | 123 | def test_compare_to_old_escapes_2(tmpdir): 124 | # pylint: disable=missing-docstring,invalid-name 125 | import os 126 | tmpdir.join('test.c').write('int main() { return 0; }') 127 | with tmpdir.as_cwd(): 128 | assert os.system('gcc -o test.out test.c') == 0 129 | comment = r'an escaped quote: \"' 130 | main([None, 'test.out'], extras=([ 131 | '--map-terms-with', 132 | '((true) (comment "{}"))'.format(comment), 133 | '--map-terms'],)) 134 | main([None, 'test.out', 'skip'], extras=([ 135 | '--map-terms-with', 136 | '((true) (comment "{}"))'.format(comment), 137 | '--map-terms'],)) 138 | 139 | def test_compare_to_old_escapes_3(tmpdir): 140 | # pylint: disable=missing-docstring,invalid-name 141 | import os 142 | tmpdir.join('test.c').write('int main() { return 0; }') 143 | with tmpdir.as_cwd(): 144 | assert os.system('gcc -o test.out test.c') == 0 145 | comment = r'an escaped slash and then escaped quote: \\\"' 146 | main([None, 'test.out'], extras=([ 147 | '--map-terms-with', 148 | '((true) (comment "{}"))'.format(comment), 149 | '--map-terms'],)) 150 | main([None, 
'test.out', 'skip'], extras=([ 151 | '--map-terms-with', 152 | '((true) (comment "{}"))'.format(comment), 153 | '--map-terms'],)) 154 | 155 | def test_compare_to_old_escapes_4(tmpdir): 156 | # pylint: disable=missing-docstring,invalid-name 157 | comment = r'an escaped slash and then escaped quote: \\\"' 158 | import os 159 | tmpdir.join('test.c').write('int main() { return 0; }') 160 | comment_file = tmpdir.join('comment.scm') 161 | comment_file.write('((true) (comment "{}"))'.format(comment)) 162 | with tmpdir.as_cwd(): 163 | assert os.system('gcc -o test.out test.c') == 0 164 | main([None, 'test.out'], extras=([ 165 | '--map-terms-using=%s' % comment_file, 166 | '--map-terms'],)) 167 | main([None, 'test.out', 'skip'], extras=([ 168 | '--map-terms-using=%s' % comment_file, 169 | '--map-terms'],)) 170 | 171 | def test_parser_badinput_1(): 172 | # pylint: disable=missing-docstring,invalid-name 173 | with pytest.raises(ParserInputError): 174 | lparser('a') 175 | 176 | def test_parser_badinput_2(): 177 | # pylint: disable=missing-docstring,invalid-name 178 | with pytest.raises(ParserInputError): 179 | lparser('(') 180 | 181 | def test_parser_badinput_3(): 182 | # pylint: disable=missing-docstring,invalid-name 183 | with pytest.raises(ParserInputError): 184 | lparser(')') 185 | 186 | def test_parser_badinput_4(): 187 | # pylint: disable=missing-docstring,invalid-name 188 | with pytest.raises(ParserInputError): 189 | lparser('') 190 | 191 | def test_parser_badinput_5(): 192 | # pylint: disable=missing-docstring,invalid-name 193 | with pytest.raises(ParserInputError): 194 | lparser(',') 195 | 196 | def test_parser_badinput_6(): 197 | # pylint: disable=missing-docstring,invalid-name 198 | with pytest.raises(ParserInputError): 199 | lparser('1a2') 200 | 201 | def test_parser_badinput_7(): 202 | # pylint: disable=missing-docstring,invalid-name 203 | with pytest.raises(ParserInputError): 204 | lparser('(]') 205 | 206 | def test_parser_badinput_8(): 207 | # pylint: 
disable=missing-docstring,invalid-name 208 | with pytest.raises(ParserInputError): 209 | lparser('[)') 210 | 211 | def test_big_1(): 212 | # pylint: disable=missing-docstring,invalid-name 213 | n = 1000 214 | hard_to_eval = '('*n + '0,' + ')'*n 215 | try: 216 | eval(hard_to_eval) # pylint: disable=eval-used 217 | assert False, 'expected MemoryError' 218 | except MemoryError: 219 | pass # expected 220 | result = lparser(hard_to_eval) 221 | # try to verify structure 222 | i = 0 223 | while i < n-1: 224 | i += 1 225 | assert isinstance(result, tuple) 226 | # assert len(list(result)) == 0 # this hits same MemoryError 227 | assert result[0] is result[-1] # this test is equivalent I think 228 | result = result[0] 229 | assert isinstance(result, tuple) 230 | assert len(result) == 1 231 | assert result == (0,) 232 | 233 | def test_compare_to_old_1(tmpdir): 234 | # pylint: disable=missing-docstring,invalid-name 235 | import os 236 | tmpdir.join('test.c').write('int main() { return 0; }') 237 | with tmpdir.as_cwd(): 238 | assert os.system('gcc -o test.out test.c') == 0 239 | main([None, 'test.out']) 240 | 241 | def test_compare_to_old_2(tmpdir): 242 | # pylint: disable=missing-docstring,invalid-name 243 | import os 244 | tmpdir.join('test.c').write('int main() { return 0; }') 245 | with tmpdir.as_cwd(): 246 | assert os.system('gcc -o test.out test.c') == 0 247 | main([None, 'test.out', 'skipeval']) 248 | 249 | # NOTE: this should be the last test to avoid memory usage affecting other tests 250 | def test_compare_to_old_verybig(tmpdir): 251 | # pylint: disable=missing-docstring,invalid-name 252 | import os 253 | tmpdir.join('test.c').write('int main() { return 0; }') 254 | with tmpdir.as_cwd(): 255 | assert os.system('gcc -static -o test.out test.c') == 0 256 | main([None, 'test.out', 'skipeval']) 257 | 258 | # Fixed ADT.__repr__ to match bap output to support testing 259 | # Should consider merging this, but breaks compatabilty if anybody relied on 260 | # the str() or 
repr() results on an ADT object 261 | # Also bap seems to be inconsistent with trailing commas in tuples, so not sure 262 | # which one is strictly better 263 | 264 | integer_types = (int, long) if sys.version_info < (3,) else (int,) # pylint: disable=invalid-name 265 | 266 | # this version always has trailing commas in tuples 267 | def ADT_repr1(self): # copied from bap.adt with tweaks. pylint: disable=invalid-name 268 | # pylint: disable=missing-docstring, invalid-name 269 | def qstr(x): 270 | if isinstance(x, integer_types): 271 | return '0x{0:x}'.format(x) 272 | elif isinstance(x, bap.adt.ADT): 273 | return repr(x) 274 | elif isinstance(x, tuple): 275 | return "(" + ",".join(qstr(i) for i in x) + ",)" # always trailing commas 276 | elif isinstance(x, list): 277 | return "[" + ",".join(qstr(i) for i in x) + "]" 278 | else: 279 | return '"' + repr(x)[1:-1] + '"' 280 | def args(): 281 | if isinstance(self.arg, tuple): 282 | return ",".join(qstr(x) for x in self.arg) 283 | else: 284 | return qstr(self.arg) 285 | 286 | return "{0}({1})".format(self.constr, args()) 287 | 288 | # this version never has trailing commas in tuples 289 | def ADT_repr2(self): # copied from bap.adt with tweaks. 
pylint: disable=invalid-name 290 | # pylint: disable=missing-docstring, invalid-name 291 | def qstr(x): 292 | if isinstance(x, integer_types): 293 | return '0x{0:x}'.format(x) 294 | elif isinstance(x, bap.adt.ADT): 295 | return repr(x) 296 | elif isinstance(x, tuple): 297 | return "(" + ",".join(qstr(i) for i in x) + ")" 298 | elif isinstance(x, list): 299 | return "[" + ",".join(qstr(i) for i in x) + "]" 300 | else: 301 | return '"' + repr(x)[1:-1] + '"' 302 | def args(): 303 | if isinstance(self.arg, tuple): 304 | return ",".join(qstr(x) for x in self.arg) 305 | else: 306 | return qstr(self.arg) 307 | 308 | return "{0}({1})".format(self.constr, args()) 309 | 310 | 311 | def conv(s, i, mayint=True): # pylint: disable=invalid-name 312 | '''helper function for comparing bap string output and the __repr__ of 313 | ADT objects 314 | ''' 315 | if s[i] == ' ' and s[i-1] == ',': # skip whitespace after comma 316 | j = i+1 317 | while s[j] == ' ': 318 | j += 1 319 | return conv(s, j) 320 | elif s[i] == '\\': # handle escaped values 321 | if s[i+1] == 'x': 322 | assert s[i+2] in '0123456789abcdef' 323 | assert s[i+3] in '0123456789abcdef' 324 | return chr(int(s[i+2:i+4], 16)), i+4 325 | else: 326 | return eval('"' + s[i:i+2] + '"'), i+2 # pylint: disable=eval-used 327 | elif mayint and s[i:i+2] == '0x': # try to normalize integers in hex representation 328 | j = i + 2 329 | while s[j] in '0123456789abcdef': 330 | j += 1 331 | if j == (i + 2): # not really a hex integer expression 332 | return s[i], i+1 333 | return int(s[i+2:j], 16), j # NOTE: returning int not char 334 | else: 335 | return s[i], i+1 336 | 337 | def get_proj_strs(proj): 338 | ''' 339 | Returns results of repr(proj) with various bap.adt.ADT.__repr__ 340 | implementations 341 | 342 | Uses ADT_repr1 and ADT_repr2 as neccessary based on Python version 343 | ''' 344 | astr0 = repr(proj) # get string represtation 345 | orig_ADT_repr = bap.adt.ADT.__repr__ # pylint: disable=invalid-name 346 | try: 347 | if True: 
348 | # if sys.version_info < (3,): 349 | bap.adt.ADT.__repr__ = ADT_repr1 # Monkey patch in ADT_repr1 350 | astr1 = repr(proj) # get string represtation 351 | if True: 352 | # if sys.version_info < (3,): 353 | bap.adt.ADT.__repr__ = ADT_repr2 # Monkey patch in ADT_repr2 354 | astr2 = repr(proj) # get string represtation 355 | finally: 356 | bap.adt.ADT.__repr__ = orig_ADT_repr # fix before leaving 357 | 358 | return astr0, astr1, astr2 359 | 360 | def _compare_proj_str(estr, possible_actual_strs): 361 | ''' 362 | Compare string output from bap with (normalized) repr() of the project 363 | created with the eval-free parser 364 | 365 | Comparison is unfortunately complex. We need to compare varying 366 | representations without resorting to eval otherwise we hit the same bug 367 | the eval-free parser is trying to fix. 368 | ''' 369 | exceptions = [] 370 | for aidx, astr in enumerate(possible_actual_strs): # so we can try both ADT_repr implementations 371 | try: 372 | i = 0 373 | j = 0 374 | a_len = len(astr) 375 | e_len = len(estr) 376 | 377 | while i < a_len and j < e_len: 378 | achar, i_new = conv(astr, i) 379 | echar, j_new = conv(estr, j) 380 | if achar == echar: 381 | i = i_new 382 | j = j_new 383 | continue 384 | else: 385 | if estr[j] == '\\': # try the simple version of achar 386 | achar_new, i_new_new = astr[i], i+1 387 | if achar_new == echar: 388 | i = i_new_new 389 | j = j_new 390 | continue 391 | if isinstance(achar, integer_types) and not isinstance(echar, integer_types): 392 | # convert echar and compare 393 | k = j+1 394 | while estr[k] in '0123456789': 395 | k += 1 396 | try: 397 | eint = int(estr[j:k]) 398 | info = 'int mismatch at i=%d j=%d %d!=%d' % (i, j, 399 | achar, 400 | eint) 401 | assert achar == eint, info 402 | j = k 403 | i = i_new 404 | continue 405 | except (ValueError, AssertionError): 406 | # couldnt convert to int, or they dont match 407 | # try non-integer version 408 | achar, i_new = conv(astr, i, mayint=False) 409 | if achar == 
echar: 410 | i = i_new 411 | j = j_new 412 | continue 413 | if astr[i] == ',': # try again but "no-comma" ADT_repr 414 | break # while and go on to next astr option 415 | info = '' 416 | info += "proj failed at index i=%d j=%d\n" % (i, j) 417 | if i >= 20: 418 | info += "astr = %s\n%s\n" % (astr[i-20:i+10], '-'*(7+20)+'^') 419 | else: 420 | info += "astr = %s\n%s\n" % (astr[0:i+10], '-'*(i+7)+'^') 421 | if j >= 20: 422 | info += "estr = %s\n%s\n" % (estr[j-20:j+10], '-'*(7+20)+'^') 423 | else: 424 | info += "estr = %s\n%s\n" % (estr[0:j+10], '-'*(j+7)+'^') 425 | assert False, info 426 | break # done ok! 427 | except Exception as exc: # pylint: disable=broad-except 428 | exceptions.append((exc, sys.exc_info())) 429 | if (aidx+1) == len(possible_actual_strs): # then we're on last one so raise all 430 | # if all the exceptions were the same, just reraise this one 431 | set_of = set((str(e) for (e, _) in exceptions)) 432 | if len(set_of) == 1: 433 | # raise 434 | assert False, exceptions 435 | # otherwise assert False with all of them 436 | assert False, exceptions 437 | 438 | 439 | def main(argv=None, debugging=False, extras=()): 440 | ''' 441 | Main entry point, allows quick comparison of eval-based adt parser with this 442 | eval-free adt parser. 443 | 444 | Done by parsing, then comparing objects with ==. 445 | 446 | Also converts objects to strings for char-by-char comparison if the objects 447 | don't match, or the eval version can/should not be used. 448 | ''' 449 | import os # this is one of the few test functions needing this module 450 | 451 | # setup parser struct that uses eval. 
Do this explicitly so tests always 452 | # compare against an eval version, even after the code is (hopefully) merged 453 | witheval_adt_parser = { 454 | 'format': 'adt', 455 | 'load': lambda s: eval(s, bap.bir.__dict__) # pylint: disable=eval-used 456 | } 457 | 458 | if argv is None: 459 | argv = sys.argv 460 | toparse = argv[1] 461 | if not debugging: 462 | debugging = len(argv) > 3 463 | logger.debug("debugging = %s", debugging) 464 | 465 | if debugging and os.path.exists('estr.txt'): # optional optimize 466 | logger.debug('loading estr.txt') 467 | with open('estr.txt') as fobj: 468 | estr = fobj.read() 469 | else: 470 | skipeval = len(argv) > 2 471 | if skipeval: 472 | logger.info("Calling bap.run(%r, parser=PASSTHRU)", toparse) 473 | projtxt = bap.run(toparse, *extras, parser={'format':'adt', 'load':lambda s: s}) 474 | if not isinstance(projtxt, str): # on python3 projtxt is bytes not str 475 | estr = projtxt.decode('utf-8') 476 | else: 477 | estr = str(projtxt) # pylint: disable=redefined-variable-type 478 | # normalize white space in input 479 | estr = estr.replace("\n", "") 480 | # normalize strings in input 481 | else: 482 | logger.info("Calling bap.run(%r, parser=WITHEVAL)", toparse) 483 | origproj = bap.run(toparse, *extras, parser=witheval_adt_parser) 484 | 485 | # make sure to do this here not before calling bap the first time 486 | # Once this runs, if a lot of memory is used, Python can't create 487 | # child processes in all cases because os.fork() will fail under heavy 488 | # memory load 489 | logger.info("Calling bap.run(%r, parser=EVALFREE)", toparse) 490 | new_proj = bap.run(toparse, *extras, parser=EVALFREE_ADT_PARSER) 491 | 492 | if not skipeval: 493 | if origproj == new_proj: # done! 
494 | return 495 | estr = str(origproj) 496 | 497 | if debugging and all(( # optionally optimize to test faster 498 | os.path.exists('/tmp/astr0.txt'), 499 | os.path.exists('/tmp/astr1.txt'), 500 | os.path.exists('/tmp/astr2.txt'))): 501 | logger.debug('loading astr0.txt') 502 | with open('/tmp/astr0.txt') as fobj: 503 | astr0 = fobj.read() 504 | logger.debug('loading astr1.txt') 505 | with open('/tmp/astr1.txt') as fobj: 506 | astr1 = fobj.read() 507 | logger.debug('loading astr2.txt') 508 | with open('/tmp/astr2.txt') as fobj: 509 | astr2 = fobj.read() 510 | else: # normal test path 511 | if 'new_proj' not in locals(): # since we may have optimized it out 512 | logger.info("Calling bap.run(%r, parser=EVALFREE)", toparse) 513 | new_proj = bap.run(toparse, parser=EVALFREE_ADT_PARSER) 514 | 515 | astr0, astr1, astr2 = get_proj_strs(new_proj) 516 | 517 | if debugging: # save for manual inspection 518 | with open('/tmp/astr0.txt', 'w') as fobj: 519 | fobj.write(astr1) 520 | with open('/tmp/astr1.txt', 'w') as fobj: 521 | fobj.write(astr1) 522 | with open('/tmp/astr2.txt', 'w') as fobj: 523 | fobj.write(astr2) 524 | with open('/tmp/estr.txt', 'w') as fobj: 525 | fobj.write(estr) 526 | 527 | _compare_proj_str(estr, (astr0, astr1, astr2)) 528 | 529 | 530 | try: 531 | import pytest # pylint: disable=wrong-import-position 532 | HAVE_PYTEST = True 533 | except ImportError: 534 | HAVE_PYTEST = False 535 | 536 | if HAVE_PYTEST: 537 | # mark the slow ones as 'slow' 538 | # Run pytest with '--slow' to also run the slow tests 539 | test_compare_to_old_verybig = pytest.mark.slow(test_compare_to_old_verybig) # pylint: disable=invalid-name 540 | 541 | if __name__ == '__main__': 542 | main() 543 | 544 | -------------------------------------------------------------------------------- /src/bap/adt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Algebraic Data Types for Python. 
3 | 4 | Algebraic Data Types is not an attempt to add a strict typing 5 | discipline to Python, and the word ``type'' here has a much broader 6 | meaning. Types represent models of reasoning about objects. These are 7 | models we, humans, employ every day (at least those of us, who do the 8 | thinking). These are just methods (among others) that we're using to 9 | structure our knowledge. For example, we can say, that both 10 | ``Bananas`` and ``Apples`` are ``Fruits`` (biologists, please stop 11 | reading at this point). With this phrase we constructively defined a 12 | new type (concept, idea), that we named the ``Fruit``. To contrast 13 | with abstraction, we didn't try to find anything common between these 14 | two entities, and to remove the differences, we just stated, that the 15 | Fruit is either Banana or Apple. No more, no less. We just used an 16 | alternation to define something. Another example of the alternation 17 | would be to say, that a human is either a man or woman. 18 | 19 | If we reason about types as sets, then the alternation can be 20 | viewed as a union. A disjoint union in our case, as we're not losing 21 | any information (we are not abstracting anything out). The union 22 | operation is isomorphic to the summation in arithmetic, that's why we 23 | call such types - sum types. A dual of the sum is a product. The 24 | product models an idea of a composition, i.e., when an entity is 25 | composed of other entities. For example, a ``Bicycle`` is a 26 | combination of ``Wheels``, ``Frame`` and ``Handlebars``. And a ``Car`` 27 | is a combination of ``Wheels``, ``Body``, ``Doors``, and 28 | ``Engine``. Again, we described concepts constructively, we didn't try 29 | to make any abstractions. (In fact, we employed an abstraction, when 30 | we made a choice how to represent the compound object, by omitting 31 | parts that are not relevant, with respect to our task. 
But this is a 32 | completely different modus of reasoning, that is in fact orthogonal to 33 | ADT). 34 | 35 | Finally, we can mix both concepts together to model even more complex 36 | ideas. For example, we can define that a ``Vehicle`` is either a 37 | ``Car`` or ``Bicycle``. Suppose, that we're trying to model road 38 | traffic. In that case we can tell that we have two kinds of road 39 | users, either a ``Motorist`` that is a combination of a ``Car``, 40 | ``Driver``, ``Passengers`` and ``Luggage``, and a ``Bicyclist`` that 41 | is a composition of ``Bicycle`` and the ``Driver``. You may see, that 42 | we apply the sum and product recursively, that's why the ADT types are 43 | also called recursive types. The same way as you can build complex 44 | algebraic expressions using sum and product, we can build complex data 45 | using a combination of sum and product. The whole set of algebraic 46 | data types is a closure of sum and product operations. 47 | 48 | We can define such complex concepts as lists, tables, trees and, even, 49 | natural numbers, using only ADT. For example, a list is either Empty, 50 | or it is a Pair of an element and the rest of a List (note that since 51 | the type is recursive, we can use the type in its own definition). For 52 | example, ``[1,2,3]`` can be represented as 53 | ``Pair(1,Pair(2,Pair(3,Empty())))``. A Natural number is either Zero 54 | or a Successor of a Natural number, so that we can represent 3 as 55 | ``Successor(Successor(Successor(Zero())))``. So, we don't even need 56 | numerals, to represent the list [1,2,3]: 57 | 58 | ``` 59 | Pair(Successor(Zero()), 60 | Pair(Successor(Successor(Zero())), 61 | Pair(Successor(Successor(Successor(Zero()))), 62 | Empty()))) 63 | ``` 64 | 65 | You may notice, that these examples are actually syntactically valid 66 | Python code. So we're now close to the point, where we can define, 67 | how we will represent ADT in Python. 
It is believed, that Python 68 | doesn't support ADT (at least it is not listed in wikipedia as one of 69 | such languages), but as examples above show, this is not true. 70 | 71 | We will use inheritance to represent sum types. For example to say, that 72 | Fruit is Banana or Apple, we do the following: 73 | 74 | class Fruit(ADT): pass 75 | class Banana(Fruit): pass 76 | class Apple(Fruit): pass 77 | 78 | 79 | The product types, aka tuples, are already in the language, so we're 80 | done. We will use the following syntax, to say that a Bicycle is a 81 | product of Wheels, Frame and Handlebars: 82 | 83 | class Bicycle(ADT) : pass 84 | class Wheels(ADT) : pass 85 | class Frame(ADT) : pass 86 | class Handlebars(ADT) : pass 87 | 88 | Bicycle(Wheels(), Frame(), Handlebars()) 89 | 90 | We're not trying to enforce the type discipline here, by guaranteeing, 91 | that it is possible to construct a Bicycle only from these three 92 | things. This is Python anyway. 93 | 94 | So, it looks like that we didn't introduce anything at all, other than 95 | extra verbose syntax, hidden by some type theoretic mumbo jumbo. Well 96 | yes, but this is only on the surface. The idea behind this library is 97 | that ADT is a great generalization, which we can employ to write code, 98 | that will work for any ADT. 99 | 100 | The first generalization, is that we can easily print any ADT in a 101 | unified syntax, and this syntax can be chosen to be a valid subset of 102 | Python syntax. In fact it is also a valid subset of many other 103 | programming languages, such as Ruby, JavaScript, Java, C, OCaml, 104 | Haskell, etc. That also means, that we can easily parse them back, 105 | especially if the language provides access to the parser (like 106 | Python). Thus, ADT is a nice data representation format (like json, 107 | xml, S-expressions), that is very suitable for storing hierarchical data. 
108 | 109 | The second generalization, is that we can employ the same method of 110 | processing ADT. A usual way of processing lists and other iterable 111 | objects, is to apply some operation over every consecutive element of 112 | the list. ADT are more general, than lists (in fact lists are a special 113 | case of ADT). ADT are hierarchical, so the elements have also 114 | ancestor/descendant relationships in addition to the 115 | successor/predecessor. Also, every element of an ADT value, is tagged 116 | by a name. And these names also form a separate type hierarchy, so 117 | that we have both object and type hierarchies. Given such a general 118 | structure, we need to find a general way of iteration over it. We will 119 | call it visiting. So visiting is a generalization of an iteration, 120 | where the computation is represented by an object called Visitor, that 121 | applies itself to each structural element of the ADT object. The 122 | visitor object has a method for each type of structural component, and 123 | thanks to a unified representation of the ADT type, it knows how to 124 | deconstruct any instance of ADT. So, we generalized a way of 125 | traversing a data structure, so that a user of it needs only to specify 126 | the computation, that needs to be applied for each, or some 127 | elements. 128 | 129 | We can compare visiting with a regular iteration over some 130 | hierarchical data structures, like compounds of lists and 131 | maps. Suppose, that we're modeling a library, and started with the 132 | following representation: 133 | 134 | 135 | Library -> Shelf -> Book -> (Author, Title) 136 | 137 | And we wrote a function that will count a total number of distinct authors: 138 | 139 | def count_authors(library): 140 | authors = set() 141 | for shelf in library: 142 | for book in shelf: 143 | authors.add(book.author) 144 | return len(authors) 145 | 146 | The code looks fine, but it has one problem, it hardcodes the 147 | structure of our library. 
If at some point of time we decide, that we 148 | chose a wrong representation and it is much better to represent it as: 149 | 150 | Author -> Title -> Library -> Shelf 151 | 152 | Then we need to rewrite our ``count_authors`` function. On the other 153 | hand, with the visitor approach the following code will work with both 154 | representations. 155 | 156 | 157 | class AuthorCounter(Visitor): 158 | def __init__(self): 159 | self.authors = set() 160 | def visit_Author(self, author): 161 | self.authors.add(author) 162 | 163 | def count_authors(library): 164 | counter = AuthorCounter() 165 | counter.run(library) 166 | return len(counter.authors) 167 | 168 | 169 | This variant is slightly more verbose, but is easier to implement, as 170 | we don't need to know the hierarchical structure of the data, and 171 | anything about the data representation. Moreover, it is easier to 172 | support, as it will not break, when something is added or removed from 173 | the library structure. 174 | 175 | The visitor pattern really starts to shine, when the hierarchy is much 176 | more complex, than in the example, that we provided above. For 177 | example, Abstract Syntax Trees (AST) tend to be very complex even for 178 | toy languages, and writing the traversing code for them is very 179 | tedious. Moreover, the code needed to be repeated over and over again, 180 | leading to fragile and hard to support programs. 181 | 182 | 183 | """ 184 | 185 | try: 186 | from collections.abc import Iterable,Sequence,Mapping 187 | except ImportError: 188 | from collections import Iterable,Sequence,Mapping 189 | 190 | class ADT(object): 191 | """Algebraic Data Type. 192 | 193 | This is a base class for all ADTs. ADT represented by a tuple of 194 | arguments, stored in a `arg` field. Arguments should be instances 195 | of ADT class, numbers, strings or lists. Empty set of arguments is 196 | permitted. A one-tuple is automatically untupled, i.e., `Int(12)` 197 | has value `12`, not `(12,)`. 
A name of the constructor is stored 198 | in the `constr` field 199 | 200 | A structural comparison is provided. 201 | 202 | """ 203 | def __init__(self, *args): 204 | self.constr = self.__class__.__name__ 205 | self.arg = args if len(args) != 1 else args[0] 206 | 207 | def __cmp__(self,other): 208 | return self.__dict__.__cmp__(other.__dict__) 209 | 210 | def __repr__(self): 211 | def qstr(x): 212 | if isinstance(x, (int)): 213 | return '0x{0:x}'.format(x) 214 | elif isinstance(x, ADT): 215 | return str(x) 216 | elif isinstance(x, tuple): 217 | return "(" + ", ".join(qstr(i) for i in x) + ")" 218 | else: 219 | return '"{0}"'.format(x) 220 | def args(): 221 | if isinstance(self.arg, tuple): 222 | return ", ".join(qstr(x) for x in self.arg) 223 | else: 224 | return qstr(self.arg) 225 | 226 | return "{0}({1})".format(self.constr, args()) 227 | 228 | 229 | class Visitor(object): 230 | """ADT Visitor. 231 | This class helps to perform iterations over arbitrary ADTs. 232 | 233 | 234 | When visitor runs, it will visit each constituent of an ADT. 235 | When an ADT instance is visited, the visitor will first look 236 | for a method named `enter_C` for each class `C` in the MRO of 237 | the ADT instance. All found methods will be invoked. 238 | 239 | Then it will look for a method called `visit_C` for each class `C` 240 | in the MRO sequence of the ADT class. If one is found, 241 | then it will be called, other classes in the MRO sequence will not 242 | be considered. 243 | 244 | Finally, the visitor will look for a method called `leave_C` using 245 | the same algorithm as described for the `enter_C` method. 246 | 247 | The algorithm, described above, actually implements the 248 | depth-first traversal. Methods starting with the prefix `enter` 249 | are called right before the corresponding subtree is visited 250 | (preorder). Methods starting with the `leave` are called just 251 | after the subtree is visited. 
Methods starting with `visit` 252 | actually perform the visiting. If it is not overridden, then 253 | `visit_ADT` method is invoked, that will continue traversal to the 254 | subtree. If `visit_C` method is overridden (where `C` is name of 255 | class in the MRO of the ADT instance), then it is responsibility 256 | of the `visit_C` method to call `run` method to continue 257 | traversal. If `run` is not called, then the traversal will not 258 | continue. It is possible to change the order of traversal, by 259 | overriding `visit` methods. Usually, it is better to keep away 260 | from the `visit` methods, and use `enter` (the preorder traversal) 261 | if possible. However, if it is needed to inject some code between 262 | the traversal of two subtrees of a tree, or if an order should be 263 | changed, then the visit method is a way to go. 264 | 265 | By default, every element of an ADT is traversed. It is possible 266 | to terminate the traversal abnormally (to short-circuit) by 267 | returning not-a-None value from any of the methods. The returned 268 | value will be a result of the `run` method. 269 | 270 | 271 | Example 272 | ------- 273 | 274 | Suppose we have a small expression language with defined as 275 | follows: 276 | 277 | >>> class Exp(ADT) : pass 278 | >>> class Binop(Exp) : pass 279 | >>> class Unop(Exp) : pass 280 | >>> class Value(Exp) : pass 281 | >>> class Add(Binop) : pass 282 | >>> class Mul(Binop) : pass 283 | >>> class Neg(Unop) : pass 284 | >>> class Var(Value) : pass 285 | >>> class Int(Value) : pass 286 | 287 | 288 | We will write an abstract interpreter that will calculate a sign 289 | of expression. In our abstraction, we now a sign of constants, 290 | signs of variables are unknown. The negation operation negates the 291 | sign of expression, and any binary operation preserves the sign, 292 | if both operands have the same sign, otherwise the sign is 293 | undefined. 
We will use the following lattice to represent our 294 | abstraction: 295 | 296 | 297 | True False 298 | | | 299 | +--+--+ 300 | | 301 | None 302 | 303 | The same expressed in Python: 304 | 305 | 306 | >>> class Sign(Visitor) : 307 | def __init__(self): 308 | self.neg = None 309 | 310 | def visit_Binop(self,exp): 311 | self.run(exp.arg[0]) 312 | lhs = self.neg 313 | self.run(exp.arg[1]) 314 | rhs = self.neg 315 | if lhs != rhs: 316 | self.neg = None 317 | 318 | def leave_Neg(self,exp): 319 | if self.neg is not None: 320 | self.neg = not self.neg 321 | 322 | def enter_Var(self,var): 323 | self.neg = None 324 | 325 | def enter_Int(self,n): 326 | self.neg = n < Int(0) 327 | 328 | We overrode method ``visit_Binop`` that will be invoked for both, 329 | addition and subtraction, since in our abstraction they behave the 330 | same. We chose to override the ``visit`` stage instead of the 331 | ``enter`` or leave, because we wanted to inject our code between 332 | visiting left and right branch of the expression. We overrode 333 | `leave_Neg` to switch the sign _after_ the enclosed expression is 334 | visited. Since variable can have arbitrary sign, we're must stop 335 | the sign analysis as soon as we have a variable. Finally, for constants 336 | we just look at their sign. 337 | 338 | 339 | To test our sign analysis let's write a simple expression, 340 | 341 | >>> exp = Add((Neg(Neg(Int(1)))), Mul(Int(2), Neg(Neg(Int(3))))) 342 | 343 | It is easy to see that it is positive (in fact it is not). 
In the 344 | infix notation, the expression corresponds to 345 | 346 | 347 | >>> -(-1) + 2 * -(-3) 348 | 7 349 | 350 | So, let's run the analysis: 351 | 352 | >>> exp = Add((Neg(Neg(Int(1)))), Mul(Int(2), Neg(Neg(Int(3))))) 353 | >>> ai = Sign() 354 | >>> ai.run(exp) 355 | >>> print("exp {0} is {1}".format(exp, 356 | "negative" if ai.neg else 357 | "unknown" if ai.neg is None else 358 | "positive")) 359 | 360 | For an ADT of type C the method `visit_C` is looked up in the 361 | visitors methods dictionary. If it doesn't exist, then `visit_B` is 362 | looked up, where `B` is the base class of `C`. The process continues, 363 | until the method is found. This is guaranteed to terminate, 364 | since visit_ADT method is defined. 365 | 366 | Note: Non ADTs will be silently ignored. 367 | 368 | Once the method is found it is called. It is the method's responsiblity 369 | to recurse into sub-elements, e.g., call run method. 370 | 371 | For example, suppose that we want to count negative values in 372 | some BIL expression: 373 | 374 | class CountNegatives(Visitor): 375 | def __init__(self): 376 | self.neg = False 377 | self.count = 0 378 | 379 | def visit_Int(self, int): 380 | if int.arg < 0 and not self.neg \ 381 | or int.arg > 0 and self.neg: 382 | self.count += 1 383 | 384 | def visit_NEG(self, op): 385 | was = self.neg 386 | self.neg = not was 387 | self.run(op.arg) 388 | self.neg = was 389 | 390 | We need to keep track on the unary negation operator, and, of 391 | course, we need to look for immediates, so we override two methods: 392 | visit_Int for Int constructor and visit_NEG for counting unary minuses. 393 | (Actually we should count for bitwise NOT operation also, since it will 394 | change the sign bit also, but lets forget about it for the matter of the 395 | exercise (and it can be easily fixed just by matching visit_UnOp)). 396 | 397 | When we hit visit_NEG we toggle current sign, storing its previous value 398 | and recurse into the operand. 
After we return from the recursion, we restore 399 | the sign. 400 | 401 | """ 402 | 403 | def visit_ADT(self, adt): 404 | """Default visitor. 405 | 406 | This method will be called for those data types that has 407 | no specific visitors. It will recursively descent into all 408 | ADT values. 409 | """ 410 | if isinstance(adt.arg, tuple): 411 | return self.__induct(adt.arg) 412 | elif isinstance(adt.arg, ADT): 413 | return self.run(adt.arg) 414 | 415 | def __induct(self, xs): 416 | return next((r for r in (self.run(x) for x in xs) if r), None) 417 | 418 | def visit_Seq(self,adt): 419 | """Deconstructs sequences""" 420 | return self.__induct(adt.arg[0]) 421 | 422 | def visit_Map(self,adt): 423 | """Deconstructs maps""" 424 | return self.__induct(adt.arg[0]) 425 | 426 | 427 | def run(self, adt): 428 | """visitor.run(adt) -> result 429 | 430 | """ 431 | if isinstance(adt, ADT): 432 | 433 | for meth in ("enter", "visit", "leave"): 434 | for cls in adt.__class__.mro(): 435 | name = "{0}_{1}".format(meth, cls.__name__) 436 | fn = getattr(self, name, None) 437 | if fn is not None: 438 | r = fn(adt) 439 | if r is not None: 440 | return r 441 | if meth == "visit": 442 | break 443 | 444 | class Seq(ADT,Sequence) : 445 | def __init__(self, *args) : 446 | super(Seq,self).__init__(args) 447 | self.elements = args[0] 448 | 449 | def __getitem__(self,i) : 450 | return self.elements.__getitem__(i) 451 | 452 | def __len__(self) : 453 | return self.elements.__len__() 454 | 455 | def find(self,key, d=None) : 456 | """find(key[, d=None]) -> t 457 | 458 | Looks up for a term that matches with a given key. 459 | 460 | If the key is a string, starting with `@' or `%', then a term 461 | with the given identifier name is returned. Otherwise a term 462 | with a matching `name' attribute is returned (useful to find 463 | subroutines). 464 | 465 | If a key is an instance of Tid class, then a term with 466 | corresponding tid is returned. 
467 | 468 | If a key is a number, or an instance of `bil.Int' class or is 469 | an integer, then a term with a matching address is returned. 470 | 471 | Example 472 | ------- 473 | 474 | In the following example, all searches return the 475 | same object 476 | 477 | 478 | >>> main = proj.program.subs.find('main') 479 | >>> main = proj.program.subs.find(main.id) 480 | >>> main = proj.program.subs.find(main.id.name) 481 | 482 | """ 483 | def by_id(t, k) : return t.id.number == k 484 | def by_name(t,k) : 485 | if k.startswith(('@','%')): 486 | return t.id.name == k 487 | else: 488 | return hasattr(t, 'name') and t.name == k 489 | def by_addr(t,k) : 490 | value = t.attrs.get('address', None) 491 | if value is not None: 492 | return parse_addr(value) == key 493 | 494 | test = by_addr 495 | if isinstance(key,str): 496 | test = by_name 497 | elif hasattr(key,'constr') and key.constr == 'Tid': 498 | key = key.number 499 | test = by_id 500 | elif hasattr(key,'constr') and key.constr == 'Int': 501 | key = key.value 502 | test = by_addr 503 | 504 | for t in self : 505 | if test(t,key) : return t 506 | return d 507 | 508 | 509 | class Map(ADT,Mapping) : 510 | def __init__(self, *args) : 511 | super(Map,self).__init__(args) 512 | self.elements = dict((x.arg[0],x.arg[1]) for x in args[0]) 513 | 514 | def __getitem__(self,i) : 515 | return self.elements.__getitem__(i) 516 | 517 | def __len__(self) : 518 | return self.elements.__len__() 519 | 520 | def __iter__(self) : 521 | return self.elements.__iter__() 522 | 523 | 524 | def visit(visitor, adt): 525 | 526 | if isinstance(adt, Iterable): 527 | for x in adt: 528 | visitor.run(x) 529 | else: 530 | visitor.run(adt) 531 | return visitor 532 | 533 | 534 | 535 | 536 | if __name__ == "__main__": 537 | class Fruit(ADT) : pass 538 | class Bannana(Fruit) : pass 539 | class Apple(Fruit) : pass 540 | 541 | assert(Bannana() == Bannana()) 542 | assert(Bannana() != Apple()) 543 | assert( Apple() < Bannana()) 544 | 
--------------------------------------------------------------------------------