├── LICENSE ├── README.md ├── binnavi_db.py ├── bn_disasm.py ├── elf_loader.py ├── fREedom.py ├── loader.py ├── pe_loader.py ├── postgresql_tables.sql └── x86_disasm.py /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. 
This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | fREedom is a primitive attempt to provide an IDA Pro independent means 2 | of extracting disassembly information from executables for use with 3 | binnavi (https://github.com/google/binnavi). 4 | 5 | WARNING: This software is in its infancy 6 | 7 | Background: binnavi is a graphical "binary navigator" useful for reverse 8 | engineering software. binnavi does not contain its own disassembler, instead 9 | relying upon the capabilities of the commercial disassembler, IDA Pro. 10 | binnavi ships with an IDA plugin that extracts required information from an 11 | existing IDA database into a set of binnavi compatible, Postgres tables. 
The 12 | amount of work that IDA does on behalf of binnavi is not trivial. There is 13 | a reason there are no open source competitors to IDA. Eliminating binnavi's 14 | dependency on IDA is not quite as trivial as slapping some glue code on top 15 | of a disassembly framework like Capstone (http://www.capstone-engine.org/) 16 | and calling it a day. This project takes some small steps in that direction. 17 | It is thrown together, not well thought out, and it has a long way to go. 18 | 19 | Basic use: 20 | * Use the provided postgres script to set up the initial postgres database. 21 | * Configure your postgres instance appropriately (pg_hba.conf ...); a quick connectivity check is shown at the end of this README. 22 | * `python fREedom.py --database=my_binnavi --user=someone --pass=itsasecret --dbhost=127.0.0.1 --binary=foo.exe` 23 | * Launch binnavi to browse foo.exe 24 | 25 | What's here: 26 | * binnavi's postgres script to build the required Postgres database 27 | * Python scripts to extract disassembly information from PE32, PE32+, and ELF binaries containing 28 | x86 or x86_64 code. 29 | 30 | What's not here: 31 | * A Postgres tutorial (see http://www.postgresql.org/). Among other things, 32 | you'll need psycopg2. 33 | * A Capstone installation tutorial (see http://www.capstone-engine.org/) 34 | * Support for anything other than PE32, PE32+, and ELF 35 | * Support for anything other than x86 and x86_64 36 | 37 | Limitations: 38 | * fREedom's disassembly engine is not as thorough as IDA's, lacking many of 39 | the heuristics that IDA uses to identify code. 40 | * There is currently no support for known data types and library function 41 | signatures. binnavi's type system is complex and not well documented. 42 | Substantial effort will be required to process development header files from 43 | many platforms in order to incorporate this information into fREedom-generated 44 | disassemblies. 45 | * Parsers (crude at best) are included for only PE32, PE32+, and ELF. 46 | * Disassembly generators are included for only x86 and x86_64. 47 | * My python skills are not good.
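Before running fREedom.py, it can save some head-scratching to confirm that psycopg2 is installed and that postgres actually accepts the credentials you plan to pass on the command line. A minimal check (the database name and credentials below are the placeholders from the example above; substitute your own):

```python
# Connectivity sanity check -- run this before fREedom.py.
# Assumes the 'my_binnavi' database already exists and that pg_hba.conf
# allows password logins from this host.
import psycopg2

conn = psycopg2.connect("dbname='my_binnavi' user='someone' "
                        "host='127.0.0.1' password='itsasecret'")
print(conn.server_version)   # any number printed means the connection works
conn.close()
```

If this fails, fix the postgres side first; binnavi_db.py builds an equivalent connection string from the same values.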
48 | -------------------------------------------------------------------------------- /binnavi_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | The database interface for a stand-alone binnavi compatible disassembler 5 | ''' 6 | 7 | __author__ = "Chris Eagle" 8 | __copyright__ = "Copyright 2015, Chris Eagle" 9 | __credits__ = ["Chris Eagle"] 10 | __license__ = "GPL" 11 | __version__ = "2.0" 12 | __maintainer__ = "Chris Eagle" 13 | __email__ = "cseagle@gmail.com" 14 | __status__ = "Use at your own risk" 15 | 16 | import sys 17 | import traceback 18 | import struct 19 | import hashlib 20 | import psycopg2 21 | import capstone 22 | import bn_disasm 23 | 24 | FUNCTION_TYPES = {'NORMAL':0, 'LIBRARY':1, 'IMPORTED':2, 'THUNK':3, 'INVALID':4,} 25 | 26 | class binnavi_db(object): 27 | 28 | def __init__(self, db, user, passwd, host='localhost'): 29 | # try: 30 | self.conn = psycopg2.connect("dbname='%s' user='%s' host='%s' password='%s'" % (db, user, host, passwd)) 31 | self.create_empty_tables() 32 | # except Exception, e: 33 | # raise Exception("db connect fail: %s:%s" % (type(e), e.message)) 34 | 35 | def export(self, module_data): 36 | return self.add_module(module_data) 37 | 38 | def add_functions(self, curs, id, module_data): 39 | for addr in module_data.call_targets: 40 | name = module_data.names[addr] 41 | named = not name.startswith('sub_') 42 | demangled = None #demangle(name) 43 | ftype = FUNCTION_TYPES['NORMAL'] #NORMAL 44 | if addr in module_data.loader.imports_by_addr: 45 | ftype = FUNCTION_TYPES['IMPORTED'] 46 | elif addr in module_data.thunks: 47 | ftype = FUNCTION_TYPES['THUNK'] 48 | #not working yet, but if a function has a stack frame, it will be named: "__SF%x" % addr 49 | stkframe = None 50 | if ("__SF%x" % addr) in module_data.types: 51 | stkframe = module_data.types["__SF%x" % addr].id 52 | curs.execute("insert into ex_%d_functions values (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s);" % id, 53 | (addr, name, demangled, named, ftype, module_data.loader.name, stkframe, None)) 54 | 55 | def add_instructions(self, curs, id, module_data): 56 | for addr in module_data.visited: 57 | insn = module_data.insts[addr] 58 | curs.execute("insert into ex_%d_instructions values (%%s, %%s, %%s);" % id, (addr, insn.mnemonic, insn.bytes)) 59 | 60 | #called from inside a with block already, so take a cursor from the caller 61 | #computes the basic block members from the given start address 62 | def add_basic_block_instructions(self, curs, id, module_data): 63 | for addr in module_data.visited: 64 | i = module_data.insts[addr] 65 | if hasattr(i, "bb"): 66 | for b in i.bb: 67 | curs.execute("insert into ex_%d_basic_block_instructions values (%%s, %%s, %%s);" % id, (b.bid, addr, b.seq)) 68 | 69 | # also, build the cfg while we're at it 70 | def add_basic_blocks(self, curs, id, module_data): 71 | for addr,bb in module_data.basic_blocks.iteritems(): 72 | for block in bb: 73 | curs.execute("insert into ex_%d_basic_blocks values (%%s, %%s, %%s);" % id, (block[0], block[1], addr)) 74 | 75 | def drop_table(self, curs, table): 76 | curs.execute("drop table if exists %s cascade;" % table) 77 | 78 | def delete_raw_module(self, curs, id): 79 | self.drop_table(curs, "ex_%d_address_comments" % id) 80 | self.drop_table(curs, "ex_%d_address_references" % id) 81 | self.drop_table(curs, "ex_%d_expression_substitutions" % id) 82 | self.drop_table(curs, "ex_%d_operands" % id) 83 | self.drop_table(curs, "ex_%d_expression_tree_nodes" % id) 84 |
self.drop_table(curs, "ex_%d_expression_trees" % id) 85 | self.drop_table(curs, "ex_%d_expression_nodes" % id) 86 | self.drop_table(curs, "ex_%d_control_flow_graphs" % id) 87 | self.drop_table(curs, "ex_%d_callgraph" % id) 88 | self.drop_table(curs, "ex_%d_basic_block_instructions" % id) 89 | self.drop_table(curs, "ex_%d_instructions" % id) 90 | self.drop_table(curs, "ex_%d_basic_blocks" % id) 91 | self.drop_table(curs, "ex_%d_functions" % id) 92 | self.drop_table(curs, "ex_%d_type_renderers" % id) 93 | self.drop_table(curs, "ex_%d_base_types" % id) 94 | self.drop_table(curs, "ex_%d_expression_type_instances" % id) 95 | self.drop_table(curs, "ex_%d_expression_types" % id) 96 | self.drop_table(curs, "ex_%d_types" % id) 97 | self.drop_table(curs, "ex_%d_type_instances" % id) 98 | self.drop_table(curs, "ex_%d_sections" % id) 99 | self.drop_table(curs, "ex_%d_type_substitution_paths" % id) 100 | 101 | def create_raw_module(self, curs, id): 102 | curs.execute('create table ex_%d_functions ("address" bigint not null, "name" text not null,"demangled_name" text null default null,"has_real_name" boolean not null,"type" int not null default 0 check( "type" in ( 0, 1, 2, 3, 4 )),"module_name" text null default null,"stack_frame" int null default null,"prototype" int null default null);' % id) 103 | curs.execute('create table ex_%d_basic_blocks ("id" int not null,"parent_function" bigint not null,"address" bigint not null);' % id) 104 | curs.execute('create table ex_%d_instructions ("address" bigint not null,"mnemonic" varchar( 32 ) not null,"data" bytea not null);' % id) 105 | curs.execute('create table ex_%d_basic_block_instructions ("basic_block_id" int not null,"instruction" bigint not null,"sequence" int not null);' % id) 106 | curs.execute('create table ex_%d_callgraph ("id" serial,"source" bigint not null,"source_basic_block_id" int not null,"source_address" bigint not null,"destination" bigint not null);' % id) 107 | curs.execute('create table ex_%d_control_flow_graphs ("id" serial,"parent_function" bigint not null,"source" int not null,"destination" int not null,"type" int not null default 0 check( "type" in ( 0, 1, 2, 3 )));' % id) 108 | curs.execute('create table ex_%d_expression_trees ("id" serial);' % id) 109 | curs.execute('create table ex_%d_expression_nodes ("id" serial,"type" int not null default 0 check( "type" >= 0 and "type" <= 7 ),"symbol" varchar( 256 ),"immediate" bigint,"position" int,"parent_id" int check( "id" > "parent_id" ));' % id) 110 | curs.execute('create table ex_%d_expression_tree_nodes ("expression_tree_id" int not null,"expression_node_id" int not null);' % id) 111 | curs.execute('create table ex_%d_operands ("address" bigint not null,"expression_tree_id" int not null,"position" int not null);' % id) 112 | curs.execute('create table ex_%d_expression_substitutions ("id" serial,"address" bigint not null,"position" int not null,"expression_node_id" int not null,"replacement" text not null);' % id) 113 | curs.execute('create table ex_%d_address_references ("address" bigint not null,"position" int null,"expression_node_id" int null,"destination" bigint not null,"type" int not null default 0 check( "type" >= 0 and "type" <= 8 ));' % id) 114 | curs.execute('create table ex_%d_address_comments ("address" bigint not null,"comment" text not null);' % id) 115 | curs.execute('drop type if exists ex_%d_type_category;' % id) 116 | curs.execute("create type ex_%d_type_category as enum ('atomic', 'pointer', 'array','struct', 'union', 'function_pointer');" % id) 117 | 
curs.execute('create table ex_%d_base_types ("id" integer not null,"name" text not null,"size" integer not null,"pointer" integer,"signed" bool,"category" ex_%d_type_category not null);' % (id, id)) 118 | curs.execute('create table ex_%d_types ("id" serial not null,"name" text not null,"base_type" integer not null,"parent_id" integer,"offset" integer,"argument" integer,"number_of_elements" integer);' % id) 119 | curs.execute('drop type if exists ex_%d_type_renderers_renderer_type;' % id) 120 | curs.execute("create type ex_%d_type_renderers_renderer_type as enum ('integer','floating point', 'boolean', 'ascii', 'utf8', 'utf16');" % id) 121 | curs.execute('create table ex_%d_type_renderers ("type_id" int not null,"renderer" ex_%d_type_renderers_renderer_type not null);' % (id, id)) 122 | curs.execute('drop type if exists ex_%d_section_permission_type;' % id) 123 | curs.execute("create type ex_%d_section_permission_type as enum ('READ', 'WRITE','EXECUTE', 'READ_WRITE', 'READ_EXECUTE', 'WRITE_EXECUTE','READ_WRITE_EXECUTE');" % id) 124 | curs.execute('create table ex_%d_sections ("id" serial not null,"name" text not null,"start_address" bigint not null,"end_address" bigint not null,"permission" ex_%d_section_permission_type not null,"data" bytea not null);' % (id, id)) 125 | curs.execute('create table ex_%d_expression_types ("address" bigint not null,"position" integer not null,"expression_id" integer not null,"type" integer not null,"path" integer[] not null,"offset" integer);' % id) 126 | curs.execute('create table ex_%d_expression_type_instances ("address" bigint not null,"position" integer not null,"expression_node_id" integer not null,"type_instance_id" integer not null);' % id) 127 | curs.execute('create table ex_%d_type_instances ("id" integer not null,"name" text not null,"section_offset" bigint not null,"type_id" integer not null,"section_id" integer not null);' % id) 128 | curs.execute('create table ex_%d_type_substitution_paths ("id" integer not null,"child_id" integer,"type_id" integer not null);' % id) 129 | 130 | def vaccuum_raw_tables(self, id): 131 | try: 132 | with self.conn as conn: 133 | old_iso = conn.isolation_level 134 | conn.set_isolation_level(0) 135 | with conn.cursor() as curs: 136 | curs.execute('vacuum analyze "ex_%d_operands";' % id) 137 | curs.execute('vacuum analyze "ex_%d_functions";' % id) 138 | curs.execute('vacuum analyze "ex_%d_basic_blocks";' % id) 139 | curs.execute('vacuum analyze "ex_%d_instructions";' % id) 140 | curs.execute('vacuum analyze "ex_%d_basic_block_instructions";' % id) 141 | curs.execute('vacuum analyze "ex_%d_callgraph";' % id) 142 | curs.execute('vacuum analyze "ex_%d_control_flow_graphs";' % id) 143 | curs.execute('vacuum analyze "ex_%d_expression_trees";' % id) 144 | curs.execute('vacuum analyze "ex_%d_expression_nodes";' % id) 145 | curs.execute('vacuum analyze "ex_%d_expression_tree_nodes";' % id) 146 | curs.execute('vacuum analyze "ex_%d_expression_substitutions";' % id) 147 | curs.execute('vacuum analyze "ex_%d_address_references";' % id) 148 | curs.execute('vacuum analyze "ex_%d_address_comments";' % id) 149 | curs.execute('vacuum analyze "ex_%d_type_renderers";' % id) 150 | curs.execute('vacuum analyze "ex_%d_base_types";' % id) 151 | curs.execute('vacuum analyze "ex_%d_types";' % id) 152 | curs.execute('vacuum analyze "ex_%d_expression_types";' % id) 153 | curs.execute('vacuum analyze "ex_%d_sections";' % id) 154 | conn.set_isolation_level(old_iso) 155 | except psycopg2.Error, p: 156 | print "vaccuum_raw_tables: %s" % p.message 157 
| raise p 158 | 159 | def create_raw_indicies(self, curs, id): 160 | curs.execute('create unique index ex_%d_functions_address_idx on ex_%d_functions( "address" );' % (id, id)) 161 | curs.execute('create unique index ex_%d_basic_blocks_id_idx on ex_%d_basic_blocks( "id" );' % (id, id)) 162 | curs.execute('create index ex_%d_basic_blocks_address_idx on ex_%d_basic_blocks( "address" );' % (id, id)) 163 | curs.execute('create unique index ex_%d_instructions_address_idx on ex_%d_instructions( "address" );' % (id, id)) 164 | curs.execute('create unique index ex_%d_expression_trees_id_idx on ex_%d_expression_trees( "id" );' % (id, id)) 165 | curs.execute('create unique index ex_%d_expression_nodes_id_idx on ex_%d_expression_nodes( "id" );' % (id, id)) 166 | 167 | def delete_cleanup(self, curs, id): 168 | curs.execute("delete from ex_%d_instructions as instructions using ex_%d_basic_block_instructions as basic_block_instructions where basic_block_instructions.instruction = instructions.address and basic_block_id is null;" % (id, id)) 169 | curs.execute("delete from ex_%d_basic_block_instructions where basic_block_id is null;" % id) 170 | curs.execute("delete from ex_%d_address_references where address in ( select address from ex_%d_address_references except select address from ex_%d_instructions);" % (id, id, id)) 171 | curs.execute("delete from ex_%d_address_comments where address in ( select address from ex_%d_address_comments except select address from ex_%d_instructions);" % (id, id, id)) 172 | curs.execute("delete from ex_%d_expression_substitutions where address in ( select address from ex_%d_expression_substitutions except select address from ex_%d_instructions);" % (id, id, id)) 173 | curs.execute("delete from ex_%d_operands where address in ( select address from ex_%d_operands except select address from ex_%d_instructions);" % (id, id, id)) 174 | curs.execute("delete from ex_%d_expression_type_instances where address in ( select address from ex_%d_expression_type_instances except select address from ex_%d_operands);" % (id, id, id)) 175 | 176 | def create_raw_keys(self, curs, id): 177 | curs.execute('alter table ex_%d_functions add primary key( "address" );' % id) 178 | curs.execute('alter table ex_%d_basic_blocks add primary key( "id" );' % id) 179 | curs.execute('alter table ex_%d_basic_blocks add constraint ex_%d_basic_blocks_parent_function_fkey foreign key ( "parent_function" ) references ex_%d_functions( "address" ) on delete cascade on update cascade;' % (id, id, id)) 180 | curs.execute('alter table ex_%d_instructions add primary key( "address" );' % id) 181 | curs.execute('alter table ex_%d_basic_block_instructions add constraint ex_%d_basic_block_instructions_bb_fkey foreign key ( "basic_block_id" ) references ex_%d_basic_blocks( "id" ) on delete cascade on update cascade;' % (id, id, id)) 182 | curs.execute('alter table ex_%d_basic_block_instructions add constraint ex_%d_basic_block_instructions_ins_fkey foreign key ( "instruction" ) references ex_%d_instructions( "address" ) on delete cascade on update cascade;' % (id, id, id)) 183 | curs.execute('alter table ex_%d_callgraph add primary key( "id" );' % id) 184 | curs.execute('alter table ex_%d_callgraph add constraint ex_%d_callgraph_source_fkey foreign key ( "source" ) references ex_%d_functions( "address" ) on delete cascade on update cascade;' % (id, id, id)) 185 | curs.execute('alter table ex_%d_callgraph add constraint ex_%d_callgraph_destination_fkey foreign key ( "destination" ) references ex_%d_functions( "address" ) 
on delete cascade on update cascade;' % (id, id, id)) 186 | curs.execute('alter table ex_%d_callgraph add constraint ex_%d_callgraph_source_basic_block_id_fkey foreign key ( "source_basic_block_id" ) references ex_%d_basic_blocks( "id" ) on delete cascade on update cascade;' % (id, id, id)) 187 | curs.execute('alter table ex_%d_callgraph add constraint ex_%d_callgraph_source_address_fkey foreign key ( "source_address" ) references ex_%d_instructions( "address" ) on delete cascade on update cascade;' % (id, id, id)) 188 | curs.execute('alter table ex_%d_control_flow_graphs add primary key( "id" );' % id) 189 | curs.execute('alter table ex_%d_control_flow_graphs add constraint ex_%d_control_flow_graphs_parent_function_fkey foreign key ( "parent_function" ) references ex_%d_functions( "address" ) on delete cascade on update cascade;' % (id, id, id)) 190 | curs.execute('alter table ex_%d_control_flow_graphs add constraint ex_%d_control_flow_graphs_source_fkey foreign key ( "source" ) references ex_%d_basic_blocks( "id" ) on delete cascade on update cascade;' % (id, id, id)) 191 | curs.execute('alter table ex_%d_control_flow_graphs add constraint ex_%d_control_flow_graphs_destination_fkey foreign key ( "destination" ) references ex_%d_basic_blocks( "id" ) on delete cascade on update cascade;' % (id, id, id)) 192 | curs.execute('alter table ex_%d_expression_trees add primary key( "id" );' % id) 193 | curs.execute('alter table ex_%d_expression_nodes add primary key( "id" );' % id) 194 | curs.execute('alter table ex_%d_expression_nodes add constraint ex_%d_expression_nodes_parent_id_fkey foreign key ( "parent_id" ) references ex_%d_expression_nodes( "id" ) on delete cascade on update cascade;' % (id, id, id)) 195 | curs.execute('alter table ex_%d_expression_tree_nodes add constraint ex_%d_expression_tree_nodes_expression_tree_id_fkey foreign key ( "expression_tree_id" ) references ex_%d_expression_trees( "id" ) on delete cascade on update cascade;' % (id, id, id)) 196 | curs.execute('alter table ex_%d_expression_tree_nodes add constraint ex_%d_expression_tree_nodes_expression_node_id_fkey foreign key ( "expression_node_id" ) references ex_%d_expression_nodes( "id" ) on delete cascade on update cascade;' % (id, id, id)) 197 | curs.execute('alter table ex_%d_operands add primary key ( "address", "position" );' % id) 198 | curs.execute('alter table ex_%d_operands add constraint ex_%d_operands_expression_tree_id_fkey foreign key ( "expression_tree_id" ) references ex_%d_expression_trees( "id" ) on delete cascade on update cascade;' % (id, id, id)) 199 | curs.execute('alter table ex_%d_operands add constraint ex_%d_operands_address_fkey foreign key ( "address" ) references ex_%d_instructions( "address" ) on delete cascade on update cascade;' % (id, id, id)) 200 | curs.execute('alter table ex_%d_expression_substitutions add constraint ex_%d_expression_substitutions_address_position_fkey foreign key ( "address", "position" ) references ex_%d_operands( "address", "position" ) on delete cascade on update cascade;' % (id, id, id)) 201 | curs.execute('alter table ex_%d_expression_substitutions add constraint ex_%d_expression_substitutions_expression_node_id_fkey foreign key ( "expression_node_id" ) references ex_%d_expression_nodes( "id" ) on delete cascade on update cascade;' % (id, id, id)) 202 | curs.execute('alter table ex_%d_address_references add constraint ex_%d_address_references_address_position foreign key ( "address", "position" ) references ex_%d_operands( "address", "position" ) on delete 
cascade on update cascade;' % (id, id, id)) 203 | curs.execute('alter table ex_%d_address_references add constraint ex_%d_address_references_expression_node_id_fkey foreign key ( "expression_node_id" ) references ex_%d_expression_nodes( "id" ) on delete cascade on update cascade;' % (id, id, id)) 204 | curs.execute('alter table ex_%d_base_types add primary key ( "id" );' % id) 205 | curs.execute('alter table ex_%d_base_types add constraint ex_%d_base_types_pointer_fkey foreign key ( "pointer" ) references ex_%d_base_types( "id" ) on delete cascade on update cascade deferrable initially deferred;' % (id, id, id)) 206 | curs.execute('alter table ex_%d_types add primary key ( "id");' % id) 207 | curs.execute('alter table ex_%d_types add constraint ex_%d_types_parent_id_fkey foreign key ( "parent_id" ) references ex_%d_base_types ( "id" ) on delete cascade on update cascade deferrable initially deferred;' % (id, id, id)) 208 | curs.execute('alter table ex_%d_types add constraint ex_%d_types_base_type_fkey foreign key ( "base_type" ) references ex_%d_base_types ( "id" ) on delete cascade on update cascade;' % (id, id, id)) 209 | curs.execute('alter table ex_%d_expression_types add primary key ( "address", "position", "expression_id" );' % id) 210 | curs.execute('alter table ex_%d_expression_types add constraint ex_%d_expression_type_type_fkey foreign key ( "type" ) references ex_%d_base_types ( "id" ) on update no action on delete cascade deferrable initially deferred;' % (id, id, id)) 211 | curs.execute('alter table ex_%d_sections add primary key ( "id" );' % id) 212 | curs.execute('alter table ex_%d_type_instances add primary key ( "id" );' % id) 213 | curs.execute('alter table ex_%d_type_instances add constraint ex_%d_type_instances_type_id_fkey foreign key ( "type_id" ) references ex_%d_base_types ( "id" ) match simple on update cascade on delete cascade;' % (id, id, id)) 214 | curs.execute('alter table ex_%d_type_instances add constraint ex_%d_type_instances_section_id_fkey foreign key ( "section_id" ) references ex_%d_sections ( "id" ) match simple on update cascade on delete cascade;' % (id, id, id)) 215 | curs.execute('alter table ex_%d_expression_type_instances add primary key ( "address", "position", "expression_node_id" );' % id) 216 | curs.execute('alter table ex_%d_expression_type_instances add constraint ex_%d_expression_type_instances_type_instance_id_fkey foreign key ( "type_instance_id" ) references ex_%d_type_instances ( "id" ) match simple on update cascade on delete cascade;' % (id, id, id)) 217 | curs.execute('alter table ex_%d_expression_type_instances add constraint ex_%d_expression_type_instances_address_position_fkey foreign key ( "address", "position" ) references ex_%d_operands ( "address", "position" ) match simple on update cascade on delete cascade;' % (id, id, id)) 218 | curs.execute('alter table ex_%d_expression_type_instances add constraint ex_%d_expression_type_instances_expression_node_id_fkey foreign key ( "expression_node_id" ) references ex_%d_expression_nodes ( "id" ) match simple on update cascade on delete cascade;' % (id, id, id)) 219 | 220 | def has_table(self, curs, table): 221 | result = False 222 | try: 223 | curs.execute("SELECT relname FROM pg_class WHERE relname = %s;", (table, )) 224 | result = curs.rowcount == 1 225 | except psycopg2.Error, p: 226 | print "has_table: %s" % p.message 227 | raise p 228 | return result 229 | 230 | def need_pg_init(self, curs): 231 | try: 232 | curs.execute('''SELECT count(*) FROM pg_class WHERE relname in 
('bn_projects','bn_modules', 233 | 'bn_address_spaces','bn_space_modules','bn_functions','bn_function_views','bn_instructions', 234 | 'bn_operands','bn_expression_tree','bn_expression_tree_ids','bn_expression_tree_mapping', 235 | 'bn_code_nodes','bn_codenode_instructions','bn_edges','bn_edge_paths','bn_function_nodes', 236 | 'bn_group_nodes','bn_nodes','bn_project_settings','bn_module_settings','bn_traces','bn_trace_events', 237 | 'bn_trace_event_values','bn_views','bn_module_views','bn_project_views','bn_view_settings', 238 | 'bn_global_edge_comments','bn_global_node_comments','bn_project_debuggers','bn_debuggers', 239 | 'bn_tags','bn_tagged_views','bn_tagged_nodes','bn_expression_substitutions','bn_comments', 240 | 'bn_comments_audit','bn_types','bn_base_types','bn_users','bn_expression_types')''') 241 | res = curs.fetchone()[0] 242 | return res != 41 243 | except psycopg2.Error, p: 244 | print "need_pg_init: %s" % p.message 245 | raise p 246 | return True 247 | 248 | def create_modules_table(self): 249 | try: 250 | with self.conn as conn: 251 | with conn.cursor() as curs: 252 | query = ("CREATE TABLE modules (" 253 | " id serial, " 254 | " name text NOT NULL, " 255 | " architecture varchar( 32 ) NOT NULL, " 256 | " base_address bigint NOT NULL, " 257 | " exporter varchar( 256 ) NOT NULL, " 258 | " version int NOT NULL, " 259 | " md5 char( 32 ) NOT NULL, " 260 | " sha1 char( 40 ) NOT NULL, " 261 | " comment TEXT, " 262 | " import_time timestamp NOT NULL DEFAULT current_timestamp, " 263 | " PRIMARY KEY (id));") 264 | curs.execute(query) 265 | except psycopg2.Error, p: 266 | print "create_modules_table: %s" % p.message 267 | raise p 268 | 269 | def delete_module(self, id): 270 | try: 271 | with self.conn as conn: 272 | with conn.cursor() as curs: 273 | curs.execute("delete from modules where id = %s;", (id, )) 274 | self.delete_raw_module(curs, id) 275 | except psycopg2.Error, p: 276 | print "delete_module: %s" % p.message 277 | raise p 278 | 279 | def insert_module(self, id, module_data): 280 | try: 281 | with self.conn as conn: 282 | with conn.cursor() as curs: 283 | curs.execute("insert into modules values(%s, %s, %s, %s, %s, %s, %s, %s, %s, now());", 284 | (id, module_data.loader.name, module_data.loader.arch_name, module_data.loader.image_base, 'infiltrated', 0, 285 | module_data.loader.md5, module_data.loader.sha1, module_data.comment)) 286 | except psycopg2.Error, p: 287 | print "insert_module: %s" % p.message 288 | raise p 289 | 290 | def add_sections(self, curs, id, module_data): 291 | for s in module_data.loader.sections: 292 | raw = s.get_raw_bytes(module_data.loader) 293 | if raw is not None: 294 | curs.execute(("insert into ex_%d_sections" 295 | "(name, start_address, end_address, permission, data)" 296 | " values (%%s, %%s, %%s, %%s, %%s);" % id), 297 | (s.name, s.start, s.end, bn_disasm.PERMISSIONS[s.perms], bytearray(raw))) 298 | 299 | def add_operands(self, curs, id, module_data): 300 | for addr in module_data.visited: 301 | op_exprs = module_data.operands[addr] 302 | opnum = 0 303 | ''' 304 | if not hasattr(insn, "op_exprs"): 305 | print "Missing op_exprs for 0x%x" % insn.address 306 | continue 307 | if insn.op_exprs is None: 308 | print "op_exprs == None for 0x%x" % insn.address 309 | continue 310 | ''' 311 | for expr in op_exprs: 312 | curs.execute(("insert into ex_%d_operands" 313 | "(address, expression_tree_id, position)" 314 | " values (%%s, %%s, %%s);" % id), 315 | (addr, expr, opnum)) 316 | opnum += 1 317 | 318 | def add_nodes(self, curs, id, module_data, nodes, 
parent): 319 | for key,value in nodes.iteritems(): 320 | node = value[0] 321 | val = None 322 | if node.op_type == bn_disasm.IMMEDIATE_INT: 323 | val = node.value 324 | if node.value in module_data.names: 325 | key = module_data.names[node.value] 326 | else: 327 | key = None 328 | curs.execute(("insert into ex_%d_expression_nodes" 329 | "(id, type, symbol, immediate, position, parent_id)" 330 | " values (%%s, %%s, %%s, %%s, %%s, %%s);" % id), 331 | (node.node_id, node.op_type % 10, key, val, node.pos, parent)) 332 | for pos,op in value[1].iteritems(): 333 | self.add_nodes(curs, id, module_data, op, node.node_id) 334 | 335 | def add_trees(self, curs, id, module_data): 336 | for expr in module_data.exprs.keys(): 337 | curs.execute(("insert into ex_%d_expression_trees" 338 | "(id)" 339 | " values (%%s);" % id), 340 | (expr, )) 341 | 342 | def add_tree_nodes(self, curs, id, module_data): 343 | for expr,nodes in module_data.exprs.iteritems(): 344 | for n in nodes: 345 | curs.execute(("insert into ex_%d_expression_tree_nodes" 346 | "(expression_tree_id, expression_node_id)" 347 | " values (%%s, %%s);" % id), 348 | (expr, n)) 349 | 350 | def add_types(self, curs, id, module_data): 351 | for name,btype in module_data.types.iteritems(): 352 | curs.execute(("insert into ex_%d_base_types" 353 | "(id, name, size, pointer, signed, category)" 354 | " values (%%s, %%s, %%s, %%s, %%s, %%s);" % id), 355 | (btype.id, btype.name, btype.size, btype.pointer, btype.signed, bn_disasm.TYPE_CATEGORIES[btype.category])) 356 | 357 | def add_arefs(self, curs, id, module_data): 358 | for aref in module_data.arefs: 359 | curs.execute(("insert into ex_%d_address_references" 360 | "(address, position, expression_node_id, destination, type)" 361 | " values (%%s, %%s, %%s, %%s, %%s);" % id), 362 | (aref.addr, aref.pos, aref.node_id, aref.dest, aref.rtype)) 363 | 364 | def add_module(self, module_data): 365 | try: 366 | id = 0 367 | with self.conn as conn: 368 | with conn.cursor() as curs: 369 | curs.execute("select coalesce(max(id), 0) + 1 from modules;") 370 | id = curs.fetchone()[0] 371 | self.insert_module(id, module_data) 372 | 373 | with self.conn as conn: 374 | with conn.cursor() as curs: 375 | # ordering as binnavi's Ida plugin seems to 376 | #begin is here 377 | self.delete_raw_module(curs, id) 378 | self.create_raw_module(curs, id) 379 | #binnavi then adds sections here 380 | sys.stderr.write("add_sections\n") 381 | self.add_sections(curs, id, module_data) 382 | 383 | #next binnavi inserts into base_types table 384 | # some basic types, then enumerates IDA's structs window, then adds types for all functions ('struct' ???) 
385 | # (1,'BYTE',8,181,true,'atomic'), 386 | # (2,'WORD',16,181,true,'atomic'), 387 | # (3,'DWORD',32,181,true,'atomic'), 388 | # (4,'QWORD',64,null,true,'atomic'), 389 | # (5,'void',32,181,false,'atomic'), 390 | # (6,'void *',32,5,false,'atomic') 391 | sys.stderr.write("add_types\n") 392 | self.add_types(curs, id, module_data) 393 | #next binnavi inserts into types table 394 | #next into expression_types 395 | # type_instances 396 | # expression_type_instances 397 | # address_comments 398 | 399 | sys.stderr.write("add_operands\n") 400 | self.add_operands(curs, id, module_data) 401 | sys.stderr.write("add_instructions\n") 402 | self.add_instructions(curs, id, module_data) 403 | #functions must have non-null stack_frame 404 | sys.stderr.write("add_functions\n") 405 | self.add_functions(curs, id, module_data) 406 | sys.stderr.write("add_basic_blocks\n") 407 | self.add_basic_blocks(curs, id, module_data) 408 | # basic_block_instructions 409 | self.add_basic_block_instructions(curs, id, module_data) 410 | 411 | cfg_query = ("insert into ex_%d_control_flow_graphs" 412 | "(parent_function, source, destination, type)" 413 | " values (%%s, %%s, %%s, %%s);") % id 414 | for edge in module_data.cfg: 415 | curs.execute(cfg_query, (edge.parent_func, edge.src_bb, edge.dest_bb, edge.edge_type)) 416 | 417 | cg_query = ("insert into ex_%d_callgraph" 418 | "(source, source_basic_block_id, source_address, destination)" 419 | " values (%%s, %%s, %%s, %%s);") % id 420 | for edge in module_data.callgraph: 421 | curs.execute(cg_query, (edge.src_func, edge.src_bb, edge.src_addr, edge.dest)) 422 | 423 | sys.stderr.write("add_nodes\n") 424 | self.add_nodes(curs, id, module_data, module_data.nodes, None) 425 | # expression_trees 426 | sys.stderr.write("add_trees\n") 427 | self.add_trees(curs, id, module_data) 428 | # expression_tree_nodes 429 | sys.stderr.write("add_tree_nodes\n") 430 | self.add_tree_nodes(curs, id, module_data) 431 | 432 | #create indicies 433 | self.create_raw_indicies(curs, id) 434 | 435 | # expression_substitutions 436 | # address_references 437 | self.add_arefs(curs, id, module_data) 438 | 439 | #next a number of delete queries are executed 440 | self.delete_cleanup(curs, id) 441 | #now add indicies/foreign keys on all tables that need them 442 | self.create_raw_keys(curs, id) 443 | #commit is here 444 | 445 | self.vaccuum_raw_tables(id) 446 | 447 | return id 448 | except psycopg2.Error, p: 449 | traceback.print_exc() 450 | print "add_module: %s" % p.message 451 | raise p 452 | return -1 453 | 454 | def create_empty_tables(self): 455 | try: 456 | with self.conn as conn: 457 | with conn.cursor() as curs: 458 | if not self.has_table(curs, "modules"): 459 | query = ("CREATE TABLE modules (" 460 | " id serial, " 461 | " name text NOT NULL, " 462 | " architecture varchar( 32 ) NOT NULL, " 463 | " base_address bigint NOT NULL, " 464 | " exporter varchar( 256 ) NOT NULL, " 465 | " version int NOT NULL, " 466 | " md5 char( 32 ) NOT NULL, " 467 | " sha1 char( 40 ) NOT NULL, " 468 | " comment TEXT, " 469 | " import_time timestamp NOT NULL DEFAULT current_timestamp, " 470 | " PRIMARY KEY (id));") 471 | curs.execute(query) 472 | 473 | if self.need_pg_init(curs): 474 | with open('postgresql_tables.sql') as sql: 475 | build_tables = sql.read() 476 | curs.execute(build_tables) 477 | curs.execute("INSERT INTO bn_users VALUES (DEFAULT, 'identity', null, null);") 478 | except psycopg2.Error, p: 479 | print "create_empty_tables: %s" % p.message 480 | 
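For orientation, the class above is driven in two steps: construct binnavi_db() (which connects and, if needed, creates the shared modules/bn_* tables), then hand it a populated bn_disasm.Disassembly via export(), which allocates a module id and writes the per-module ex_<id>_* raw tables in the same order binnavi's IDA plugin does. A minimal sketch, assuming you already have a populated Disassembly instance (producing one is the job of fREedom.py and the loaders):

```python
# Sketch only: module_data is assumed to be a fully populated
# bn_disasm.Disassembly (visited/insts/operands/basic_blocks/... filled in).
import binnavi_db

def export_module(module_data, dbname, user, passwd, host='127.0.0.1'):
    # Connects and lazily creates the shared binnavi tables (modules, bn_*).
    db = binnavi_db.binnavi_db(dbname, user, passwd, host)
    # Picks the next module id, (re)creates the ex_<id>_* raw tables, fills
    # them (sections, types, operands, instructions, functions, basic blocks,
    # CFG/callgraph, expression trees), then adds indices and foreign keys,
    # runs the cleanup deletes and vacuums. Returns the new module id.
    return db.export(module_data)
```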
-------------------------------------------------------------------------------- /bn_disasm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | The disassembly engine for a stand-alone binnavi compatible disassembler 5 | ''' 6 | 7 | __author__ = "Chris Eagle" 8 | __copyright__ = "Copyright 2015, Chris Eagle" 9 | __credits__ = ["Chris Eagle"] 10 | __license__ = "GPL" 11 | __version__ = "2.0" 12 | __maintainer__ = "Chris Eagle" 13 | __email__ = "cseagle@gmail.com" 14 | __status__ = "Use at your own risk" 15 | 16 | import os 17 | import hashlib 18 | import sys 19 | import capstone 20 | import loader 21 | 22 | XR_FLOW = 1 23 | XR_CALL = 2 24 | XR_JUMP = 3 25 | XR_JCC = 4 26 | 27 | CONDITION_TRUE = 0 28 | CONDITION_FALSE = 1 29 | UNCONDITIONAL = 2 30 | SWITCH = 3 31 | CALL_DIRECT = 4 32 | CALL_INDIRECT = 5 33 | CALL_VIRTUAL = 6 34 | DATA = 7 35 | DATA_STRING = 8 36 | 37 | AREF_TYPES = { 38 | 0:'conditional_true', 39 | 1:'conditional_false', 40 | 2:'unconditional', 41 | 3:'switch', 42 | 4:'call_direct', 43 | 5:'call_indirect', 44 | 6:'call_virtual', 45 | 7:'data', 46 | 8:'data_string' 47 | } 48 | 49 | PERMISSIONS = { 50 | 1:'READ', 51 | 2:'WRITE', 52 | 4:'EXECUTE', 53 | 3:'READ_WRITE', 54 | 5:'READ_EXECUTE', 55 | 6:'WRITE_EXECUTE', 56 | 7:'READ_WRITE_EXECUTE' 57 | } 58 | 59 | NO_TYPE = 0 60 | SYMBOL = 1 # String to be displayed. 61 | IMMEDIATE_INT = 2 62 | IMMEDIATE_FLOAT = 3 63 | OPERATOR = 4 # '+', '*' etc. 64 | REGISTER = 5 65 | SIZE_PREFIX = 6 # 'B4, 'B8', etc. 66 | DEREFERENCE = 7 67 | 68 | ATOMIC = 0 69 | POINTER = 1 70 | ARRAY = 2 71 | STRUCT = 3 72 | UNION = 4 73 | FUNCTION_POINTER = 5 74 | 75 | TYPE_CATEGORIES = { 76 | 0:'atomic', 77 | 1:'pointer', 78 | 2:'array', 79 | 3:'struct', 80 | 4:'union', 81 | 5:'function_pointer', 82 | } 83 | 84 | #do the xrefs in the given list describe a conditional jump 85 | def is_conditional(xrefs): 86 | if len(xrefs) != 2: 87 | return False 88 | return (xrefs[0][1] == XR_JCC or xrefs[1][1] == XR_JCC) 89 | 90 | # return as (False target, True target) 91 | def get_conditional_targets(xrefs): 92 | if len(xrefs) != 2: 93 | return None 94 | if xrefs[0][1] == XR_FLOW: 95 | return (xrefs[0][0], xrefs[1][0]) 96 | return (xrefs[1][0], xrefs[0][0]) 97 | 98 | # return as (return target, call target) 99 | def get_call_targets(xrefs): 100 | if len(xrefs) != 2: 101 | return None 102 | if xrefs[0][1] == XR_FLOW: 103 | return (xrefs[0][0], xrefs[1][0]) 104 | return (xrefs[1][0], xrefs[0][0]) 105 | 106 | class OpNode(object): 107 | 108 | def __init__(self, op_type, value): 109 | self.op_type = op_type 110 | self.value = value 111 | self.node_id = 0 112 | self.pos = 0 113 | 114 | class Operand(object): 115 | 116 | def __init__(self, addr, expr, pos): 117 | self.addr = addr 118 | self.expr = expr 119 | self.pos = pos 120 | 121 | class AddressRef(object): 122 | 123 | def __init__(self, addr, pos, node_id, dest, rtype): 124 | self.addr = addr 125 | self.pos = pos 126 | self.node_id = node_id 127 | self.dest = dest 128 | self.rtype = rtype 129 | 130 | class TypeInfo(object): 131 | 132 | def __init__(self, id, name, size, pointer, signed, category): 133 | self.id = id 134 | self.name = name 135 | self.size = size 136 | self.pointer = pointer 137 | self.signed = signed 138 | self.category = category 139 | 140 | class BlockInfo(object): 141 | def __init__(self, bid, seq, func_addr): 142 | self.bid = bid 143 | self.seq = seq 144 | self.func = func_addr 145 | 146 | #callgraph edge 147 | class 
CG_Edge(object): 148 | def __init__(self, src_func, src_bb, src_addr, dest): 149 | self.src_func = src_func 150 | self.src_bb = src_bb 151 | self.src_addr = src_addr 152 | self.dest = dest 153 | 154 | #control flow graph edge 155 | class CFG_Edge(object): 156 | def __init__(self, parent_func, src_bb, dest_bb, edge_type): 157 | self.parent_func = parent_func 158 | self.src_bb = src_bb 159 | self.dest_bb = dest_bb 160 | self.edge_type = edge_type 161 | 162 | class Disassembly(object): 163 | 164 | def __init__(self, loader): 165 | self.loader = loader 166 | 167 | self.comment = '' 168 | 169 | self.locs = [] # addr - to be visited 170 | self.visited = set() # addr - instructions we have actually examined 171 | self.insts = {} # addr:cs.CsInsn - cache of disassembled instructions 172 | self.names = {} # addr:string 173 | self.jmp_targets = set() # addr 174 | self.call_targets = set() # addr 175 | self.xrefs_to = {} # addr:list of (int,int) (addr,type) 176 | self.xrefs_from = {} # addr:list of (int,int) (addr,type) 177 | self.thunks = set() # addr 178 | 179 | self.bb_id = 0 180 | self.basic_block_starts = set() # start address for basic blocks 181 | self.basic_blocks = {} # addr:[(int, int)] block_start:[(block_id, parent_func)] 182 | 183 | self.callgraph = [] # CG_Edge 184 | self.cfg = [] # CFG_Edge 185 | 186 | self.nodes = {} # {str:tuple} tuple is node,{str:tuple} 187 | self.exprs = {} # int:[] int expression_id, list of nodes in expression 188 | self.expr_strings = {} # string representations of expressions : expr_id 189 | self.node_id = 0 190 | self.expr_id = 0 191 | self.operands = {} # addr:[int] instruction address -> list of operand expressions 192 | self.arefs = [] # AddressRef 193 | self.type_id = 0 194 | self.types = {} # name:TypeInfo 195 | self.func_sigs = [] # str - list of function header signatures for signature matching 196 | self.data_locs = {} # {addr:size} - locations known to be data and their sizes 197 | 198 | #these should really come from disassembly process ??
199 | #rather than just priming the pump here 200 | self.add_type("char", 8, None, True, ATOMIC) 201 | self.add_type("short", 16, None, True, ATOMIC) 202 | self.add_type("int", 32, None, True, ATOMIC) 203 | self.add_type("BYTE", 8, None, True, ATOMIC) 204 | self.add_type("WORD", 16, None, True, ATOMIC) 205 | self.add_type("DWORD", 32, None, True, ATOMIC) 206 | self.add_type("QWORD", 64, None, True, ATOMIC) 207 | self.add_type("int8_t", 8, None, True, ATOMIC) 208 | self.add_type("int16_t", 16, None, True, ATOMIC) 209 | self.add_type("int32_t", 32, None, True, ATOMIC) 210 | self.add_type("int64_t", 64, None, True, ATOMIC) 211 | self.add_type("uint8_t", 8, None, False, ATOMIC) 212 | self.add_type("uint16_t", 16, None, False, ATOMIC) 213 | self.add_type("uint32_t", 32, None, False, ATOMIC) 214 | self.add_type("uint64_t", 64, None, False, ATOMIC) 215 | 216 | for addr in loader.imports_by_addr: 217 | self.data_locs[addr] = loader.sizeof_ptr 218 | 219 | #do the xrefs in the given list describe a function call that returns 220 | def is_returning_call(self, xrefs): 221 | if len(xrefs) != 2: 222 | return False 223 | if xrefs[1][1] == XR_CALL: 224 | tgt = xrefs[1][0] 225 | elif xrefs[0][1] == XR_CALL: 226 | tgt = xrefs[0][0] 227 | else: 228 | return False # not a call 229 | if tgt in self.names and self.names[tgt] in self.loader.non_returning_funcs: 230 | return False 231 | return (xrefs[0][1] == XR_FLOW and xrefs[1][1] == XR_CALL) or \ 232 | (xrefs[1][1] == XR_FLOW and xrefs[0][1] == XR_CALL) 233 | 234 | def add_type(self, name, size, pointer, signed, category): 235 | self.type_id += 1 236 | self.types[name] = TypeInfo(self.type_id, name, size, pointer, signed, category) 237 | 238 | def add_basic_block_start(self, addr): 239 | self.basic_block_starts.add(addr) 240 | 241 | #returns new basic block id 242 | def add_basic_block(self, addr, parent): 243 | if addr not in self.basic_block_starts: 244 | return 245 | self.bb_id += 1 246 | if addr not in self.basic_blocks: 247 | self.basic_blocks[addr] = [] 248 | bb = (self.bb_id, parent) 249 | self.basic_blocks[addr].append(bb) 250 | return bb[0] 251 | 252 | def is_bb_start(self, addr): 253 | return addr in self.basic_block_starts 254 | 255 | def get_bb_id(self, func, addr): 256 | inst = self.insts[addr] 257 | if hasattr(inst, "bb"): 258 | for block in inst.bb: 259 | if func == block.func: 260 | return block.bid 261 | ''' 262 | sys.stderr.write("Unable to get_bb_id for 0x%x in func 0x%x\n" % (addr, func)) 263 | for block in inst.bb: 264 | sys.stderr.write("(%d, %d, 0x%x), " % (block.bid, block.seq, block.func)) 265 | sys.stderr.write("\n") 266 | ''' 267 | ''' 268 | else: 269 | sys.stderr.write("0x%x has no bb attr\n" % addr) 270 | sys.stderr.write("Unable to get_bb_id for 0x%x in func 0x%x\n" % (addr, func)) 271 | ''' 272 | return -1 273 | 274 | def print_func_owners(self, addr): 275 | insn = self.insts[addr] 276 | if hasattr(insn, "bb"): 277 | for b in insn.bb: 278 | sys.stderr.write("0x%x, " % b.func) 279 | sys.stderr.write("\n") 280 | 281 | 282 | def build_cfg(self): 283 | for addr,bb in self.basic_blocks.iteritems(): 284 | if addr in self.call_targets: 285 | continue 286 | if addr in self.xrefs_to: 287 | #look at the instructions that refer to this basic block start address 288 | for xr in self.xrefs_to[addr]: 289 | src = xr[0] 290 | #add an edge for each block that the referring instruction belongs to 291 | for block in bb: 292 | src_bb = self.get_bb_id(block[1], src) 293 | if src_bb == -1: 294 | # this seems to happen when we don't have a complete
understanding 295 | # of whether a function call fails to return or not 296 | # which leads to the incorrect conclusion that the instruction 297 | # following the call is reachable 298 | ''' 299 | sys.stderr.write("0x%x refers to 0x%x but failed to find bid for 0x%x\n" % (src, addr, src)) 300 | sys.stderr.write("0x%x belongs to: " % src) 301 | self.print_func_owners(src) 302 | sys.stderr.write("0x%x belongs to: " % addr) 303 | self.print_func_owners(addr) 304 | ''' 305 | continue 306 | xr_type = CONDITION_FALSE 307 | if xr[1] == XR_FLOW: 308 | if len(self.xrefs_from[src]) == 1: 309 | xr_type = UNCONDITIONAL 310 | else: 311 | xr_type = CONDITION_FALSE 312 | elif xr[1] == XR_JCC: 313 | xr_type = CONDITION_TRUE 314 | elif xr[1] == XR_JUMP: 315 | xr_type = UNCONDITIONAL 316 | else: #should not get here 317 | continue 318 | edge = CFG_Edge(block[1], src_bb, block[0], xr_type) 319 | self.cfg.append(edge) 320 | 321 | def build_callgraph(self): 322 | for func in self.call_targets: 323 | if func in self.xrefs_to: 324 | for xr in self.xrefs_to[func]: 325 | src = xr[0] 326 | inst = self.insts[src] 327 | if hasattr(inst, "bb"): 328 | for block in inst.bb: 329 | edge = CG_Edge(block.func, block.bid, src, func) 330 | self.callgraph.append(edge) 331 | 332 | #need to traverse to figure out the parent functions for 333 | #all basic blocks. Note we have more work to do than we should 334 | #this is a result of the binnavi database schema failing to actually 335 | #set the ex_N_basic_blocks primary key to (id, parent_function) as they 336 | #claim to in 337 | #binnavi/src/main/java/com/google/security/zynamics/binnavi/manual/html/dbformat.htm 338 | #instead they only use id so we need a unique id when a block is part of more than 339 | #one function 340 | def extract_basic_block_data(self, func, addr, func_insts): 341 | bb = -1 342 | while True: 343 | if addr in func_insts: 344 | break 345 | func_insts.add(addr) 346 | if self.is_bb_start(addr): 347 | if addr == 0: 348 | print "tried to add basic block at 0 for func 0x%x" % func 349 | else: 350 | bb = self.add_basic_block(addr, func) 351 | if addr in self.xrefs_from: 352 | flows_to = -1 353 | xrefs = self.xrefs_from[addr] 354 | for xr in xrefs: 355 | if xr[1] == XR_FLOW: 356 | flows_to = xr[0] 357 | elif xr[1] == XR_CALL: 358 | continue 359 | elif xr[1] == XR_JCC: 360 | self.extract_basic_block_data(func, xr[0], func_insts) 361 | elif xr[0] in self.thunks: # must be XR_JUMP 362 | continue 363 | elif xr[0] in self.call_targets: # must be XR_JUMP to a function 364 | # this might/probably needs a callgraph edge 365 | continue 366 | elif xr[0] in self.loader.imports_by_addr: # must be XR_JUMP 367 | continue 368 | else: # XR_JUMP, perhaps switch jump ??? 
369 | self.extract_basic_block_data(func, xr[0], func_insts) 370 | if flows_to != -1: 371 | addr = flows_to 372 | else: #no normal flow from here 373 | break 374 | else: #no xrefs from here 375 | break 376 | 377 | #assumes we have all basic blocks identified, we make a second pass here 378 | #in case we need to associate a bansic block with more than one function 379 | #this is a result of the binnavi database schema failing to actually 380 | #set the ex_N_basic_blocks primary key to (id, parent_function) as they 381 | #claim to in 382 | #binnavi/src/main/java/com/google/security/zynamics/binnavi/manual/html/dbformat.htm 383 | #instead they only use id so we need a unique id when a block is part of more than 384 | #one function 385 | def set_basic_block_instructions(self): 386 | for addr,bb in self.basic_blocks.iteritems(): 387 | seq = 0 388 | while True: 389 | if addr not in self.insts: 390 | #may have reference to invalid isntruction 391 | break 392 | inst = self.insts[addr] 393 | inst.bb = [BlockInfo(b[0], seq, b[1]) for b in bb] #block may belong to more than one function 394 | seq += 1 395 | if addr in self.xrefs_from: 396 | xrefs = self.xrefs_from[addr] 397 | if self.is_returning_call(xrefs): 398 | addr = get_call_targets(xrefs)[0] 399 | elif len(xrefs) > 1: 400 | break 401 | else: # len(xrefs) == 1 402 | addr = xrefs[0][0] 403 | else: # no xrefs from so at end of block 404 | break 405 | if addr in self.basic_blocks: #hit start of different basic block 406 | break 407 | 408 | #tree is a list of OpNode 409 | def insert_tree(self, root, tree, depth, pos): 410 | n = tree[depth] 411 | n.pos = pos 412 | depth += 1 413 | arity = 0 414 | if (n.op_type % 10) == OPERATOR: 415 | #operator types are encoded as #4 where # is the arity of the operator 416 | arity = n.op_type // 10 417 | if n.op_type == SIZE_PREFIX or n.op_type == DEREFERENCE: 418 | #also descend on a SIZE_PREFIX 419 | arity = 1 420 | 421 | if n.value not in root: 422 | #new node at this level 423 | self.node_id += 1 424 | n.node_id = self.node_id 425 | root[n.value] = (n, {}) 426 | else: 427 | n.node_id = root[n.value][0].node_id 428 | self.exprs[self.expr_id].append(root[n.value][0].node_id) 429 | op_root = root[n.value][1] 430 | for i in range(arity): 431 | if i not in op_root: 432 | op_root[i] = {} 433 | root = op_root[i] #different subtrees for different operand position 434 | #parse the operands for the operator 435 | depth = self.insert_tree(root, tree, depth, i) 436 | return depth 437 | 438 | def tree_to_str(self, tree): 439 | s = '' 440 | for o in tree: 441 | s += '(%s)' % str(o.value) 442 | return s 443 | 444 | def add_expr_tree(self, tree): 445 | if len(tree) == 0: 446 | return 0 447 | s = self.tree_to_str(tree) 448 | if s in self.expr_strings: 449 | #we have seen this expression before 450 | expr_id = self.expr_strings[s] 451 | idx = 0 452 | for i in self.exprs[expr_id]: 453 | tree[idx].node_id = i 454 | idx += 1 455 | return expr_id 456 | # will be making a new expression 457 | self.expr_id += 1 458 | self.exprs[self.expr_id] = [] 459 | self.insert_tree(self.nodes, tree, 0, 0) 460 | self.expr_strings[s] = self.expr_id 461 | return self.expr_id 462 | 463 | def print_disassembly(self): 464 | keylist = [a for a in self.visited] # self.insts.keys() 465 | keylist.sort() 466 | last = None 467 | for a in keylist: 468 | i = self.insts[a] 469 | if a in self.names: 470 | print "%s:" % self.names[a] 471 | ref = '' 472 | if i.address not in self.xrefs_to: 473 | ref = "\t\t**** NOT REFERENCED ****" 474 | operand = 
self.get_op_name(i.address, i.op_str) 475 | print "\t0x%08x:\t%s%s%s" % (i.address, i.mnemonic.ljust(8), operand, ref) 476 | ''' 477 | if i.address in self.xrefs_from: 478 | xr = self.xrefs_from[i.address] 479 | sys.stdout.write('\t') 480 | for x in xr: 481 | sys.stdout.write("0x%x(%d), " % (x[0], x[1])) 482 | sys.stdout.write('\n') 483 | ''' 484 | last = i 485 | 486 | def scan_gaps(self, header): 487 | keylist = [a for a in self.visited] # self.insts.keys() 488 | keylist.sort() 489 | last = None 490 | count = 0 491 | for a in keylist: 492 | i = self.insts[a] 493 | if last is not None and (last.address + last.size) != a: 494 | gap_start = last.address + last.size 495 | gap = self.loader.get_bytes(gap_start, a - gap_start) 496 | if gap is None: 497 | print "That's odd, gap is None" 498 | continue 499 | idx = 0 500 | while True: 501 | loc = gap.find(header, idx) 502 | if loc != -1 and (loc + gap_start) not in self.visited: 503 | self.locs.append(loc + gap_start) 504 | #print "Adding gap function 0x%x" % (loc + gap_start) 505 | count += 1 506 | idx = loc + 1 507 | else: 508 | break 509 | last = i 510 | #print "Gap analysis added %d new locations" % count 511 | 512 | #Scan the data sections for possible references back to code 513 | #such as vtables, switch jumps, and other function pointers 514 | def scan_data(self): 515 | pass 516 | 517 | #Scan unanalyzed gaps in the code section for possible references 518 | #to code such as switch jumps 519 | def scan_gap_data(self): 520 | pass 521 | 522 | #subclasses should implement this as it's very platform specific 523 | def process_operands(self, inst): 524 | raise Exception("Please implement process_operands") 525 | 526 | #subclasses should implement this 527 | def process_jump(self, inst): 528 | raise Exception("Please implement process_jump") 529 | 530 | #subclasses should implement this 531 | def process_call(self, inst): 532 | raise Exception("Please implement process_jump") 533 | 534 | #subclasses should implement this 535 | def get_op_name(self, addr, default_val): 536 | raise Exception("Please implement get_op_name") 537 | 538 | def add_xref(self, frm, to, xr_type=XR_FLOW): 539 | raise Exception("Please implement add_xref") 540 | 541 | def nextinst(self, addr): 542 | #take enough to get at least 1 instruction in majority case 543 | if addr in self.insts: 544 | # previously decoded this with capstone 545 | return self.insts[addr] 546 | # grab a block of bytes following the current address 547 | mc = self.loader.get_bytes(addr, 256) 548 | if mc is None or len(mc) == 0: 549 | return None 550 | for i in self.dis.disasm(mc, addr): 551 | self.insts[i.address] = i 552 | if addr in self.insts: 553 | return self.insts[addr] 554 | return None 555 | 556 | def is_possible_code(self, addr): 557 | if addr in self.data_locs: 558 | return False 559 | for s in self.loader.sections: 560 | if (s.perms & loader.PROT_EXEC) and s.contains(addr): 561 | return True 562 | return False 563 | 564 | def generate_disassembly(self): 565 | while len(self.locs) > 0: 566 | addr = self.locs.pop(0) 567 | if not self.is_possible_code(addr): 568 | continue 569 | dead_end = False 570 | while True: 571 | i = self.nextinst(addr) 572 | if i is None: 573 | # but we should have gotten an instruction so this is odd 574 | # remove all xrefs to this address 575 | if addr in self.xrefs_to: 576 | srcs = self.xrefs_to[addr] 577 | for s in srcs: 578 | if s[0] in self.xrefs_from: 579 | dests = self.xrefs_from[s[0]] 580 | for tgt in dests: 581 | if tgt[0] == addr: 582 | dests.remove(tgt) 583 | 
break 584 | if len(dests) == 0: 585 | self.xrefs_from.pop(s[0]) 586 | self.xrefs_to.pop(addr, None) 587 | break 588 | if i.address in self.visited: 589 | #already been here, won't learn anything new 590 | break 591 | self.visited.add(i.address) 592 | self.insts[i.address] = i 593 | self.process_operands(i) 594 | 595 | dead_end = False 596 | if i.group(capstone.CS_GRP_JUMP): 597 | dead_end = self.process_jump(i) 598 | elif i.group(capstone.CS_GRP_CALL): 599 | dead_end = self.process_call(i) 600 | elif i.group(capstone.CS_GRP_RET): 601 | dead_end = True 602 | elif i.group(capstone.CS_GRP_IRET): 603 | dead_end = True 604 | if not dead_end: 605 | next_addr = i.address + i.size 606 | self.add_xref(i.address, next_addr) 607 | else: 608 | #dead end return to instruction list 609 | break 610 | 611 | def generate_data(self): 612 | self.generate_disassembly() 613 | 614 | print "After first pass, have %d insts" % len(self.visited) 615 | 616 | main = self.loader.find_main(self.insts, self.xrefs_to, self.xrefs_from) 617 | if main is not None and main not in self.visited: 618 | print "Found main at 0x%x" % main 619 | self.locs.append(main) 620 | self.call_targets.add(main) 621 | self.add_basic_block_start(main) 622 | if "main" not in self.names: 623 | self.names[main] = "main" 624 | elif "_main" not in self.names: 625 | self.names[main] = "_main" 626 | else: 627 | self.names[main] = "sub_%x" % main 628 | self.generate_disassembly() 629 | 630 | print "After 'find_main' pass, have %d insts" % len(self.visited) 631 | 632 | #pick up pointers in the rdata section 633 | # self.scan_data() 634 | # self.generate_disassembly() 635 | 636 | # for sig in self.func_sigs: 637 | #try to find more code by looking for standard prologue 638 | # self.scan_gaps(sig) 639 | # self.generate_disassembly() 640 | 641 | #pick up pointers in the text section 642 | # self.scan_gap_data() 643 | # self.generate_disassembly() 644 | 645 | for f in self.call_targets: 646 | self.extract_basic_block_data(f, f, set()) 647 | self.set_basic_block_instructions() 648 | self.build_cfg() 649 | self.build_callgraph() 650 | for addr,bb in self.basic_blocks.iteritems(): 651 | if len(bb) == 0: 652 | print "no parent found for basic block at 0x%x" % addr 653 | for addr in self.visited: 654 | i = self.insts[addr] 655 | if not hasattr(i, "bb"): 656 | print "Instruction 0x%x has no bb" % addr 657 | -------------------------------------------------------------------------------- /elf_loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | Crude ELF loader, conforming to the Loader interface, for a stand-alone binnavi compatible disassembler 5 | ''' 6 | 7 | __author__ = "Chris Eagle" 8 | __copyright__ = "Copyright 2015, Chris Eagle" 9 | __credits__ = ["Chris Eagle"] 10 | __license__ = "GPL" 11 | __version__ = "2.0" 12 | __maintainer__ = "Chris Eagle" 13 | __email__ = "cseagle@gmail.com" 14 | __status__ = "Use at your own risk" 15 | 16 | import sys 17 | import struct 18 | import hashlib 19 | import binascii 20 | import capstone 21 | from loader import * 22 | 23 | XR_FLOW = 1 24 | XR_CALL = 2 25 | XR_JUMP = 3 26 | XR_JCC = 4 27 | 28 | EI_CLASS = 4 # File class byte index 29 | ELFCLASSNONE = 0 # Invalid class 30 | ELFCLASS32 = 1 # 32-bit objects 31 | ELFCLASS64 = 2 # 64-bit objects 32 | ELFCLASSNUM = 3 33 | 34 | EI_DATA = 5 # Data encoding byte index 35 | ELFDATANONE = 0 # Invalid data encoding 36 | ELFDATA2LSB = 1 # 2's complement, little endian 37 | ELFDATA2MSB = 2 # 2's 
complement, big endian 38 | ELFDATANUM = 3 39 | 40 | EI_VERSION = 6 # File version byte index 41 | # Value must be EV_CURRENT 42 | 43 | EI_OSABI = 7 # OS ABI identification 44 | ELFOSABI_NONE = 0 # UNIX System V ABI 45 | ELFOSABI_SYSV = 0 # Alias. 46 | ELFOSABI_HPUX = 1 # HP-UX 47 | ELFOSABI_NETBSD = 2 # NetBSD. 48 | ELFOSABI_GNU = 3 # Object uses GNU ELF extensions. 49 | ELFOSABI_LINUX = ELFOSABI_GNU # Compatibility alias. 50 | ELFOSABI_SOLARIS = 6 # Sun Solaris. 51 | ELFOSABI_AIX = 7 # IBM AIX. 52 | ELFOSABI_IRIX = 8 # SGI Irix. 53 | ELFOSABI_FREEBSD = 9 # FreeBSD. 54 | ELFOSABI_TRU64 = 10 # Compaq TRU64 UNIX. 55 | ELFOSABI_MODESTO = 11 # Novell Modesto. 56 | ELFOSABI_OPENBSD = 12 # OpenBSD. 57 | ELFOSABI_ARM_AEABI = 64 # ARM EABI 58 | ELFOSABI_ARM = 97 # ARM 59 | ELFOSABI_STANDALONE = 255 # Standalone (embedded) application 60 | 61 | EI_ABIVERSION = 8 # ABI version 62 | 63 | EI_PAD = 9 # Byte index of padding bytes 64 | 65 | # Legal values for e_type (object file type). 66 | 67 | ET_NONE = 0 # No file type 68 | ET_REL = 1 # Relocatable file 69 | ET_EXEC = 2 # Executable file 70 | ET_DYN = 3 # Shared object file 71 | ET_CORE = 4 # Core file 72 | ET_NUM = 5 # Number of defined types 73 | ET_LOOS = 0xfe00 # OS-specific range start 74 | ET_HIOS = 0xfeff # OS-specific range end 75 | ET_LOPROC = 0xff00 # Processor-specific range start 76 | ET_HIPROC = 0xffff # Processor-specific range end 77 | 78 | 79 | EM_NONE = 0 # No machine 80 | EM_SPARC = 2 # SUN SPARC 81 | EM_386 = 3 # Intel 80386 82 | EM_68K = 4 # Motorola m68k family 83 | EM_MIPS = 8 # MIPS R3000 big-endian 84 | EM_MIPS_RS3_LE = 10 # MIPS R3000 little-endian 85 | 86 | EM_PPC = 20 # PowerPC 87 | EM_PPC64 = 21 # PowerPC 64-bit 88 | 89 | EM_ARM = 40 # ARM 90 | EM_SPARCV9 = 43 # SPARC v9 64-bit 91 | 92 | EM_X86_64 = 62 # AMD x86-64 architecture 93 | 94 | EM_AARCH64 = 183 # ARM AARCH64 95 | 96 | # Legal values for p_type (segment type). 97 | 98 | PT_NULL = 0 # Program header table entry unused 99 | PT_LOAD = 1 # Loadable program segment 100 | PT_DYNAMIC = 2 # Dynamic linking information 101 | PT_INTERP = 3 # Program interpreter 102 | PT_NOTE = 4 # Auxiliary information 103 | PT_SHLIB = 5 # Reserved 104 | PT_PHDR = 6 # Entry for header table itself 105 | PT_TLS = 7 # Thread-local storage segment 106 | PT_NUM = 8 # Number of defined types 107 | PT_LOOS = 0x60000000 # Start of OS-specific 108 | PT_GNU_EH_FRAME = 0x6474e550 # GCC .eh_frame_hdr segment 109 | PT_GNU_STACK = 0x6474e551 # Indicates stack executability 110 | PT_GNU_RELRO = 0x6474e552 # Read-only after relocation 111 | PT_LOSUNW = 0x6ffffffa 112 | PT_SUNWBSS = 0x6ffffffa # Sun Specific segment 113 | PT_SUNWSTACK = 0x6ffffffb # Stack segment 114 | PT_HISUNW = 0x6fffffff 115 | PT_HIOS = 0x6fffffff # End of OS-specific 116 | PT_LOPROC = 0x70000000 # Start of processor-specific 117 | PT_HIPROC = 0x7fffffff # End of processor-specific 118 | 119 | # Legal values for e_version (version). 120 | 121 | EV_NONE = 0 # Invalid ELF version 122 | EV_CURRENT = 1 # Current version 123 | EV_NUM = 2 124 | 125 | # Legal values for p_flags (segment flags). 126 | 127 | PF_X = (1 << 0) # Segment is executable 128 | PF_W = (1 << 1) # Segment is writable 129 | PF_R = (1 << 2) # Segment is readable 130 | PF_MASKOS = 0x0ff00000 # OS-specific 131 | PF_MASKPROC = 0xf0000000 # Processor-specific 132 | 133 | # Legal values for sh_type (section type). 
134 | 135 | SHT_NULL = 0 # Section header table entry unused 136 | SHT_PROGBITS = 1 # Program data 137 | SHT_SYMTAB = 2 # Symbol table 138 | SHT_STRTAB = 3 # String table 139 | SHT_RELA = 4 # Relocation entries with addends 140 | SHT_HASH = 5 # Symbol hash table 141 | SHT_DYNAMIC = 6 # Dynamic linking information 142 | SHT_NOTE = 7 # Notes 143 | SHT_NOBITS = 8 # Program space with no data (bss) 144 | SHT_REL = 9 # Relocation entries, no addends 145 | SHT_SHLIB = 10 # Reserved 146 | SHT_DYNSYM = 11 # Dynamic linker symbol table 147 | SHT_INIT_ARRAY = 14 # Array of constructors 148 | SHT_FINI_ARRAY = 15 # Array of destructors 149 | SHT_PREINIT_ARRAY = 16 # Array of pre-constructors 150 | SHT_GROUP = 17 # Section group 151 | SHT_SYMTAB_SHNDX = 18 # Extended section indeces 152 | SHT_NUM = 19 # Number of defined types. 153 | SHT_LOOS = 0x60000000 # Start OS-specific. 154 | SHT_GNU_ATTRIBUTES = 0x6ffffff5 # Object attributes. 155 | SHT_GNU_HASH = 0x6ffffff6 # GNU-style hash table. 156 | SHT_GNU_LIBLIST = 0x6ffffff7 # Prelink library list 157 | SHT_CHECKSUM = 0x6ffffff8 # Checksum for DSO content. 158 | SHT_LOSUNW = 0x6ffffffa # Sun-specific low bound. 159 | SHT_SUNW_move = 0x6ffffffa 160 | SHT_SUNW_COMDAT = 0x6ffffffb 161 | SHT_SUNW_syminfo = 0x6ffffffc 162 | SHT_GNU_verdef = 0x6ffffffd # Version definition section. 163 | SHT_GNU_verneed = 0x6ffffffe # Version needs section. 164 | SHT_GNU_versym = 0x6fffffff # Version symbol table. 165 | SHT_HISUNW = 0x6fffffff # Sun-specific high bound. 166 | SHT_HIOS = 0x6fffffff # End OS-specific type 167 | SHT_LOPROC = 0x70000000 # Start of processor-specific 168 | SHT_HIPROC = 0x7fffffff # End of processor-specific 169 | SHT_LOUSER = 0x80000000 # Start of application-specific 170 | SHT_HIUSER = 0x8fffffff # End of application-specific 171 | 172 | # Legal values for sh_flags (section flags). 173 | 174 | SHF_WRITE = (1 << 0) # Writable 175 | SHF_ALLOC = (1 << 1) # Occupies memory during execution 176 | SHF_EXECINSTR = (1 << 2) # Executable 177 | SHF_MERGE = (1 << 4) # Might be merged 178 | SHF_STRINGS = (1 << 5) # Contains nul-terminated strings 179 | SHF_INFO_LINK = (1 << 6) # `sh_info' contains SHT index 180 | SHF_LINK_ORDER = (1 << 7) # Preserve order after combining 181 | SHF_OS_NONCONFORMING = (1 << 8) # Non-standard OS specific handling required 182 | SHF_GROUP = (1 << 9) # Section is member of a group. 183 | SHF_TLS = (1 << 10) # Section hold thread-local data. 184 | SHF_MASKOS = 0x0ff00000 # OS-specific. 185 | SHF_MASKPROC = 0xf0000000 # Processor-specific 186 | SHF_ORDERED = (1 << 30) # Special ordering requirement (Solaris). 187 | SHF_EXCLUDE = (1 << 31) # Section is excluded unless referenced or allocated (Solaris). 188 | 189 | # Legal values for ST_TYPE subfield of st_info (symbol type). 190 | 191 | STT_NOTYPE = 0 # Symbol type is unspecified 192 | STT_OBJECT = 1 # Symbol is a data object 193 | STT_FUNC = 2 # Symbol is a code object 194 | STT_SECTION = 3 # Symbol associated with a section 195 | STT_FILE = 4 # Symbol's name is file name 196 | STT_COMMON = 5 # Symbol is a common data object 197 | STT_TLS = 6 # Symbol is thread-local data object 198 | STT_NUM = 7 # Number of defined types. 199 | STT_LOOS = 10 # Start of OS-specific 200 | STT_GNU_IFUNC = 10 # Symbol is indirect code object 201 | STT_HIOS = 12 # End of OS-specific 202 | STT_LOPROC = 13 # Start of processor-specific 203 | STT_HIPROC = 15 # End of processor-specific 204 | 205 | # Legal values for d_tag (dynamic entry type). 
206 | 207 | DT_NULL = 0 # Marks end of dynamic section 208 | DT_NEEDED = 1 # Name of needed library 209 | DT_PLTRELSZ = 2 # Size in bytes of PLT relocs 210 | DT_PLTGOT = 3 # Processor defined value 211 | DT_HASH = 4 # Address of symbol hash table 212 | DT_STRTAB = 5 # Address of string table 213 | DT_SYMTAB = 6 # Address of symbol table 214 | DT_RELA = 7 # Address of Rela relocs 215 | DT_RELASZ = 8 # Total size of Rela relocs 216 | DT_RELAENT = 9 # Size of one Rela reloc 217 | DT_STRSZ = 10 # Size of string table 218 | DT_SYMENT = 11 # Size of one symbol table entry 219 | DT_INIT = 12 # Address of init function 220 | DT_FINI = 13 # Address of termination function 221 | DT_SONAME = 14 # Name of shared object 222 | DT_RPATH = 15 # Library search path (deprecated) 223 | DT_SYMBOLIC = 16 # Start symbol search here 224 | DT_REL = 17 # Address of Rel relocs 225 | DT_RELSZ = 18 # Total size of Rel relocs 226 | DT_RELENT = 19 # Size of one Rel reloc 227 | DT_PLTREL = 20 # Type of reloc in PLT 228 | DT_DEBUG = 21 # For debugging; unspecified 229 | DT_TEXTREL = 22 # Reloc might modify .text 230 | DT_JMPREL = 23 # Address of PLT relocs 231 | DT_BIND_NOW = 24 # Process relocations of object 232 | DT_INIT_ARRAY = 25 # Array with addresses of init fct 233 | DT_FINI_ARRAY = 26 # Array with addresses of fini fct 234 | DT_INIT_ARRAYSZ = 27 # Size in bytes of DT_INIT_ARRAY 235 | DT_FINI_ARRAYSZ = 28 # Size in bytes of DT_FINI_ARRAY 236 | DT_RUNPATH = 29 # Library search path 237 | DT_FLAGS = 30 # Flags for the object being loaded 238 | DT_ENCODING = 32 # Start of encoded range 239 | DT_PREINIT_ARRAY = 32 # Array with addresses of preinit fct 240 | DT_PREINIT_ARRAYSZ = 33 # size in bytes of DT_PREINIT_ARRAY 241 | DT_NUM = 34 # Number used 242 | DT_LOOS = 0x6000000d # Start of OS-specific 243 | DT_HIOS = 0x6ffff000 # End of OS-specific 244 | DT_LOPROC = 0x70000000 # Start of processor-specific 245 | DT_HIPROC = 0x7fffffff # End of processor-specific 246 | #DT_PROCNUM = DT_MIPS_NUM # Most used by any processor 247 | 248 | # DT_* entries which fall between DT_VALRNGHI & DT_VALRNGLO use the 249 | # Dyn.d_un.d_val field of the Elf*_Dyn structure. This follows Sun's 250 | # approach. 251 | DT_VALRNGLO = 0x6ffffd00 252 | DT_GNU_PRELINKED = 0x6ffffdf5 # Prelinking timestamp 253 | DT_GNU_CONFLICTSZ = 0x6ffffdf6 # Size of conflict section 254 | DT_GNU_LIBLISTSZ = 0x6ffffdf7 # Size of library list 255 | DT_CHECKSUM = 0x6ffffdf8 256 | DT_PLTPADSZ = 0x6ffffdf9 257 | DT_MOVEENT = 0x6ffffdfa 258 | DT_MOVESZ = 0x6ffffdfb 259 | DT_FEATURE_1 = 0x6ffffdfc # Feature selection (DTF_*). 260 | DT_POSFLAG_1 = 0x6ffffdfd # Flags for DT_* entries, effecting the following DT_* entry. 261 | DT_SYMINSZ = 0x6ffffdfe # Size of syminfo table (in bytes) 262 | DT_SYMINENT = 0x6ffffdff # Entry size of syminfo 263 | DT_VALRNGHI = 0x6ffffdff 264 | #DT_VALTAGIDX(tag) (DT_VALRNGHI - (tag)) # Reverse order! 265 | DT_VALNUM = 12 266 | 267 | # DT_* entries which fall between DT_ADDRRNGHI & DT_ADDRRNGLO use the 268 | # Dyn.d_un.d_ptr field of the Elf*_Dyn structure. 269 | 270 | # If any adjustment is made to the ELF object after it has been 271 | # built these entries will need to be adjusted. 272 | DT_ADDRRNGLO = 0x6ffffe00 273 | DT_GNU_HASH = 0x6ffffef5 # GNU-style hash table. 274 | DT_TLSDESC_PLT = 0x6ffffef6 275 | DT_TLSDESC_GOT = 0x6ffffef7 276 | DT_GNU_CONFLICT = 0x6ffffef8 # Start of conflict section 277 | DT_GNU_LIBLIST = 0x6ffffef9 # Library list 278 | DT_CONFIG = 0x6ffffefa # Configuration information. 
279 | DT_DEPAUDIT = 0x6ffffefb # Dependency auditing. 280 | DT_AUDIT = 0x6ffffefc # Object auditing. 281 | DT_PLTPAD = 0x6ffffefd # PLT padding. 282 | DT_MOVETAB = 0x6ffffefe # Move table. 283 | DT_SYMINFO = 0x6ffffeff # Syminfo table. 284 | DT_ADDRRNGHI = 0x6ffffeff 285 | #DT_ADDRTAGIDX(tag) (DT_ADDRRNGHI - (tag)) # Reverse order! 286 | DT_ADDRNUM = 11 287 | 288 | # The versioning entry types. The next are defined as part of the GNU extension. 289 | DT_VERSYM = 0x6ffffff0 290 | 291 | DT_RELACOUNT = 0x6ffffff9 292 | DT_RELCOUNT = 0x6ffffffa 293 | 294 | # These were chosen by Sun. 295 | DT_FLAGS_1 = 0x6ffffffb # State flags, see DF_1_* below. 296 | DT_VERDEF = 0x6ffffffc # Address of version definition table 297 | DT_VERDEFNUM = 0x6ffffffd # Number of version definitions 298 | DT_VERNEED = 0x6ffffffe # Address of table with needed versions 299 | DT_VERNEEDNUM = 0x6fffffff # Number of needed versions 300 | #DT_VERSIONTAGIDX(tag) (DT_VERNEEDNUM - (tag)) # Reverse order! 301 | DT_VERSIONTAGNUM = 16 302 | 303 | # Sun added these machine-independent extensions in the "processor-specific" range. Be compatible. 304 | DT_AUXILIARY = 0x7ffffffd # Shared object to load before self 305 | DT_FILTER = 0x7fffffff # Shared object to get values from 306 | #DT_EXTRATAGIDX(tag) ((Elf32_Word)-((Elf32_Sword) (tag) <<1>>1)-1) 307 | DT_EXTRANUM = 3 308 | 309 | class InvalidHeader(Exception): 310 | def __init__(self, msg): 311 | Exception.__init__(self, msg) 312 | 313 | class ElfSectionHeader(object): 314 | # do our best to handle both Elf32_Shdr and Elf64_Shdr 315 | def __init__(self, elf, offset): 316 | try: 317 | self.elf = elf 318 | self.raw = elf.raw[offset:offset+elf.e_shentsize] 319 | if elf.sizeof_ptr == 8: 320 | fields = struct.unpack(elf.endian + "IIQQQQIIQQ", self.raw) 321 | else: 322 | fields = struct.unpack(elf.endian + "IIIIIIIIII", self.raw) 323 | self.sh_name = fields[0] 324 | self.sh_type = fields[1] 325 | self.sh_flags = fields[2] 326 | self.sh_addr = fields[3] 327 | self.sh_offset = fields[4] 328 | self.sh_size = fields[5] 329 | self.sh_link = fields[6] 330 | self.sh_info = fields[7] 331 | self.sh_addralign = fields[8] 332 | self.sh_entsize = fields[9] 333 | 334 | self.perms = PROT_READ 335 | 336 | if self.sh_type == SHT_NOBITS: 337 | size = 0 338 | else: 339 | size = self.sh_size 340 | 341 | if self.sh_flags & SHF_WRITE: 342 | self.perms |= PROT_WRITE 343 | if self.sh_flags & SHF_EXECINSTR: 344 | self.perms |= PROT_EXEC 345 | 346 | self.content = elf.raw 347 | 348 | except: 349 | raise InvalidHeader("Invalid section header") 350 | 351 | def __del__(self): 352 | del self.raw 353 | 354 | def get_string(self, offset): 355 | #if this isn't a STRTAB section we should probably throw an exception 356 | res = '' 357 | while offset < self.sh_size: 358 | ch = self.content[self.sh_offset + offset] 359 | if ch == '\x00': 360 | break 361 | res += ch 362 | offset += 1 363 | return res 364 | 365 | def get_symbol(self, offset): 366 | #if this isn't a SYMTAB section we should probably throw an exception 367 | strtab = self.elf.shdrs[self.sh_link] 368 | sym_start = self.sh_offset + offset 369 | st_name = struct.unpack(self.elf.endian + "I", self.content[sym_start:sym_start + 4])[0] 370 | if self.elf.sizeof_ptr == 4: 371 | idx = 2 372 | fields = struct.unpack(self.elf.endian + "IIBBH", self.content[sym_start + 4:sym_start + 16]) 373 | else: 374 | idx = 0 375 | fields = struct.unpack(self.elf.endian + "BBHQQ", self.content[sym_start + 4:sym_start + 24]) 376 | st_info = fields[idx] 377 | st_other = fields[idx + 1] 
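# Note on the two unpack formats above: after st_name, Elf32_Sym lays out
# (st_value, st_size, st_info, st_other, st_shndx) while Elf64_Sym lays out
# (st_info, st_other, st_shndx, st_value, st_size). Picking idx = 2 for the
# 32-bit case and idx = 0 for the 64-bit case lines up st_info/st_other/st_shndx,
# and the % 5 wrap in the two lines below recovers st_value and st_size from the
# front of the 32-bit tuple.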
378 | st_shndx = fields[idx + 2] 379 | st_value = fields[(idx + 3) % 5] 380 | st_size = fields[(idx + 4) % 5] 381 | name = strtab.get_string(st_name) 382 | #print "Symbol name: %s" % name 383 | return ElfSymbol(name, st_value, st_size, st_info, st_other, st_shndx) 384 | 385 | class ElfProgramHeader(object): 386 | # do our best to handle both Elf32_Phdr and Elf64_Phdr 387 | def __init__(self, elf, offset): 388 | try: 389 | self.raw = elf.raw[offset:offset+elf.e_phentsize] 390 | i = elf.sizeof_ptr >> 2 391 | if elf.sizeof_ptr == 8: 392 | fields = struct.unpack(elf.endian + "IIQQQQQQ", self.raw) 393 | self.p_flags = fields[1] 394 | else: 395 | fields = struct.unpack(elf.endian + "IIIIIIII", self.raw) 396 | self.p_flags = fields[6] 397 | 398 | self.p_type = fields[0] 399 | self.p_offset = fields[i] 400 | self.p_vaddr = fields[i + 1] 401 | self.p_paddr = fields[i + 2] 402 | self.p_filesz = fields[i + 3] 403 | self.p_memsz = fields[i + 4] 404 | self.p_align = fields[7] 405 | 406 | self.perms = 0 407 | if self.p_flags & PF_R: 408 | self.perms |= PROT_READ 409 | if self.p_flags & PF_W: 410 | self.perms |= PROT_WRITE 411 | if self.p_flags & PF_X: 412 | self.perms |= PROT_EXEC 413 | 414 | if self.p_type == PT_DYNAMIC: 415 | self.dyns = {} 416 | dyn_size = 2 * elf.sizeof_ptr 417 | num_dyns = self.p_filesz // dyn_size 418 | for i in range(num_dyns): 419 | d_tag = elf.get_pointer(self.p_vaddr + i * dyn_size) 420 | d_un = elf.get_pointer(self.p_vaddr + i * dyn_size + elf.sizeof_ptr) 421 | if d_tag == DT_NEEDED: 422 | if d_tag not in self.dyns: 423 | self.dyns[d_tag] = [] 424 | self.dyns[d_tag].append(d_un) 425 | elif d_tag == DT_NULL: 426 | break 427 | elif d_tag == DT_STRTAB: 428 | if elf.symbol_strtab != 0: 429 | #print "Existing strtab: 0x%x" % elf.symbol_strtab 430 | #print "DT_STRTAB: 0x%x" % d_un 431 | pass 432 | elf.symbol_strtab = d_un 433 | else: 434 | if d_tag in self.dyns: 435 | print "Unexpected duplicate of d_tag %d" % d_tag 436 | self.dyns[d_tag] = d_un 437 | except: 438 | raise InvalidHeader("Invalid program header") 439 | 440 | def __del__(self): 441 | del self.raw 442 | 443 | class ElfSymbol(object): 444 | 445 | def __init__(self, name, value, size, info, other, shndx): 446 | self.name = name 447 | self.value = value 448 | self.size = size 449 | self.info = info 450 | self.other = other 451 | self.shndx = shndx 452 | self.bind = (info >> 4) & 0xf 453 | self.type = info & 0xf 454 | 455 | def __del__(self): 456 | del self.name 457 | 458 | class ElfBase(Loader): 459 | 460 | def __init__(self, elf_file): 461 | Loader.__init__(self, elf_file) 462 | 463 | self.pe_offset = 0 464 | self.shdrs = [] 465 | self.phdrs = [] 466 | self.symbols = [] 467 | 468 | #need algorithm to propogate this attribute to callers when possible 469 | self.non_returning_funcs.append("abort") 470 | self.non_returning_funcs.append("err") 471 | self.non_returning_funcs.append("errx") 472 | self.non_returning_funcs.append("exit") 473 | self.non_returning_funcs.append("_exit") 474 | self.non_returning_funcs.append("__assert_fail") 475 | self.non_returning_funcs.append("pthread_exit") 476 | self.non_returning_funcs.append("verr") 477 | self.non_returning_funcs.append("verrx") 478 | 479 | def __del__(self): 480 | del self.shdrs[:] 481 | del self.shdrs 482 | del self.phdrs[:] 483 | del self.phdrs 484 | del self.symbols[:] 485 | del self.symbols 486 | Loader.__del__(self) 487 | 488 | # Perform common ELF validation tasks 489 | def is_valid(self): 490 | if self.raw[0:4] != '\x7fELF': 491 | return False 492 | 493 | if 
ord(self.raw[EI_VERSION]) != EV_CURRENT: 494 | return False 495 | 496 | if ord(self.raw[EI_CLASS]) != ELFCLASS32 and ord(self.raw[EI_CLASS]) != ELFCLASS64: 497 | return False 498 | 499 | if ord(self.raw[EI_DATA]) != ELFDATA2MSB and ord(self.raw[EI_DATA]) != ELFDATA2LSB: 500 | return False 501 | 502 | if ord(self.raw[EI_DATA]) == ELFDATA2MSB: 503 | self.set_endianness(BIG_ENDIAN) 504 | 505 | self.e_type = self.get_word(16) 506 | 507 | if self.e_type < ET_REL or self.e_type > ET_CORE: 508 | return False 509 | 510 | self.e_machine = self.get_word(18) 511 | 512 | if self.e_machine == EM_386: 513 | self.arch = capstone.CS_ARCH_X86 514 | self.mode = capstone.CS_MODE_32 515 | self.arch_name = 'x86-32' 516 | elif self.e_machine == EM_X86_64: 517 | self.arch = capstone.CS_ARCH_X86 518 | self.mode = capstone.CS_MODE_64 519 | self.arch_name = 'x86-64' 520 | elif self.e_machine == EM_ARM: 521 | self.arch = capstone.CS_ARCH_ARM 522 | self.mode = capstone.CS_MODE_ARM 523 | self.arch_name = 'ARM' 524 | elif self.e_machine == EM_AARCH64: 525 | self.arch = capstone.CS_ARCH_ARM64 526 | self.mode = capstone.CS_MODE_ARM 527 | self.arch_name = 'AARCH64' 528 | elif self.e_machine == EM_PPC: 529 | self.arch = capstone.CS_ARCH_PPC 530 | self.mode = capstone.CS_MODE_32 531 | self.arch_name = 'PPC' 532 | elif self.e_machine == EM_PPC64: 533 | self.arch = capstone.CS_ARCH_PPC 534 | self.mode = capstone.CS_MODE_64 535 | self.arch_name = 'PPC-64' 536 | elif self.e_machine == EM_SPARC: 537 | self.arch = capstone.CS_ARCH_SPARC 538 | self.mode = capstone.CS_MODE_32 539 | self.arch_name = 'SPARC' 540 | elif self.e_machine == EM_MIPS: 541 | self.arch = capstone.CS_ARCH_MIPS 542 | if self.sizeof_ptr == 4: 543 | self.mode = capstone.CS_MODE_MIPS32 544 | self.arch_name = 'MIPS32' 545 | elif self.sizeof_ptr == 8: 546 | self.mode = capstone.CS_MODE_MIPS64 547 | self.arch_name = 'MIPS64' 548 | else: 549 | # anything else, we don't recognize 550 | # could move this check into the caller 551 | # to allow it to determine whether it has an appropriate 552 | # disassembler 553 | return False 554 | 555 | if self.endian == BIG_ENDIAN: 556 | self.mode |= capstone.CS_MODE_BIG_ENDIAN 557 | 558 | self.e_version = self.get_dword(20) 559 | self.e_entry = self.get_pointer(24) 560 | self.e_phoff = self.get_pointer(24 + self.sizeof_ptr) 561 | self.e_shoff = self.get_pointer(24 + self.sizeof_ptr * 2) 562 | self.e_flags = self.get_dword(24 + self.sizeof_ptr * 3) 563 | fields_offset = 28 + self.sizeof_ptr * 3 564 | fields = [] 565 | for i in range(6): 566 | # could do all this with struct.unpack, would need to ensure 567 | # we honor endian-ness in the format string that is used 568 | fields.append(self.get_word(fields_offset + i * 2)) 569 | self.e_ehsize = fields[0] 570 | self.e_phentsize = fields[1] 571 | self.e_phnum = fields[2] 572 | self.e_shentsize = fields[3] 573 | self.e_shnum = fields[4] 574 | self.e_shstrndx = fields[5] 575 | 576 | self.symbol_strtab = 0 577 | 578 | # some sanity checks 579 | 580 | # check e_ehsize 581 | if self.e_ehsize != (40 + 3 * self.sizeof_ptr): 582 | return False 583 | 584 | if self.e_shstrndx >= self.e_shnum: 585 | return False 586 | 587 | # check e_shentsize 588 | if self.e_shentsize != (16 + 6 * self.sizeof_ptr): 589 | return False 590 | 591 | # check e_phentsize 592 | if self.e_phentsize != (8 + 6 * self.sizeof_ptr): 593 | return False 594 | 595 | # Check that there is room for the phdr table 596 | if self.e_phoff > (len(self.raw) - self.e_phentsize * self.e_phnum): 597 | return False 598 | 599 | # Check that 
there is room for the shdr table 600 | if self.e_shoff > (len(self.raw) - self.e_shentsize * self.e_shnum): 601 | return False 602 | 603 | # many other checks we could perform 604 | return True 605 | 606 | def resolve_sym(self, symidx, addr): 607 | if symidx < len(self.symbols): 608 | sym = self.symbols[symidx] 609 | #print "Resolving symbol: %s" % sym.name 610 | self.add_symbol(addr, sym.name) 611 | if sym.type == STT_FUNC: 612 | self.add_import(addr, sym.name) 613 | 614 | def parse_rel(self, addr, size): 615 | if self.sizeof_ptr == 4: 616 | mask = 0xff 617 | shift = 8 618 | else: 619 | mask = 0xffffffff 620 | shift = 32 621 | relsz = 2 * self.sizeof_ptr 622 | num_rels = size // relsz 623 | for i in range(num_rels): 624 | r_offset = self.get_pointer(addr + i * relsz) 625 | r_info = self.get_pointer(addr + i * relsz + self.sizeof_ptr) 626 | r_sym = r_info >> shift 627 | r_type = r_info & mask 628 | #print "REL r_offset 0x%x" % r_offset 629 | self.resolve_sym(r_sym, r_offset) 630 | 631 | def parse_rela(self, addr, size): 632 | if self.sizeof_ptr == 4: 633 | mask = 0xff 634 | shift = 8 635 | else: 636 | mask = 0xffffffff 637 | shift = 32 638 | relsz = 3 * self.sizeof_ptr 639 | num_rels = size // relsz 640 | for i in range(num_rels): 641 | r_offset = self.get_pointer(addr + i * relsz) 642 | r_info = self.get_pointer(addr + i * relsz + self.sizeof_ptr) 643 | r_addend = self.get_pointer(addr + i * relsz + 2 * self.sizeof_ptr) 644 | r_sym = r_info >> shift 645 | r_type = r_info & mask 646 | #print "RELA r_offset 0x%x" % r_offset 647 | self.resolve_sym(r_sym, r_offset) 648 | 649 | def parse_imports(self): 650 | if self.dyn_hdr is None: 651 | return 652 | jmprel = None 653 | pltgot = None 654 | if DT_JMPREL in self.dyn_hdr.dyns: 655 | jmprel = self.dyn_hdr.dyns[DT_JMPREL] 656 | pltrelsz = self.dyn_hdr.dyns[DT_PLTRELSZ] 657 | pltrel = self.dyn_hdr.dyns[DT_PLTREL] 658 | if DT_PLTGOT in self.dyn_hdr.dyns: 659 | pltgot = self.dyn_hdr.dyns[DT_PLTGOT] 660 | 661 | if jmprel is not None: 662 | if pltrel == DT_REL: 663 | self.parse_rel(jmprel, pltrelsz) 664 | elif pltrel == DT_RELA: 665 | self.parse_rela(jmprel, pltrelsz) 666 | else: 667 | print "UNEXPECTED PLTREL value: %d" % pltrel 668 | 669 | def parse_symbols(self): 670 | symsz = 8 + 2 * self.sizeof_ptr 671 | for s in self.shdrs: 672 | if s.sh_type == SHT_SYMTAB or s.sh_type == SHT_DYNSYM: 673 | num_syms = s.sh_size // symsz 674 | #print "Section %s has %d symbols" % (s.name, num_syms) 675 | for i in range(num_syms): 676 | sym = s.get_symbol(i * symsz) 677 | self.symbols.append(sym) 678 | #if sym.type == STT_FUNC: 679 | #print "Function symbol %s at address 0x%x" % (name, st_value) 680 | 681 | def parse_exports(self): 682 | self.add_export(self.start, "_start") 683 | # add DT_INIT == init_proc and DT_FINI == term_proc 684 | if self.dyn_hdr is not None: 685 | if DT_INIT in self.dyn_hdr.dyns: 686 | self.add_export(self.dyn_hdr.dyns[DT_INIT], ".init_proc") 687 | if DT_FINI in self.dyn_hdr.dyns: 688 | self.add_export(self.dyn_hdr.dyns[DT_FINI], ".term_proc") 689 | for sym in self.symbols: 690 | if sym.type == STT_FUNC and sym.value != 0: 691 | self.add_export(sym.value, sym.name) 692 | #for addr,name in self.exports_by_addr.iteritems(): 693 | #print "EXPORT: 0x%x - %s" % (addr, name) 694 | 695 | def load_phdrs(self): 696 | self.dyn_hdr = None 697 | for i in range(self.e_phnum): 698 | phdr = ElfProgramHeader(self, self.e_phoff + self.e_phentsize * i) 699 | self.phdrs.append(phdr) 700 | if phdr.p_type == PT_DYNAMIC: 701 | self.dyn_hdr = phdr 702 | if 
phdr.p_type == PT_LOAD: 703 | va = phdr.p_vaddr 704 | if self.image_base is None or va < self.image_base: 705 | self.image_base = va 706 | mr = self.raw[phdr.p_offset:phdr.p_offset+phdr.p_filesz].ljust(phdr.p_memsz, '\x00') 707 | self.add_mapped(va, va + phdr.p_memsz, phdr.perms, mr) 708 | 709 | def load_shdrs(self): 710 | self.sections_by_name.clear() 711 | 712 | for i in range(self.e_shnum): 713 | shdr = ElfSectionHeader(self, self.e_shoff + self.e_shentsize * i) 714 | self.shdrs.append(shdr) 715 | if shdr.sh_type == SHT_STRTAB and i != self.e_shstrndx and self.symbol_strtab != 0: 716 | self.symbol_strtab = shdr.sh_addr 717 | 718 | # now that we have sections, go back and pull section names 719 | # out of the sh names table 720 | strtab = self.shdrs[self.e_shstrndx] 721 | for s in self.shdrs: 722 | # defer setting the name until we are sure we know about the shstrtab 723 | s.name = strtab.get_string(s.sh_name) 724 | 725 | va = s.sh_addr 726 | # match perms against phdrs? sh_flags ?? 727 | 728 | if (s.sh_flags & SHF_ALLOC) == 0: 729 | print 'Skipping section %s' % s.name 730 | continue 731 | self.add_section(s.name, va, va + s.sh_size, s.perms, s.sh_size) 732 | 733 | def load(self): 734 | if self.is_valid(): 735 | del self.mapped[:] 736 | del self.sections[:] 737 | self.phdrs = [] 738 | self.shdrs = [] 739 | 740 | self.osabi = ord(self.raw[EI_OSABI]) 741 | self.image_base = None # set in load_phdrs 742 | self.start = self.e_entry 743 | 744 | self.load_phdrs() 745 | self.load_shdrs() 746 | 747 | # deal with dynamic section imports 748 | # deal with .got .plt 749 | # deal with exports 750 | # deal with symbol table 751 | # deal with dwarf and other debug info 752 | 753 | self.parse_symbols() 754 | self.parse_imports() 755 | self.parse_exports() 756 | return True 757 | return False 758 | 759 | def find_main(self, insts, to, frm): 760 | if self.arch != capstone.CS_ARCH_X86: 761 | return None 762 | addr = self.start 763 | if self.osabi != ELFOSABI_LINUX: 764 | #find main by scanning Linux start stup 765 | while addr in frm: 766 | inst = insts[addr] 767 | if inst.group(capstone.CS_GRP_JUMP): 768 | break 769 | xrefs = frm[addr] 770 | if inst.group(capstone.CS_GRP_CALL): 771 | for x in xrefs: 772 | if x[1] == XR_CALL: 773 | #call to libc_start_main 774 | last = to[addr][0][0] 775 | inst = insts[last] 776 | main = inst.operands[-1].value.imm 777 | return main 778 | break 779 | elif len(xrefs) == 1: 780 | if xrefs[0][1] == XR_FLOW: 781 | addr = xrefs[0][0] 782 | else: 783 | break 784 | else: 785 | break 786 | return None 787 | 788 | class Elf32(ElfBase): 789 | 790 | def __init__(self, elf_file): 791 | ElfBase.__init__(self, elf_file) 792 | 793 | # override to perform file type validation checks such 794 | # as checking magic numbers, etc 795 | def is_valid(self): 796 | # try: 797 | if ord(self.raw[EI_CLASS]) != ELFCLASS32: 798 | return False 799 | self.set_pointer_size(4) 800 | if not ElfBase.is_valid(self): 801 | return False 802 | # now do Elf32 specific checks 803 | # following e_ident we have: self.endian + "HHIIIIIHHHHHH" 804 | # except Exception as e: 805 | #any exception means it's not a PE32 806 | # raise e 807 | return True 808 | 809 | class Elf64(ElfBase): 810 | 811 | def __init__(self, elf_file): 812 | ElfBase.__init__(self, elf_file) 813 | 814 | # override to perform file type validation checks such 815 | # as checking magic numbers, etc 816 | def is_valid(self): 817 | try: 818 | if ord(self.raw[EI_CLASS]) != ELFCLASS64: 819 | return False 820 | self.set_pointer_size(8) 821 | if not 
ElfBase.is_valid(self): 822 | return False 823 | #now do Elf64 specific checks 824 | # following e_ident we have: self.endian + "HHIQQQIHHHHHH 825 | except Exception as e: 826 | #any exception means it's not a PE32 827 | raise e 828 | # return False 829 | return True 830 | -------------------------------------------------------------------------------- /fREedom.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | Stand-alone binnavi compatible disassembler based on capstone 5 | ''' 6 | 7 | __author__ = "Chris Eagle" 8 | __copyright__ = "Copyright 2015, Chris Eagle" 9 | __credits__ = ["Chris Eagle"] 10 | __license__ = "GPL" 11 | __version__ = "2.0" 12 | __maintainer__ = "Chris Eagle" 13 | __email__ = "cseagle@gmail.com" 14 | __status__ = "Use at your own risk" 15 | 16 | import sys 17 | import argparse 18 | import capstone 19 | import x86_disasm 20 | import pe_loader 21 | import elf_loader 22 | import binnavi_db 23 | 24 | class UnsupportedArch(Exception): 25 | def __init__(self, msg): 26 | Exception.__init__(self, msg) 27 | 28 | class UnsupportedFormat(Exception): 29 | def __init__(self, msg): 30 | Exception.__init__(self,msg) 31 | 32 | def main(args): 33 | 34 | # cycle through available loaders, if one matches 35 | # pass it into the disassembler 36 | ldr = pe_loader.Pe32(args.binary) 37 | if not ldr.load(): 38 | del ldr 39 | ldr = pe_loader.Pe64(args.binary) 40 | if not ldr.load(): 41 | del ldr 42 | ldr = elf_loader.Elf32(args.binary) 43 | if not ldr.load(): 44 | del ldr 45 | ldr = elf_loader.Elf64(args.binary) 46 | if not ldr.load(): 47 | del ldr 48 | raise UnsupportedFormat("Unsupported file format for %s" % args.binary) 49 | 50 | if ldr.arch == capstone.CS_ARCH_X86: 51 | dis = x86_disasm.x86_disasm(ldr) 52 | else: 53 | raise UnsupportedArch("Unsupported processor architecture for %s" % args.binary) 54 | 55 | dis.generate_data() 56 | 57 | print "found %d instructions" % len(dis.visited) 58 | print "found %d basic blocks" % len(dis.basic_blocks) 59 | print "found %d functions" % len(dis.call_targets) 60 | 61 | ''' 62 | print "Functions identified at:" 63 | dis.call_targets.sort() 64 | for c in dis.call_targets: 65 | print " 0x%x" % c 66 | ''' 67 | 68 | #dis.print_disassembly() 69 | 70 | db = binnavi_db.binnavi_db(args.database, args.user, args.passwd, args.dbhost) 71 | db.export(dis) 72 | 73 | # add argument parsing for database commection parameters 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser(description='Export to binnavi.') 76 | parser.add_argument('--database', help='name of database to export to') 77 | parser.add_argument('--user', help='database user name') 78 | parser.add_argument('--pass', dest='passwd', help='database user password') 79 | parser.add_argument('--dbhost', help='database host name') 80 | parser.add_argument('--binary', type=str, required=False, help='binary file to export') 81 | parser.add_argument('--delete', action='store_true', required=False, 82 | help='flag to initiate module deletion') 83 | parser.add_argument('--modules', type=int, nargs='+', required=False, 84 | help='module numbers to delete') 85 | 86 | args = parser.parse_args() 87 | 88 | if args.delete: 89 | db = binnavi_db.binnavi_db(args.database, args.user, args.passwd, args.dbhost) 90 | for m in args.modules: 91 | db.delete_module(m) 92 | else: 93 | main(args) 94 | -------------------------------------------------------------------------------- /loader.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | Base class for loaders (file parsers) for a stand-alone binnavi compatible disassembler 5 | ''' 6 | 7 | __author__ = "Chris Eagle" 8 | __copyright__ = "Copyright 2015, Chris Eagle" 9 | __credits__ = ["Chris Eagle"] 10 | __license__ = "GPL" 11 | __version__ = "2.0" 12 | __maintainer__ = "Chris Eagle" 13 | __email__ = "cseagle@gmail.com" 14 | __status__ = "Use at your own risk" 15 | 16 | import sys 17 | import struct 18 | import hashlib 19 | import os 20 | 21 | LITTLE_ENDIAN = '<' 22 | BIG_ENDIAN = '>' 23 | 24 | PROT_READ = 1 25 | PROT_WRITE = 2 26 | PROT_EXEC = 4 27 | PROT_ALL = PROT_READ | PROT_WRITE | PROT_EXEC 28 | 29 | class MappedRegion(object): 30 | def __init__(self, start, end, perms, raw): 31 | self.start = start 32 | self.end = end 33 | self.perms = perms 34 | self.raw = raw 35 | 36 | def contains(self, addr, blen = 1): 37 | _end = addr + blen 38 | return addr >= self.start and addr < self.end and _end <= self.end 39 | 40 | def get_bytes(self, addr, blen = 1): 41 | if self.contains(addr, blen): 42 | offset = addr - self.start 43 | return self.raw[offset:offset+blen] 44 | return None 45 | 46 | class Section(object): 47 | def __init__(self, name, start, end, perms, filesz): 48 | self.name = name 49 | self.start = start 50 | self.end = end 51 | self.perms = perms 52 | self.filesz = filesz 53 | 54 | print "Created section %s, 0x%x:0x%x, raw length 0x%x, perms %d" % (self.name, self.start, self.end, filesz, self.perms) 55 | 56 | def contains(self, addr): 57 | return addr >= self.start and addr < self.end 58 | 59 | def get_raw_bytes(self, ldr): 60 | raw = ldr.get_bytes(self.start, self.filesz) 61 | if raw is None: 62 | sys.stderr.write("Failed to get raw content for section %s at address 0x%x for size 0x%s\n" % (self.name, self.start, self.filesz)) 63 | return ldr.get_bytes(self.start, self.filesz) 64 | 65 | class Loader(object): 66 | 67 | def __init__(self, fname): 68 | self.exe = fname 69 | f = open(fname, 'rb') 70 | self.raw = f.read() 71 | self.md5 = hashlib.md5(self.raw).hexdigest() 72 | self.sha1 = hashlib.sha1(self.raw).hexdigest() 73 | f.close() 74 | 75 | self.name = os.path.basename(fname) 76 | 77 | self.image_base = 0 78 | self.start = 0 79 | 80 | self.sections = [] # Section 81 | self.sections_by_name = {} # str:Section 82 | self.imports_by_name = {} # str:int 83 | self.imports_by_addr = {} # int:str 84 | self.exports_by_addr = {} # int:str 85 | 86 | self.symbols_by_addr = {} # int:str 87 | self.symbols_by_name = {} # str:int 88 | self.mapped = [] # MappedRegion 89 | 90 | self.non_returning_funcs = [] 91 | 92 | self.add_mapped(0, len(self.raw), PROT_ALL, self.raw) 93 | 94 | self.set_endianness(LITTLE_ENDIAN) 95 | self.sizeof_ptr = 4 96 | self.arch = None 97 | self.mode = None 98 | self.cached_section = None 99 | self.cached_region = None 100 | 101 | def __del__(self): 102 | del self.mapped[:] 103 | del self.sections[:] 104 | del self.sections 105 | del self.raw 106 | del self.name 107 | self.sections_by_name.clear() 108 | self.imports_by_name.clear() 109 | self.imports_by_addr.clear() 110 | self.exports_by_addr.clear() 111 | self.symbols_by_name.clear() 112 | self.symbols_by_addr.clear() 113 | 114 | def set_endianness(self, which_endian): 115 | self.endian = which_endian 116 | 117 | def set_pointer_size(self, sizeof_ptr): 118 | self.sizeof_ptr = sizeof_ptr 119 | 120 | # override to create a mapped process binary image where 121 | # raw does not match the 
memory layout of the running 122 | # process. 123 | def load(self): 124 | # probably want to start with: 125 | # del sections[:] 126 | # sections_by_name.clear() 127 | pass 128 | 129 | def get_mapped(self, addr): 130 | if self.cached_region is not None and self.cached_region.contains(addr): 131 | return self.cached_region 132 | for m in self.mapped: 133 | if m.contains(addr): 134 | self.cached_region = m 135 | return m 136 | return None 137 | 138 | #regions should not overlap! 139 | def add_mapped(self, start, end, perms, raw): 140 | self.mapped.append(MappedRegion(start, end, perms, raw)) 141 | 142 | def del_mapped(self, start): 143 | rem = None 144 | for m in self.mapped: 145 | if m.start == addr: 146 | if self.cached_region == m: 147 | self.cached_region = None 148 | rem = m 149 | break 150 | if rem is not None: 151 | self.mapped.remove(rem) 152 | del rem 153 | 154 | # override to perform file type validation checks such 155 | # as checking magic numbers, etc 156 | def is_valid(self): 157 | return True 158 | 159 | def get_bytes(self, addr, len): 160 | m = self.get_mapped(addr) 161 | if m is not None: 162 | return m.get_bytes(addr, len) 163 | return None 164 | 165 | def get_byte(self, addr): 166 | return self.get_bytes(addr, 1) 167 | 168 | def get_word(self, addr): 169 | return struct.unpack(self.endian + "H", self.get_bytes(addr, 2))[0] 170 | 171 | def get_dword(self, addr): 172 | try: 173 | return struct.unpack(self.endian + "I", self.get_bytes(addr, 4))[0] 174 | except Exception, e: 175 | print "Unable to read dword from address 0x%x" % addr 176 | raise e 177 | 178 | def get_qword(self, addr): 179 | return struct.unpack(self.endian + "Q", self.get_bytes(addr, 8))[0] 180 | 181 | def get_pointer(self, addr): 182 | if self.sizeof_ptr == 4: 183 | return self.get_dword(addr) 184 | elif self.sizeof_ptr == 8: 185 | return self.get_qword(addr) 186 | 187 | def get_string(self, addr): 188 | res = '' 189 | while True: 190 | ch = self.get_byte(addr) 191 | if ch == '\x00': 192 | break 193 | addr += 1 194 | res += ch 195 | return res 196 | 197 | # get containing section for given address 198 | def get_section(self, addr): 199 | if self.cached_section is not None and self.cached_section.contains(addr): 200 | return self.cached_section 201 | for s in self.sections: 202 | if s.contains(addr): 203 | self.cached_section = s 204 | return s 205 | return None 206 | 207 | def add_section(self, name, start, end, perms, filesz): 208 | sect = Section(name, start, end, perms, filesz) 209 | self.sections.append(sect) 210 | self.sections_by_name[name] = sect 211 | 212 | def add_import(self, addr, name): 213 | self.imports_by_addr[addr] = name 214 | self.imports_by_name[name] = addr 215 | 216 | def add_symbol(self, addr, name): 217 | self.symbols_by_addr[addr] = name 218 | self.symbols_by_name[name] = addr 219 | 220 | def add_export(self, addr, name): 221 | self.exports_by_addr[addr] = name 222 | 223 | #override in subclasses if you have an algorithm 224 | #for finding main given the address of start 225 | #and all currently known instructions 226 | def find_main(self, insts, to, frm): 227 | return None 228 | 229 | -------------------------------------------------------------------------------- /pe_loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | Crude PE32 / PE32+ loader, conforming to the Loader interface, for a stand-alone binnavi compatible disassembler 5 | ''' 6 | 7 | __author__ = "Chris Eagle" 8 | __copyright__ = "Copyright 
2015, Chris Eagle" 9 | __credits__ = ["Chris Eagle"] 10 | __license__ = "GPL" 11 | __version__ = "2.0" 12 | __maintainer__ = "Chris Eagle" 13 | __email__ = "cseagle@gmail.com" 14 | __status__ = "Use at your own risk" 15 | 16 | import sys 17 | import struct 18 | import hashlib 19 | import binascii 20 | import capstone 21 | from loader import * 22 | 23 | IMAGE_FILE_MACHINE_I386 = 0x14c 24 | IMAGE_FILE_MACHINE_ARM = 0x1c0 25 | IMAGE_FILE_MACHINE_THUMB = 0x1c2 26 | IMAGE_FILE_MACHINE_ARMV7 = 0x1c4 27 | IMAGE_FILE_MACHINE_AMD64 = 0x8664 28 | 29 | OK_PE_MACHINES = [IMAGE_FILE_MACHINE_I386, IMAGE_FILE_MACHINE_ARM, 30 | IMAGE_FILE_MACHINE_THUMB, IMAGE_FILE_MACHINE_ARMV7, 31 | IMAGE_FILE_MACHINE_AMD64] 32 | 33 | IMAGE_NT_OPTIONAL_HDR32_MAGIC = 0x10b 34 | IMAGE_NT_OPTIONAL_HDR64_MAGIC = 0x20b 35 | 36 | IMAGE_DOS_SIGNATURE = 0x5A4D 37 | IMAGE_NT_SIGNATURE = 0x00004550 38 | 39 | IMAGE_SCN_MEM_EXECUTE = 0x20000000 40 | IMAGE_SCN_MEM_READ = 0x40000000 41 | IMAGE_SCN_MEM_WRITE = 0x80000000 42 | 43 | DATA_DIRECTORY_EXPORT = 0 44 | DATA_DIRECTORY_IMPORT = 1 45 | 46 | class InvalidHeader(Exception): 47 | def __init__(self, msg): 48 | Exception.__init__(self, msg) 49 | 50 | class FileHeader(object): 51 | 52 | def __init__(self, raw, offset): 53 | self.raw = raw[offset:offset+20] 54 | fields = struct.unpack("= self.rva and func_rva < self.end_rva: 144 | #this is a forwarded entry 145 | fcount += 1 146 | continue 147 | else: 148 | self.pe.add_export(func_rva + self.pe.image_base, name) 149 | 150 | for f in range(self.NumberOfNames, self.NumberOfFunctions): 151 | name = "%s_%d" % (self.dll.replace('.', '_'), f) 152 | func_idx = self.pe.get_word(aono + f * 2) 153 | func_rva = self.pe.get_dword(aof + func_idx * 4) 154 | self.pe.add_export(func_rva + self.pe.image_base, name) 155 | 156 | class OptionalHeaderBase(object): 157 | 158 | def __init__(self, raw, offset): 159 | try: 160 | self.common = raw[offset:offset+24] 161 | fields = struct.unpack("= 2 and not self.is_returning_call(from_list): 128 | # add all destinations to basic_blocks 129 | for xr in from_list: 130 | if xr[0] != 0: 131 | #treat address zero differently, don't add xrefs to it 132 | self.add_basic_block_start(xr[0]) 133 | 134 | if to not in self.xrefs_to: 135 | self.xrefs_to[to] = [] 136 | self.xrefs_to[to].append((frm, xr_type)) 137 | to_list = self.xrefs_to[to] 138 | if to not in self.names: 139 | if xr_type == XR_CALL: 140 | self.names[to] = 'sub_%x' % to 141 | elif xr_type >= XR_JUMP: # JUMP or JCC 142 | self.names[to] = 'loc_%x' % to 143 | self.add_loc(to) 144 | elif xr_type == XR_CALL and self.names[to] == ('loc_%x' % to): 145 | #update loc_ label to sub_ label now that a call was found 146 | self.names[to] = 'sub_%x' % to 147 | if to not in self.basic_blocks: 148 | if xr_type == XR_CALL or len(to_list) > 1: 149 | if to != 0: 150 | #treat address zero differently, don't add xrefs to it 151 | self.add_basic_block_start(to) 152 | 153 | #add an address we need to explore 154 | def add_loc(self, addr): 155 | if addr in self.visited: 156 | return 157 | self.locs.append(addr) 158 | 159 | def is_conditional(self, i): 160 | op = i.bytes[0] 161 | if (op >= 0x70 and op <= 0x7f) or (op >= 0xe0 and op <= 0xe3): 162 | return True 163 | elif op == 0x0f: 164 | op2 = i.bytes[1] 165 | if op2 >= 0x80 and op2 <= 0x8f: 166 | return True 167 | return False 168 | 169 | def process_jump(self, i): 170 | opcode = i.bytes[0] 171 | offset = signed_byte(i.bytes[1]) 172 | short_tgt = i.address + i.size + offset 173 | if opcode == 0xeb: # jmp disp8 174 | 
self.add_xref(i.address, short_tgt, XR_JUMP) 175 | self.jmp_targets.add(short_tgt) 176 | return True 177 | if opcode == 0xe9: # jmp disp32 178 | offset = signed_dword(i.bytes[1:5]) 179 | tgt = i.address + i.size + offset 180 | self.add_xref(i.address, tgt, XR_JUMP) 181 | self.jmp_targets.add(tgt) 182 | return True 183 | if (opcode >= 0x70 and opcode <= 0x7f) or opcode == 0xe3: # jcc jecx disp8 184 | self.add_xref(i.address, short_tgt, XR_JCC) 185 | self.jmp_targets.add(short_tgt) 186 | return False 187 | elif opcode == 0x0f: # jcc disp32 188 | op2 = i.bytes[1] 189 | if op2 >= 0x80 and op2 <= 0x8f: 190 | offset = signed_dword(i.bytes[2:6]) 191 | tgt = i.address + i.size + offset 192 | self.add_xref(i.address, tgt, XR_JCC) 193 | self.jmp_targets.add(tgt) 194 | return False 195 | # else: 196 | # sys.stderr.write("Classified jump (0x0f), not categorized at address 0x%x: %s\n" % (i.address, tostr(i))) 197 | elif opcode == 0xff: 198 | modrm = i.modrm # i.bytes[1] 199 | if modrm == 0x25: #near jump [disp] 200 | slot = unsigned_dword(i.bytes[2:6]) 201 | if i.address in self.loader.imports_by_addr: #this is a thunk DO BETTER HERE 202 | self.thunks.add(i.address) 203 | self.names[i.address] = self.loader.imports_by_addr[i.address] 204 | # else: 205 | # sys.stderr.write("Classified jump (0xff), not categorized at address 0x%x: %s\n" % (i.address, tostr(i))) 206 | return True 207 | # else: 208 | # sys.stderr.write("Classified jump, not categorized at address 0x%x: %s\n" % (i.address, tostr(i))) 209 | return True 210 | 211 | def process_call(self, i): 212 | opcode = i.bytes[0] 213 | if opcode == 0xe8: # call disp32 214 | offset = signed_dword(i.bytes[1:5]) 215 | tgt = i.address + i.size + offset 216 | self.call_targets.add(tgt) 217 | self.add_xref(i.address, tgt, XR_CALL) 218 | #add a minimal stack frame for this function, it will have at least a 219 | #return address 220 | #self.add_type("__SF%x" % tgt, self.loader.sizeof_ptr, None, False, STRUCT) 221 | return tgt in self.names and self.names[tgt] in self.loader.non_returning_funcs 222 | elif opcode == 0xff: 223 | modrm = i.modrm # i.bytes[1] 224 | if modrm == 0x15: #near call [disp] 225 | slot = unsigned_dword(i.bytes[2:6]) 226 | #sometimes this will be an imported function other times not 227 | #only xref that is really taking place here is a data reference 228 | #self.add_xref(i.address, slot, XR_CALL) 229 | # else: 230 | # sys.stderr.write("Classified call (0xff), not categorized at address 0x%x: %s\n" % (i.address, tostr(i))) 231 | # else: 232 | # sys.stderr.write("Classified call, not categorized at address 0x%x: %s\n" % (i.address, tostr(i))) 233 | #assume all calls return 234 | return False 235 | 236 | def add_address_ref(self, inst, opnum, node_id, aref_addr, false_id): 237 | is_jump = inst.group(capstone.CS_GRP_JUMP) 238 | is_call = inst.group(capstone.CS_GRP_CALL) 239 | if inst.operands[opnum].type == capstone.x86_const.X86_OP_IMM: 240 | if is_jump: 241 | if self.is_conditional(inst): 242 | self.arefs.append(AddressRef(inst.address, opnum, node_id, aref_addr, CONDITION_TRUE)) 243 | self.arefs.append(AddressRef(inst.address, opnum, false_id, inst.address + inst.size, CONDITION_FALSE)) 244 | else: 245 | self.arefs.append(AddressRef(inst.address, opnum, node_id, aref_addr, UNCONDITIONAL)) 246 | elif is_call: 247 | self.arefs.append(AddressRef(inst.address, opnum, node_id, aref_addr, CALL_DIRECT)) 248 | else: 249 | #raw data, aref_addr is an offset 250 | pass 251 | elif inst.operands[opnum].type == capstone.x86_const.X86_OP_MEM: 252 | if 
is_jump: 253 | dest = self.get_pointer(aref_addr) 254 | #try to determine whether this is a switch table 255 | if dest is not None and self.is_possible_code(dest): 256 | self.arefs.append(AddressRef(inst.address, opnum, node_id, dest, UNCONDITIONAL)) 257 | elif is_call: 258 | dest = self.get_pointer(aref_addr) 259 | #try to determine whether this is a switch table 260 | if dest is not None and self.is_possible_code(dest): 261 | self.arefs.append(AddressRef(inst.address, opnum, node_id, dest, CALL_INDIRECT)) 262 | else: 263 | #raw data, aref_addr is a pointer 264 | #could check content at aref_addr to see if its a string 265 | self.arefs.append(AddressRef(inst.address, opnum, node_id, aref_addr, DATA)) 266 | 267 | #THIS IS HIGHLY ARCHITECTURE DEPENDENT 268 | def process_operands(self, inst): 269 | opnum = 0 270 | #annotate the CsInsn with the operands we build here 271 | op_exprs = [] 272 | for op in inst.operands: 273 | add_aref = False 274 | aref_addr = 0 275 | aref_op = 0 276 | aref_type = -1 277 | op_size = 'b%d' % op.size 278 | tree = [] 279 | tree.append(OpNode(SIZE_PREFIX, op_size)) 280 | if op.type == capstone.x86_const.X86_OP_REG: 281 | reg = inst.reg_name(op.reg) 282 | #operand expr is: op_size reg 283 | tree.append(OpNode(REGISTER, reg)) 284 | elif op.type == capstone.x86_const.X86_OP_IMM: 285 | imm = op.imm 286 | #operand expr is: op_size imm 287 | tree.append(OpNode(IMMEDIATE_INT, imm)) 288 | s = self.loader.get_section(imm) 289 | if s is not None: 290 | #immediate refers to a memory address 291 | #let's add an AddressRef 292 | add_aref = True 293 | aref_op = 1 294 | aref_addr = imm 295 | elif op.type == capstone.x86_const.X86_OP_MEM: 296 | if op.mem.segment == capstone.x86_const.X86_REG_INVALID: 297 | op_seg = None 298 | else: 299 | op_seg = '%s:' % inst.reg_name(op.mem.segment) 300 | tree.append(OpNode(OPERATOR + 10, op_seg)) # 10 = unary operator 301 | 302 | op_disp = op.mem.disp 303 | tree.append(OpNode(DEREFERENCE, '[')) 304 | s = self.loader.get_section(op_disp) 305 | if s is not None: 306 | #immediate refers to a memory address 307 | #let's add an AddressRef 308 | add_aref = True 309 | aref_addr = op_disp 310 | 311 | if op.mem.base != capstone.x86_const.X86_REG_INVALID: #has a base reg 312 | op_base = inst.reg_name(op.mem.base) 313 | if op.mem.index != capstone.x86_const.X86_REG_INVALID: #has an index reg 314 | op_scale = op.mem.scale 315 | op_index = inst.reg_name(op.mem.index) 316 | tree.append(OpNode(OPERATOR + 20, '+')) # 20 = unary operator 317 | tree.append(OpNode(REGISTER, op_base)) 318 | if op_scale == 1: 319 | if op_disp == 0: 320 | #operand expr is: op_size op_seg [ + op_base op_index 321 | tree.append(OpNode(REGISTER, op_index)) 322 | else: 323 | #operand expr is: op_size op_seg [ + op_base + op_index op_disp 324 | tree.append(OpNode(OPERATOR + 20, '+')) # 20 = unary operator 325 | tree.append(OpNode(REGISTER, op_index)) 326 | aref_op = len(tree) 327 | tree.append(OpNode(IMMEDIATE_INT, op_disp)) 328 | else: 329 | if op_disp == 0: 330 | #operand expr is: op_size op_seg [ + op_base * op_index op_scale 331 | tree.append(OpNode(OPERATOR + 20, '*')) # 20 = unary operator 332 | tree.append(OpNode(REGISTER, op_index)) 333 | tree.append(OpNode(IMMEDIATE_INT, op_scale)) 334 | else: 335 | #operand expr is: op_size op_seg [ + op_base + * op_index op_scale op_disp 336 | tree.append(OpNode(OPERATOR + 20, '+')) # 20 = unary operator 337 | tree.append(OpNode(OPERATOR + 20, '*')) # 20 = unary operator 338 | tree.append(OpNode(REGISTER, op_index)) 339 | 
tree.append(OpNode(IMMEDIATE_INT, op_scale)) 340 | aref_op = len(tree) 341 | tree.append(OpNode(IMMEDIATE_INT, op_disp)) 342 | else: 343 | if op_disp == 0: 344 | #operand expr is: op_size op_seg [ op_base 345 | tree.append(OpNode(REGISTER, op_base)) 346 | else: 347 | #operand expr is: op_size op_seg [ + op_base op_disp 348 | tree.append(OpNode(OPERATOR + 20, '+')) # 20 = unary operator 349 | tree.append(OpNode(REGISTER, op_base)) 350 | aref_op = len(tree) 351 | tree.append(OpNode(IMMEDIATE_INT, op_disp)) 352 | elif op.mem.index != capstone.x86_const.X86_REG_INVALID: #has an index reg 353 | op_scale = op.mem.scale 354 | op_index = inst.reg_name(op.mem.index) 355 | if op_scale == 1: 356 | if op_disp == 0: 357 | #operand expr is: op_size op_seg [ op_index 358 | tree.append(OpNode(REGISTER, op_index)) 359 | else: 360 | #operand expr is: op_size op_seg [ + op_index op_disp 361 | tree.append(OpNode(OPERATOR + 20, '+')) # 20 = unary operator 362 | tree.append(OpNode(REGISTER, op_index)) 363 | aref_op = len(tree) 364 | tree.append(OpNode(IMMEDIATE_INT, op_disp)) 365 | else: 366 | if op_disp == 0: 367 | #operand expr is: op_size op_seg [ * op_index op_scale 368 | tree.append(OpNode(OPERATOR + 20, '*')) # 20 = unary operator 369 | tree.append(OpNode(REGISTER, op_index)) 370 | tree.append(OpNode(IMMEDIATE_INT, op_scale)) 371 | else: 372 | #operand expr is: op_size op_seg [ + * op_index op_scale op_disp 373 | tree.append(OpNode(OPERATOR + 20, '+')) # 20 = unary operator 374 | tree.append(OpNode(OPERATOR + 20, '*')) # 20 = unary operator 375 | tree.append(OpNode(REGISTER, op_index)) 376 | tree.append(OpNode(IMMEDIATE_INT, op_scale)) 377 | aref_op = len(tree) 378 | tree.append(OpNode(IMMEDIATE_INT, op_disp)) 379 | else: #must be [disp] only, mem with no registers 380 | #operand expr is: op_size op_seg [ op_disp 381 | aref_op = len(tree) 382 | tree.append(OpNode(IMMEDIATE_INT, op_disp)) 383 | elif op.type == capstone.x86_const.X86_OP_FP: 384 | sys.stderr.write("found an FP operand at 0x%x, op %d\n" % (inst.address, opnum)) 385 | else: 386 | sys.stderr.write("Unknown operand at 0x%x, op %d\n" % (inst.address, opnum)) 387 | # store operand expression tree for inst.addr, opnum 388 | if len(tree) > 0: 389 | expr = self.add_expr_tree(tree) 390 | if expr != 0: 391 | op_exprs.append(expr) 392 | if add_aref: 393 | self.add_address_ref(inst, opnum, tree[aref_op].node_id, aref_addr, tree[0].node_id) 394 | opnum += 1 395 | self.operands[inst.address] = op_exprs 396 | 397 | def scan_gap_data(self): 398 | ptr_sz = self.loader.sizeof_ptr 399 | 400 | keylist = [a for a in self.visited] 401 | keylist.sort() 402 | last = None 403 | count = 0 404 | for a in keylist: 405 | i = self.insts[a] 406 | if last is not None and (last.address + last.size) != a: 407 | gap_start = last.address + last.size 408 | #round up to ptr aligned address 409 | gap_start = (gap_start + ptr_sz - 1) & ~(ptr_sz - 1) 410 | if gap_start >= a: 411 | continue 412 | for addr in range(gap_start, a, ptr_sz): 413 | val = self.get_pointer(addr) 414 | if val is None: 415 | break 416 | if self.is_possible_code(val) and val not in self.visited: 417 | self.locs.append(val) 418 | #print "Adding text ptr 0x%x" % val 419 | count += 1 420 | last = i 421 | #print "Gap data analysis added %d new locations" % count 422 | 423 | 424 | def main(exe_file): 425 | ldr = pe_loader.Pe32(exe_file) 426 | if not ldr.load(): 427 | del ldr 428 | ldr = pe_loader.Pe64(exe_file) 429 | if not ldr.load(): 430 | del ldr 431 | ldr = elf_loader.Elf32(exe_file) 432 | if not ldr.load(): 433 
| del ldr 434 | ldr = elf_loader.Elf64(exe_file) 435 | if not ldr.load(): 436 | del ldr 437 | print "Failed to recognize input file type" 438 | return 439 | 440 | dis = x86_disasm(ldr) 441 | print "starting with %d initial locations" % len(dis.locs) 442 | dis.generate_data() 443 | 444 | print "found %d instructions" % len(dis.visited) 445 | 446 | ''' 447 | print "Functions identified at:" 448 | dis.call_targets.sort() 449 | for c in dis.call_targets: 450 | print " 0x%x" % c 451 | ''' 452 | 453 | dis.print_disassembly() 454 | 455 | if __name__ == "__main__": 456 | main(sys.argv[1]) 457 | --------------------------------------------------------------------------------
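
A minimal standalone sketch (not part of the repository) of the relative-jump target arithmetic that process_jump and process_call in x86_disasm.py rely on: the target is the instruction address plus the instruction size plus the sign-extended displacement taken from the instruction bytes. The signed_byte helper below is a local stand-in for the helper the disassembler is assumed to provide, and the instruction bytes and load address are hypothetical.

import struct
import capstone

def signed_byte(b):
    # sign-extend a one-byte displacement (stand-in for the repository's helper)
    return struct.unpack("b", struct.pack("B", b))[0]

md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
code = b"\x75\x05"      # jne with an 8-bit displacement (made-up bytes)
base = 0x401000         # made-up load address

for i in md.disasm(code, base):
    disp = signed_byte(i.bytes[1])
    target = i.address + i.size + disp   # 0x401000 + 2 + 5 = 0x401007
    print("0x%x: %s %s -> target 0x%x" % (i.address, i.mnemonic, i.op_str, target))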
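
Also for illustration only: scan_gap_data rounds each gap start up to the next pointer-aligned address with the expression (gap_start + ptr_sz - 1) & ~(ptr_sz - 1) before scanning the gap for code pointers. A tiny self-contained demonstration of that round-up, with made-up addresses:

ptr_sz = 4   # loader.sizeof_ptr for a 32-bit target
for addr in (0x1000, 0x1001, 0x1003, 0x1004):
    aligned = (addr + ptr_sz - 1) & ~(ptr_sz - 1)
    print("0x%x rounds up to 0x%x" % (addr, aligned))
# 0x1000 -> 0x1000, 0x1001 -> 0x1004, 0x1003 -> 0x1004, 0x1004 -> 0x1004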