├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.md
├── asm_embedding
│   ├── FunctionNormalizer.py
│   ├── InstructionsConverter.py
│   └── __init__.py
├── binary_similarity
│   ├── PairFactory.py
│   ├── __init__.py
│   ├── parameters.py
│   ├── s2v_network.py
│   ├── s2v_network_arith_mean.py
│   ├── s2v_network_attention_mean.py
│   ├── s2v_network_rnn.py
│   ├── s2v_trainer.py
│   ├── train.py
│   ├── train.sh
│   └── utils.py
├── compiler_provenance
│   ├── FunctionFactory.py
│   ├── __init__.py
│   ├── parameters.py
│   ├── s2v_classification_network_annotations.py
│   ├── s2v_classification_network_arith_mean.py
│   ├── s2v_classification_network_attention_mean.py
│   ├── s2v_classification_network_rnn.py
│   ├── s2v_trainer.py
│   ├── train.py
│   ├── train.sh
│   └── utils.py
├── dataset_creation
│   ├── BlockFeaturesExtractor.py
│   ├── DataSplitter.py
│   ├── DatabaseFactory.py
│   ├── ExperimentUtil.py
│   ├── FunctionAnalyzerRadare.py
│   └── __init__.py
├── downloader.py
├── godown.pl
└── requirements.txt

/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | # Hello! This is where you manage which Jekyll version is used to run. 4 | # When you want to use a different version, change it below, save the 5 | # file and run `bundle install`. Run Jekyll with `bundle exec`, like so: 6 | # 7 | # bundle exec jekyll serve 8 | # 9 | # This will help ensure the proper Jekyll version is running. 10 | # Happy Jekylling! 11 | gem "jekyll", "~> 3.7.4" 12 | 13 | # This is the default theme for new Jekyll sites. You may change this to anything you like. 14 | gem "minima", "~> 2.0" 15 | 16 | # If you want to use GitHub Pages, remove the "gem "jekyll"" above and 17 | # uncomment the line below. To upgrade, run `bundle update github-pages`. 18 | # gem "github-pages", group: :jekyll_plugins 19 | #gem "github-pages", group: :jekyll_plugins 20 | 21 | # If you have any plugins, put them here! 22 | group :jekyll_plugins do 23 | gem "jekyll-feed", "~> 0.6" 24 | end 25 | 26 | # Windows does not include zoneinfo files, so bundle the tzinfo-data gem 27 | gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] 28 | 29 | # Performance-booster for watching directories on Windows 30 | gem "wdm", "~> 0.1.0" if Gem.win_platform? 
31 | 32 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (4.2.10) 5 | i18n (~> 0.7) 6 | minitest (~> 5.1) 7 | thread_safe (~> 0.3, >= 0.3.4) 8 | tzinfo (~> 1.1) 9 | addressable (2.5.2) 10 | public_suffix (>= 2.0.2, < 4.0) 11 | coffee-script (2.4.1) 12 | coffee-script-source 13 | execjs 14 | coffee-script-source (1.11.1) 15 | colorator (1.1.0) 16 | commonmarker (0.17.13) 17 | ruby-enum (~> 0.5) 18 | concurrent-ruby (1.1.3) 19 | dnsruby (1.61.2) 20 | addressable (~> 2.5) 21 | em-websocket (0.5.1) 22 | eventmachine (>= 0.12.9) 23 | http_parser.rb (~> 0.6.0) 24 | ethon (0.11.0) 25 | ffi (>= 1.3.0) 26 | eventmachine (1.2.7) 27 | execjs (2.7.0) 28 | faraday (0.15.3) 29 | multipart-post (>= 1.2, < 3) 30 | ffi (1.9.25) 31 | forwardable-extended (2.6.0) 32 | gemoji (3.0.0) 33 | github-pages (193) 34 | activesupport (= 4.2.10) 35 | github-pages-health-check (= 1.8.1) 36 | jekyll (= 3.7.4) 37 | jekyll-avatar (= 0.6.0) 38 | jekyll-coffeescript (= 1.1.1) 39 | jekyll-commonmark-ghpages (= 0.1.5) 40 | jekyll-default-layout (= 0.1.4) 41 | jekyll-feed (= 0.11.0) 42 | jekyll-gist (= 1.5.0) 43 | jekyll-github-metadata (= 2.9.4) 44 | jekyll-mentions (= 1.4.1) 45 | jekyll-optional-front-matter (= 0.3.0) 46 | jekyll-paginate (= 1.1.0) 47 | jekyll-readme-index (= 0.2.0) 48 | jekyll-redirect-from (= 0.14.0) 49 | jekyll-relative-links (= 0.5.3) 50 | jekyll-remote-theme (= 0.3.1) 51 | jekyll-sass-converter (= 1.5.2) 52 | jekyll-seo-tag (= 2.5.0) 53 | jekyll-sitemap (= 1.2.0) 54 | jekyll-swiss (= 0.4.0) 55 | jekyll-theme-architect (= 0.1.1) 56 | jekyll-theme-cayman (= 0.1.1) 57 | jekyll-theme-dinky (= 0.1.1) 58 | jekyll-theme-hacker (= 0.1.1) 59 | jekyll-theme-leap-day (= 0.1.1) 60 | jekyll-theme-merlot (= 0.1.1) 61 | jekyll-theme-midnight (= 0.1.1) 62 | jekyll-theme-minimal (= 0.1.1) 63 | jekyll-theme-modernist (= 0.1.1) 64 | jekyll-theme-primer (= 0.5.3) 65 | jekyll-theme-slate (= 0.1.1) 66 | jekyll-theme-tactile (= 0.1.1) 67 | jekyll-theme-time-machine (= 0.1.1) 68 | jekyll-titles-from-headings (= 0.5.1) 69 | jemoji (= 0.10.1) 70 | kramdown (= 1.17.0) 71 | liquid (= 4.0.0) 72 | listen (= 3.1.5) 73 | mercenary (~> 0.3) 74 | minima (= 2.5.0) 75 | nokogiri (>= 1.8.2, < 2.0) 76 | rouge (= 2.2.1) 77 | terminal-table (~> 1.4) 78 | github-pages-health-check (1.8.1) 79 | addressable (~> 2.3) 80 | dnsruby (~> 1.60) 81 | octokit (~> 4.0) 82 | public_suffix (~> 2.0) 83 | typhoeus (~> 1.3) 84 | html-pipeline (2.9.1) 85 | activesupport (>= 2) 86 | nokogiri (>= 1.4) 87 | http_parser.rb (0.6.0) 88 | i18n (0.9.5) 89 | concurrent-ruby (~> 1.0) 90 | jekyll (3.7.4) 91 | addressable (~> 2.4) 92 | colorator (~> 1.0) 93 | em-websocket (~> 0.5) 94 | i18n (~> 0.7) 95 | jekyll-sass-converter (~> 1.0) 96 | jekyll-watch (~> 2.0) 97 | kramdown (~> 1.14) 98 | liquid (~> 4.0) 99 | mercenary (~> 0.3.3) 100 | pathutil (~> 0.9) 101 | rouge (>= 1.7, < 4) 102 | safe_yaml (~> 1.0) 103 | jekyll-avatar (0.6.0) 104 | jekyll (~> 3.0) 105 | jekyll-coffeescript (1.1.1) 106 | coffee-script (~> 2.2) 107 | coffee-script-source (~> 1.11.1) 108 | jekyll-commonmark (1.2.0) 109 | commonmarker (~> 0.14) 110 | jekyll (>= 3.0, < 4.0) 111 | jekyll-commonmark-ghpages (0.1.5) 112 | commonmarker (~> 0.17.6) 113 | jekyll-commonmark (~> 1) 114 | rouge (~> 2) 115 | jekyll-default-layout (0.1.4) 116 | jekyll (~> 3.0) 117 | jekyll-feed (0.11.0) 118 | jekyll (~> 3.3) 119 | 
jekyll-gist (1.5.0) 120 | octokit (~> 4.2) 121 | jekyll-github-metadata (2.9.4) 122 | jekyll (~> 3.1) 123 | octokit (~> 4.0, != 4.4.0) 124 | jekyll-mentions (1.4.1) 125 | html-pipeline (~> 2.3) 126 | jekyll (~> 3.0) 127 | jekyll-optional-front-matter (0.3.0) 128 | jekyll (~> 3.0) 129 | jekyll-paginate (1.1.0) 130 | jekyll-readme-index (0.2.0) 131 | jekyll (~> 3.0) 132 | jekyll-redirect-from (0.14.0) 133 | jekyll (~> 3.3) 134 | jekyll-relative-links (0.5.3) 135 | jekyll (~> 3.3) 136 | jekyll-remote-theme (0.3.1) 137 | jekyll (~> 3.5) 138 | rubyzip (>= 1.2.1, < 3.0) 139 | jekyll-sass-converter (1.5.2) 140 | sass (~> 3.4) 141 | jekyll-seo-tag (2.5.0) 142 | jekyll (~> 3.3) 143 | jekyll-sitemap (1.2.0) 144 | jekyll (~> 3.3) 145 | jekyll-swiss (0.4.0) 146 | jekyll-theme-architect (0.1.1) 147 | jekyll (~> 3.5) 148 | jekyll-seo-tag (~> 2.0) 149 | jekyll-theme-cayman (0.1.1) 150 | jekyll (~> 3.5) 151 | jekyll-seo-tag (~> 2.0) 152 | jekyll-theme-dinky (0.1.1) 153 | jekyll (~> 3.5) 154 | jekyll-seo-tag (~> 2.0) 155 | jekyll-theme-hacker (0.1.1) 156 | jekyll (~> 3.5) 157 | jekyll-seo-tag (~> 2.0) 158 | jekyll-theme-leap-day (0.1.1) 159 | jekyll (~> 3.5) 160 | jekyll-seo-tag (~> 2.0) 161 | jekyll-theme-merlot (0.1.1) 162 | jekyll (~> 3.5) 163 | jekyll-seo-tag (~> 2.0) 164 | jekyll-theme-midnight (0.1.1) 165 | jekyll (~> 3.5) 166 | jekyll-seo-tag (~> 2.0) 167 | jekyll-theme-minimal (0.1.1) 168 | jekyll (~> 3.5) 169 | jekyll-seo-tag (~> 2.0) 170 | jekyll-theme-modernist (0.1.1) 171 | jekyll (~> 3.5) 172 | jekyll-seo-tag (~> 2.0) 173 | jekyll-theme-primer (0.5.3) 174 | jekyll (~> 3.5) 175 | jekyll-github-metadata (~> 2.9) 176 | jekyll-seo-tag (~> 2.0) 177 | jekyll-theme-slate (0.1.1) 178 | jekyll (~> 3.5) 179 | jekyll-seo-tag (~> 2.0) 180 | jekyll-theme-tactile (0.1.1) 181 | jekyll (~> 3.5) 182 | jekyll-seo-tag (~> 2.0) 183 | jekyll-theme-time-machine (0.1.1) 184 | jekyll (~> 3.5) 185 | jekyll-seo-tag (~> 2.0) 186 | jekyll-titles-from-headings (0.5.1) 187 | jekyll (~> 3.3) 188 | jekyll-watch (2.1.2) 189 | listen (~> 3.0) 190 | jemoji (0.10.1) 191 | gemoji (~> 3.0) 192 | html-pipeline (~> 2.2) 193 | jekyll (~> 3.0) 194 | kramdown (1.17.0) 195 | liquid (4.0.0) 196 | listen (3.1.5) 197 | rb-fsevent (~> 0.9, >= 0.9.4) 198 | rb-inotify (~> 0.9, >= 0.9.7) 199 | ruby_dep (~> 1.2) 200 | mercenary (0.3.6) 201 | mini_portile2 (2.3.0) 202 | minima (2.5.0) 203 | jekyll (~> 3.5) 204 | jekyll-feed (~> 0.9) 205 | jekyll-seo-tag (~> 2.1) 206 | minitest (5.11.3) 207 | multipart-post (2.0.0) 208 | nokogiri (1.8.5) 209 | mini_portile2 (~> 2.3.0) 210 | octokit (4.13.0) 211 | sawyer (~> 0.8.0, >= 0.5.3) 212 | pathutil (0.16.2) 213 | forwardable-extended (~> 2.6) 214 | public_suffix (2.0.5) 215 | rb-fsevent (0.10.3) 216 | rb-inotify (0.9.10) 217 | ffi (>= 0.5.0, < 2) 218 | rouge (2.2.1) 219 | ruby-enum (0.7.2) 220 | i18n 221 | ruby_dep (1.5.0) 222 | rubyzip (1.2.2) 223 | safe_yaml (1.0.4) 224 | sass (3.7.2) 225 | sass-listen (~> 4.0.0) 226 | sass-listen (4.0.0) 227 | rb-fsevent (~> 0.9, >= 0.9.4) 228 | rb-inotify (~> 0.9, >= 0.9.7) 229 | sawyer (0.8.1) 230 | addressable (>= 2.3.5, < 2.6) 231 | faraday (~> 0.8, < 1.0) 232 | terminal-table (1.8.0) 233 | unicode-display_width (~> 1.1, >= 1.1.1) 234 | thread_safe (0.3.6) 235 | typhoeus (1.3.1) 236 | ethon (>= 0.9.0) 237 | tzinfo (1.2.5) 238 | thread_safe (~> 0.1) 239 | unicode-display_width (1.4.0) 240 | 241 | PLATFORMS 242 | ruby 243 | 244 | DEPENDENCIES 245 | github-pages 246 | jekyll (~> 3.7.4) 247 | jekyll-feed (~> 0.6) 248 | minima (~> 2.0) 249 | tzinfo-data 250 | 251 | 
BUNDLED WITH 252 | 1.17.1 253 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 2 | Public License 3 | 4 | By exercising the Licensed Rights (defined below), You accept and agree 5 | to be bound by the terms and conditions of this Creative Commons 6 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 7 | ("Public License"). To the extent this Public License may be 8 | interpreted as a contract, You are granted the Licensed Rights in 9 | consideration of Your acceptance of these terms and conditions, and the 10 | Licensor grants You such rights in consideration of benefits the 11 | Licensor receives from making the Licensed Material available under 12 | these terms and conditions. 13 | 14 | 15 | Section 1 -- Definitions. 16 | 17 | a. Adapted Material means material subject to Copyright and Similar 18 | Rights that is derived from or based upon the Licensed Material 19 | and in which the Licensed Material is translated, altered, 20 | arranged, transformed, or otherwise modified in a manner requiring 21 | permission under the Copyright and Similar Rights held by the 22 | Licensor. For purposes of this Public License, where the Licensed 23 | Material is a musical work, performance, or sound recording, 24 | Adapted Material is always produced where the Licensed Material is 25 | synched in timed relation with a moving image. 26 | 27 | b. Adapter's License means the license You apply to Your Copyright 28 | and Similar Rights in Your contributions to Adapted Material in 29 | accordance with the terms and conditions of this Public License. 30 | 31 | c. BY-NC-SA Compatible License means a license listed at 32 | creativecommons.org/compatiblelicenses, approved by Creative 33 | Commons as essentially the equivalent of this Public License. 34 | 35 | d. Copyright and Similar Rights means copyright and/or similar rights 36 | closely related to copyright including, without limitation, 37 | performance, broadcast, sound recording, and Sui Generis Database 38 | Rights, without regard to how the rights are labeled or 39 | categorized. For purposes of this Public License, the rights 40 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 41 | Rights. 42 | 43 | e. Effective Technological Measures means those measures that, in the 44 | absence of proper authority, may not be circumvented under laws 45 | fulfilling obligations under Article 11 of the WIPO Copyright 46 | Treaty adopted on December 20, 1996, and/or similar international 47 | agreements. 48 | 49 | f. Exceptions and Limitations means fair use, fair dealing, and/or 50 | any other exception or limitation to Copyright and Similar Rights 51 | that applies to Your use of the Licensed Material. 52 | 53 | g. License Elements means the license attributes listed in the name 54 | of a Creative Commons Public License. The License Elements of this 55 | Public License are Attribution, NonCommercial, and ShareAlike. 56 | 57 | h. Licensed Material means the artistic or literary work, database, 58 | or other material to which the Licensor applied this Public 59 | License. 60 | 61 | i. 
Licensed Rights means the rights granted to You subject to the 62 | terms and conditions of this Public License, which are limited to 63 | all Copyright and Similar Rights that apply to Your use of the 64 | Licensed Material and that the Licensor has authority to license. 65 | 66 | j. Licensor means the individual(s) or entity(ies) granting rights 67 | under this Public License. 68 | 69 | k. NonCommercial means not primarily intended for or directed towards 70 | commercial advantage or monetary compensation. For purposes of 71 | this Public License, the exchange of the Licensed Material for 72 | other material subject to Copyright and Similar Rights by digital 73 | file-sharing or similar means is NonCommercial provided there is 74 | no payment of monetary compensation in connection with the 75 | exchange. 76 | 77 | l. Share means to provide material to the public by any means or 78 | process that requires permission under the Licensed Rights, such 79 | as reproduction, public display, public performance, distribution, 80 | dissemination, communication, or importation, and to make material 81 | available to the public including in ways that members of the 82 | public may access the material from a place and at a time 83 | individually chosen by them. 84 | 85 | m. Sui Generis Database Rights means rights other than copyright 86 | resulting from Directive 96/9/EC of the European Parliament and of 87 | the Council of 11 March 1996 on the legal protection of databases, 88 | as amended and/or succeeded, as well as other essentially 89 | equivalent rights anywhere in the world. 90 | 91 | n. You means the individual or entity exercising the Licensed Rights 92 | under this Public License. Your has a corresponding meaning. 93 | 94 | 95 | Section 2 -- Scope. 96 | 97 | a. License grant. 98 | 99 | 1. Subject to the terms and conditions of this Public License, 100 | the Licensor hereby grants You a worldwide, royalty-free, 101 | non-sublicensable, non-exclusive, irrevocable license to 102 | exercise the Licensed Rights in the Licensed Material to: 103 | 104 | a. reproduce and Share the Licensed Material, in whole or 105 | in part, for NonCommercial purposes only; and 106 | 107 | b. produce, reproduce, and Share Adapted Material for 108 | NonCommercial purposes only. 109 | 110 | 2. Exceptions and Limitations. For the avoidance of doubt, where 111 | Exceptions and Limitations apply to Your use, this Public 112 | License does not apply, and You do not need to comply with 113 | its terms and conditions. 114 | 115 | 3. Term. The term of this Public License is specified in Section 116 | 6(a). 117 | 118 | 4. Media and formats; technical modifications allowed. The 119 | Licensor authorizes You to exercise the Licensed Rights in 120 | all media and formats whether now known or hereafter created, 121 | and to make technical modifications necessary to do so. The 122 | Licensor waives and/or agrees not to assert any right or 123 | authority to forbid You from making technical modifications 124 | necessary to exercise the Licensed Rights, including 125 | technical modifications necessary to circumvent Effective 126 | Technological Measures. For purposes of this Public License, 127 | simply making modifications authorized by this Section 2(a) 128 | (4) never produces Adapted Material. 129 | 130 | 5. Downstream recipients. 131 | 132 | a. Offer from the Licensor -- Licensed Material. 
Every 133 | recipient of the Licensed Material automatically 134 | receives an offer from the Licensor to exercise the 135 | Licensed Rights under the terms and conditions of this 136 | Public License. 137 | 138 | b. Additional offer from the Licensor -- Adapted Material. 139 | Every recipient of Adapted Material from You 140 | automatically receives an offer from the Licensor to 141 | exercise the Licensed Rights in the Adapted Material 142 | under the conditions of the Adapter's License You apply. 143 | 144 | c. No downstream restrictions. You may not offer or impose 145 | any additional or different terms or conditions on, or 146 | apply any Effective Technological Measures to, the 147 | Licensed Material if doing so restricts exercise of the 148 | Licensed Rights by any recipient of the Licensed 149 | Material. 150 | 151 | 6. No endorsement. Nothing in this Public License constitutes or 152 | may be construed as permission to assert or imply that You 153 | are, or that Your use of the Licensed Material is, connected 154 | with, or sponsored, endorsed, or granted official status by, 155 | the Licensor or others designated to receive attribution as 156 | provided in Section 3(a)(1)(A)(i). 157 | 158 | b. Other rights. 159 | 160 | 1. Moral rights, such as the right of integrity, are not 161 | licensed under this Public License, nor are publicity, 162 | privacy, and/or other similar personality rights; however, to 163 | the extent possible, the Licensor waives and/or agrees not to 164 | assert any such rights held by the Licensor to the limited 165 | extent necessary to allow You to exercise the Licensed 166 | Rights, but not otherwise. 167 | 168 | 2. Patent and trademark rights are not licensed under this 169 | Public License. 170 | 171 | 3. To the extent possible, the Licensor waives any right to 172 | collect royalties from You for the exercise of the Licensed 173 | Rights, whether directly or through a collecting society 174 | under any voluntary or waivable statutory or compulsory 175 | licensing scheme. In all other cases the Licensor expressly 176 | reserves any right to collect such royalties, including when 177 | the Licensed Material is used other than for NonCommercial 178 | purposes. 179 | 180 | 181 | Section 3 -- License Conditions. 182 | 183 | Your exercise of the Licensed Rights is expressly made subject to the 184 | following conditions. 185 | 186 | a. Attribution. 187 | 188 | 1. If You Share the Licensed Material (including in modified 189 | form), You must: 190 | 191 | a. retain the following if it is supplied by the Licensor 192 | with the Licensed Material: 193 | 194 | i. identification of the creator(s) of the Licensed 195 | Material and any others designated to receive 196 | attribution, in any reasonable manner requested by 197 | the Licensor (including by pseudonym if 198 | designated); 199 | 200 | ii. a copyright notice; 201 | 202 | iii. a notice that refers to this Public License; 203 | 204 | iv. a notice that refers to the disclaimer of 205 | warranties; 206 | 207 | v. a URI or hyperlink to the Licensed Material to the 208 | extent reasonably practicable; 209 | 210 | b. indicate if You modified the Licensed Material and 211 | retain an indication of any previous modifications; and 212 | 213 | c. indicate the Licensed Material is licensed under this 214 | Public License, and include the text of, or the URI or 215 | hyperlink to, this Public License. 216 | 217 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any 218 | reasonable manner based on the medium, means, and context in 219 | which You Share the Licensed Material. For example, it may be 220 | reasonable to satisfy the conditions by providing a URI or 221 | hyperlink to a resource that includes the required 222 | information. 223 | 3. If requested by the Licensor, You must remove any of the 224 | information required by Section 3(a)(1)(A) to the extent 225 | reasonably practicable. 226 | 227 | b. ShareAlike. 228 | 229 | In addition to the conditions in Section 3(a), if You Share 230 | Adapted Material You produce, the following conditions also apply. 231 | 232 | 1. The Adapter's License You apply must be a Creative Commons 233 | license with the same License Elements, this version or 234 | later, or a BY-NC-SA Compatible License. 235 | 236 | 2. You must include the text of, or the URI or hyperlink to, the 237 | Adapter's License You apply. You may satisfy this condition 238 | in any reasonable manner based on the medium, means, and 239 | context in which You Share Adapted Material. 240 | 241 | 3. You may not offer or impose any additional or different terms 242 | or conditions on, or apply any Effective Technological 243 | Measures to, Adapted Material that restrict exercise of the 244 | rights granted under the Adapter's License You apply. 245 | 246 | 247 | Section 4 -- Sui Generis Database Rights. 248 | 249 | Where the Licensed Rights include Sui Generis Database Rights that 250 | apply to Your use of the Licensed Material: 251 | 252 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 253 | to extract, reuse, reproduce, and Share all or a substantial 254 | portion of the contents of the database for NonCommercial purposes 255 | only; 256 | 257 | b. if You include all or a substantial portion of the database 258 | contents in a database in which You have Sui Generis Database 259 | Rights, then the database in which You have Sui Generis Database 260 | Rights (but not its individual contents) is Adapted Material, 261 | including for purposes of Section 3(b); and 262 | 263 | c. You must comply with the conditions in Section 3(a) if You Share 264 | all or a substantial portion of the contents of the database. 265 | 266 | For the avoidance of doubt, this Section 4 supplements and does not 267 | replace Your obligations under this Public License where the Licensed 268 | Rights include other Copyright and Similar Rights. 269 | 270 | 271 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 272 | 273 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 274 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 275 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 276 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 277 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 278 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 279 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 280 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 281 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 282 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 283 | 284 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 285 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 286 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 287 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 288 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 289 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 290 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 291 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 292 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 293 | 294 | c. The disclaimer of warranties and limitation of liability provided 295 | above shall be interpreted in a manner that, to the extent 296 | possible, most closely approximates an absolute disclaimer and 297 | waiver of all liability. 298 | 299 | 300 | Section 6 -- Term and Termination. 301 | 302 | a. This Public License applies for the term of the Copyright and 303 | Similar Rights licensed here. However, if You fail to comply with 304 | this Public License, then Your rights under this Public License 305 | terminate automatically. 306 | 307 | b. Where Your right to use the Licensed Material has terminated under 308 | Section 6(a), it reinstates: 309 | 310 | 1. automatically as of the date the violation is cured, provided 311 | it is cured within 30 days of Your discovery of the 312 | violation; or 313 | 314 | 2. upon express reinstatement by the Licensor. 315 | 316 | For the avoidance of doubt, this Section 6(b) does not affect any 317 | right the Licensor may have to seek remedies for Your violations 318 | of this Public License. 319 | 320 | c. For the avoidance of doubt, the Licensor may also offer the 321 | Licensed Material under separate terms or conditions or stop 322 | distributing the Licensed Material at any time; however, doing so 323 | will not terminate this Public License. 324 | 325 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 326 | License. 327 | 328 | 329 | Section 7 -- Other Terms and Conditions. 330 | 331 | a. The Licensor shall not be bound by any additional or different 332 | terms or conditions communicated by You unless expressly agreed. 333 | 334 | b. Any arrangements, understandings, or agreements regarding the 335 | Licensed Material not stated herein are separate from and 336 | independent of the terms and conditions of this Public License. 337 | 338 | 339 | Section 8 -- Interpretation. 340 | 341 | a. For the avoidance of doubt, this Public License does not, and 342 | shall not be interpreted to, reduce, limit, restrict, or impose 343 | conditions on any use of the Licensed Material that could lawfully 344 | be made without permission under this Public License. 345 | 346 | b. To the extent possible, if any provision of this Public License is 347 | deemed unenforceable, it shall be automatically reformed to the 348 | minimum extent necessary to make it enforceable. If the provision 349 | cannot be reformed, it shall be severed from this Public License 350 | without affecting the enforceability of the remaining terms and 351 | conditions. 352 | 353 | c. No term or condition of this Public License will be waived and no 354 | failure to comply consented to unless expressly agreed to by the 355 | Licensor. 356 | 357 | d. 
Nothing in this Public License constitutes or may be interpreted 358 | as a limitation upon, or waiver of, any privileges and immunities 359 | that apply to the Licensor or You, including from the legal 360 | processes of any jurisdiction or authority. 361 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Investigating Graph Embedding Neural Networks with Unsupervised Features Extraction for Binary Analysis 2 | This repository contains the code to reproduce the experiments of the paper accepted at the Workshop on Binary 3 | Analysis Research (BAR) 2019. https://ruoyuwang.me/bar2019/pdfs/bar2019-paper20.pdf 4 | 5 | ## Tasks 6 | 7 | You can use the code to solve two different tasks: 8 | 9 | - Binary Similarity with function embeddings. 10 | - Compiler Provenance 11 | 12 | ## Reproducing the experiments 13 | 14 | Following these steps you will be able to reproduce the experiments of the paper! 15 | 16 | ### Install the requirements 17 | 18 | ``` 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | ### Download datasets 23 | First you need to download at least one of the datasets. 24 | We release the three datasets used in the paper: 25 | 26 | - **OpenSSL_dataset**: It includes two versions of the OpenSSL library compiled for X86 and ARM with 27 | gcc, with optimization levels from O0 to O3. It has been used for the binary similarity task. 28 | To download it: 29 | 30 | ``` 31 | python downloader.py -op 32 | ``` 33 | 34 | 35 | - **Restricted_Compiler_Dataset**: It includes different projects compiled for X86 with gcc-3, gcc-5 and 36 | clang-3.9, with optimization levels from O0 to O3. It has been used for the compiler provenance task. To download it: 37 | 38 | ``` 39 | python downloader.py -rc 40 | ``` 41 | 42 | - **Compiler_Dataset**: It includes different projects compiled for X86 with different compilers (see the paper), with 43 | optimization levels from O0 to O3. It has been used for the compiler provenance task. This dataset is very large: 44 | you need 30 GB of free space to download it. To download it: 45 | 46 | ``` 47 | python downloader.py -c 48 | ``` 49 | 50 | 51 | ### Download word2vec model for asm 52 | 53 | Before running the experiments you need to download the word2vec model for asm. 54 | It consists of two files: the embedding matrix and the word2id file. The latter assigns an id to 55 | each instruction; the id corresponds to the row of the instruction inside the 56 | embedding matrix. 57 | 58 | ``` 59 | python downloader.py -i2v 60 | ``` 61 | 62 | 63 | ### Binary Similarity 64 | 65 | To train the network for the binary similarity task go into the binary_similarity folder and look at the file 66 | train.sh. 67 | 68 | Here you can change different parameters, such as the network architecture, the path for saving the trained model, 69 | the database you want to use for training, and the embedding matrix for asm instructions. 70 | By default the script uses the data downloaded in the previous steps. 71 | 72 | If you want to change the hyperparameters of the network take a look at the parameters.py file! 
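For reference, the main defaults set in that file are summarized below. This is only an orientation sketch: the values are copied from the Flags class in binary_similarity/parameters.py, and the dictionary itself is not read by any script in the repository.

```
# Reference only: these mirror the defaults hard-coded in binary_similarity/parameters.py (class Flags).
DEFAULT_HYPERPARAMETERS = {
    "batch_size": 250,        # minibatch size
    "num_epochs": 50,         # number of training epochs
    "embedding_size": 64,     # dimension of the graph embedding
    "learning_rate": 0.001,   # initial learning rate for the Adam optimizer
    "max_lv": 2,              # embedding depth of the mean-field network
    "T_iterations": 2,        # rounds of message passing
    "max_instructions": 150,  # instructions kept per CFG node
    "MAX_NUM_VERTICES": 150,  # maximum number of CFG nodes per function
    "rnn_depth": 2,           # depth of the RNN (RNN network type only)
    "rnn_kind": 0,            # 0: LSTM cell, 1: GRU cell
}
```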
73 | 74 | To start the training just run: 75 | 76 | ``` 77 | export PYTHONPATH=path-to-repository 78 | cd binary_similarity 79 | chmod +x train.sh 80 | ./train.sh 81 | ``` 82 | 83 | ### Compiler provenance 84 | 85 | Like in the previous case just run: 86 | 87 | ``` 88 | export PYTHONPATH=path-to-repository 89 | cd compiler_provenance 90 | chmod +x train.sh 91 | ./train.sh 92 | ``` 93 | 94 | ## Creating your own dataset 95 | 96 | Following these steps you will be able to create your own dataset! 97 | 98 | - Install radare2 on your system. 99 | 100 | - Put the executables you want to add to your dataset inside a directory tree as follows: 101 | 102 | ``` 103 | dataset_root/ 104 | \ 105 | \--project/ 106 | \--compiler 107 | \--optimization 108 | \executables 109 | ``` 110 | 111 | For example you will end up with a tree like: 112 | 113 | ``` 114 | my_dataset/ 115 | \ 116 | \--openSSL/ 117 | \--gcc-3 118 | \--O1 119 | \executables 120 | \--O0 121 | \executables 122 | \--binutil/ 123 | \--gcc-3 124 | \--O1 125 | \executables 126 | \--gcc-5 127 | \--O1 128 | \executables 129 | ``` 130 | 131 | - Once you have your executables in the correct paths just launch: 132 | 133 | ``` 134 | python dataset_creation/ExperimentUtil.py -db name_of_the_db -b --dir dataset_root [-s (if you want to use debug symbols)] 135 | ``` 136 | 137 | - To split your dataset into train, validation and test sets you can use the following command: 138 | 139 | ``` 140 | python dataset_creation/ExperimentUtil.py -db name_of_the_db -s 141 | ``` 142 | 143 | 144 | 145 | 146 | ## Citation 147 | If you use this repository or the datasets for your project please cite: 148 | 149 | Massarelli L., Di Luna G. A., Petroni F., Querzoni L., Baldoni R. Investigating Graph Embedding Neural Networks with Unsupervised Features Extraction for Binary Analysis. To Appear in: Workshop on Binary Analysis Research (BAR) colocated with Symposium on Network and Distributed System Security (NDSS). 2019. 150 | 151 | ## Acknowledgement 152 | 153 | In our code we use godown to download data from Google Drive. We thank circulosmeos, the creator of godown. 
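As a bridge to the source files that follow, here is a minimal sketch of how the i2v files downloaded above are consumed by the helpers in asm_embedding/. The word2id path and the instruction strings are illustrative assumptions; the classes and methods are the ones defined in asm_embedding/InstructionsConverter.py and asm_embedding/FunctionNormalizer.py below.

```
from asm_embedding.InstructionsConverter import InstructionsConverter
from asm_embedding.FunctionNormalizer import FunctionNormalizer

# Assumed path: point it to the word2id file obtained with `python downloader.py -i2v`.
converter = InstructionsConverter("data/i2v/word2id.json")
normalizer = FunctionNormalizer(max_instruction=150)  # 150 matches max_instructions in parameters.py

# Hypothetical mnemonics: the 'X_' prefix marks x86 instructions, 'A_' marks ARM ones;
# anything not in the vocabulary falls back to the X_UNK / A_UNK entries.
ids = converter.convert_to_ids(["X_mov", "X_xor", "X_ret"])
padded, length = normalizer.normalize(ids)  # zero-padded (or truncated) to 150 ids, plus the original length
```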
154 | 155 | -------------------------------------------------------------------------------- /asm_embedding/FunctionNormalizer.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import numpy as np 7 | 8 | 9 | class FunctionNormalizer: 10 | 11 | def __init__(self, max_instruction): 12 | self.max_instructions = max_instruction 13 | 14 | def normalize(self, f): 15 | f = np.asarray(f[0:self.max_instructions]) 16 | length = f.shape[0] 17 | if f.shape[0] < self.max_instructions: 18 | f = np.pad(f, (0, self.max_instructions - f.shape[0]), mode='constant') 19 | return f, length 20 | 21 | def normalize_function_pairs(self, pairs): 22 | lengths = [] 23 | new_pairs = [] 24 | for x in pairs: 25 | f0, len0 = self.normalize(x[0]) 26 | f1, len1 = self.normalize(x[1]) 27 | lengths.append((len0, len1)) 28 | new_pairs.append((f0, f1)) 29 | return new_pairs, lengths 30 | 31 | def normalize_functions(self, functions): 32 | lengths = [] 33 | new_functions = [] 34 | for f in functions: 35 | f, length = self.normalize(f) 36 | lengths.append(length) 37 | new_functions.append(f) 38 | return new_functions, lengths 39 | -------------------------------------------------------------------------------- /asm_embedding/InstructionsConverter.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import json 7 | 8 | 9 | class InstructionsConverter: 10 | 11 | def __init__(self, json_i2id): 12 | f = open(json_i2id, 'r') 13 | self.i2id = json.load(f) 14 | f.close() 15 | 16 | def convert_to_ids(self, instructions_list): 17 | ret_array = [] 18 | # For each instruction we add +1 to its ID because the first 19 | # element of the embedding matrix is zero 20 | for x in instructions_list: 21 | if x in self.i2id: 22 | ret_array.append(self.i2id[x] + 1) 23 | elif 'X_' in x: 24 | # print(str(x) + " is not a known x86 instruction") 25 | ret_array.append(self.i2id['X_UNK'] + 1) 26 | elif 'A_' in x: 27 | # print(str(x) + " is not a known arm instruction") 28 | ret_array.append(self.i2id['A_UNK'] + 1) 29 | else: 30 | # print("There is a problem " + str(x) + " does not appear to be an asm or arm instruction") 31 | ret_array.append(self.i2id['X_UNK'] + 1) 32 | return ret_array 33 | 34 | 35 | -------------------------------------------------------------------------------- /asm_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # -------------------------------------------------------------------------------- /binary_similarity/PairFactory.py: -------------------------------------------------------------------------------- 1 | import json 2 | from multiprocessing import Queue 3 | import networkx as nx 4 | from networkx import json_graph 5 | import numpy as np 6 | from scipy import sparse 7 | import sqlite3 8 | from threading import Thread 9 | from binary_similarity.utils import __padAndFilterLSTM as padAndFilterLSTM 10 | from binary_similarity.utils import __padAndFilter as padAndFilter 11 | from asm_embedding.InstructionsConverter import InstructionsConverter 12 | from 
asm_embedding.FunctionNormalizer import FunctionNormalizer 13 | 14 | class DatasetGenerator: 15 | 16 | def get_dataset(self, epoch_number): 17 | pass 18 | 19 | 20 | class PairFactory(DatasetGenerator): 21 | 22 | def __init__(self, db_name, feature_type, dataset_type, json_asm2id, max_instructions, max_num_vertices): 23 | self.db_name = db_name 24 | self.feature_type = feature_type 25 | self.dataset_type = dataset_type 26 | self.max_instructions = max_instructions 27 | self.max_num_vertices = max_num_vertices 28 | self.batch_dim = 0 29 | self.num_pairs = 0 30 | self.num_batches = 0 31 | self.converter = InstructionsConverter(json_asm2id) 32 | self.normalizer = FunctionNormalizer(self.max_instructions) 33 | 34 | def get_data_from_cfg(self, cfg): 35 | adj = sparse.csr_matrix([1,1]) 36 | lenghts = [] 37 | node_matrix = [] 38 | 39 | try: 40 | adj = nx.adjacency_matrix(cfg) 41 | nodes = cfg.nodes(data=True) 42 | for i, n in enumerate(nodes): 43 | filtered = self.converter.convert_to_ids(n[1]['features']) 44 | lenghts.append(len(filtered)) 45 | node_matrix.append(self.normalizer.normalize(filtered)[0]) 46 | except: 47 | pass 48 | return adj, node_matrix, lenghts 49 | 50 | def remove_bad_acfg_node(self, g): 51 | nodeToRemove = [] 52 | for n in g.nodes(data=True): 53 | f = n[1]['features'] 54 | if len(f.keys()) == 0: 55 | nodeToRemove.append(n[0]) 56 | for n in nodeToRemove: 57 | g.remove_node(n) 58 | return g 59 | 60 | def get_node_matrix(self, nodes): 61 | num_node = len(nodes) 62 | node_matrix = np.zeros([num_node, 8]) 63 | for i, n in enumerate(nodes): 64 | f = n[1]['features'] 65 | if isinstance(f['constant'], list): 66 | node_matrix[i, 0] = len(f['constant']) 67 | else: 68 | node_matrix[i, 0] = f['constant'] 69 | if isinstance(f['string'], list): 70 | node_matrix[i, 1] = len(f['string']) 71 | else: 72 | node_matrix[i, 1] = f['string'] 73 | node_matrix[i, 2] = f['transfer'] 74 | node_matrix[i, 3] = f['call'] 75 | node_matrix[i, 4] = f['instruction'] 76 | node_matrix[i, 5] = f['arith'] 77 | node_matrix[i, 6] = f['offspring'] 78 | node_matrix[i, 7] = f['betweenness'] 79 | return node_matrix 80 | 81 | def get_data_from_acfg(self, g): 82 | g = self.remove_bad_acfg_node(g) 83 | if len(g.nodes) > 0: 84 | adj = nx.adjacency_matrix(g) 85 | node_matrix = self.get_node_matrix(g.nodes(data=True)) 86 | else: 87 | adj = sparse.bsr_matrix(np.zeros([1, 1])) 88 | node_matrix = np.zeros([1, 8]) 89 | lenght = 8 90 | return adj, node_matrix, lenght 91 | 92 | def async_chunker(self, epoch, number_of_pairs, shuffle=True): 93 | self.num_pairs = 0 94 | 95 | conn = sqlite3.connect(self.db_name) 96 | cur = conn.cursor() 97 | q = cur.execute("SELECT true_pair, false_pair from " + self.dataset_type + " WHERE id=?", (int(epoch),)) 98 | true_pairs_id, false_pairs_id = q.fetchone() 99 | true_pairs_id = json.loads(true_pairs_id) 100 | false_pairs_id = json.loads(false_pairs_id) 101 | 102 | assert len(true_pairs_id) == len(false_pairs_id) 103 | data_len = len(true_pairs_id) 104 | 105 | print("Data Len: " + str(data_len)) 106 | conn.close() 107 | 108 | n_chunk = int(data_len / (number_of_pairs/2)) - 1 109 | self.num_batches = n_chunk 110 | 111 | q = Queue(maxsize=50) 112 | 113 | t = Thread(target=self.async_create_pairs, args=(epoch, n_chunk, number_of_pairs, q)) 114 | t.start() 115 | 116 | for i in range(0, n_chunk): 117 | yield self.async_get_dataset(i, n_chunk, number_of_pairs, q, shuffle) 118 | 119 | def get_pair_from_db(self, epoch_number, chunk, number_of_pairs): 120 | 121 | conn = sqlite3.connect(self.db_name) 122 | 
cur = conn.cursor() 123 | 124 | pairs = [] 125 | labels = [] 126 | lenghts = [] 127 | 128 | q = cur.execute("SELECT true_pair, false_pair from " + self.dataset_type + " WHERE id=?", (int(epoch_number),)) 129 | true_pairs_id, false_pairs_id = q.fetchone() 130 | 131 | true_pairs_id = json.loads(true_pairs_id) 132 | false_pairs_id = json.loads(false_pairs_id) 133 | 134 | data_len = len(true_pairs_id) 135 | 136 | i = 0 137 | 138 | while i < number_of_pairs: 139 | if chunk * int(number_of_pairs/2) + i > data_len: 140 | break 141 | 142 | p = true_pairs_id[chunk * int(number_of_pairs/2) + i] 143 | q0 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[0],)) 144 | if self.feature_type == 'acfg': 145 | adj0, node0, lenghts0 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 146 | elif self.feature_type == 'lstm_cfg': 147 | adj0, node0, lenghts0 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 148 | 149 | q1 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[1],)) 150 | if self.feature_type == 'acfg': 151 | adj1, node1, lenghts1 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 152 | elif self.feature_type == 'lstm_cfg': 153 | adj1, node1, lenghts1 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q1.fetchone()[0]))) 154 | 155 | pairs.append(((adj0, node0), (adj1, node1))) 156 | lenghts.append([lenghts0, lenghts1]) 157 | labels.append(+1) 158 | 159 | p = false_pairs_id[chunk * int(number_of_pairs/2) + i] 160 | q0 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[0],)) 161 | if self.feature_type == 'acfg': 162 | adj0, node0,lenghts0 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 163 | elif self.feature_type == 'lstm_cfg': 164 | adj0, node0, lenghts0 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 165 | 166 | q1 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[1],)) 167 | if self.feature_type == 'acfg': 168 | adj1, node1, lenghts1 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 169 | elif self.feature_type == 'lstm_cfg': 170 | adj1, node1, lenghts1 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q1.fetchone()[0]))) 171 | 172 | pairs.append(((adj0, node0), (adj1, node1))) 173 | lenghts.append([lenghts0, lenghts1]) 174 | labels.append(-1) 175 | 176 | i += 2 177 | if self.feature_type == 'acfg': 178 | pairs, labels, output_len = padAndFilter(pairs, labels, self.max_num_vertices) 179 | elif self.feature_type == 'lstm_cfg': 180 | pairs, labels, output_len = padAndFilterLSTM(pairs, labels, lenghts, self.max_num_vertices) 181 | return pairs, labels, output_len 182 | 183 | def async_create_pairs(self, epoch, n_chunk, number_of_pairs, q): 184 | for i in range(0, n_chunk): 185 | pairs, y_, lenghts = self.get_pair_from_db(epoch, i, number_of_pairs) 186 | q.put((pairs, y_, lenghts), block=True) 187 | 188 | def async_get_dataset(self, chunk, n_chunk, number_of_pairs, q, shuffle): 189 | 190 | item = q.get() 191 | pairs = item[0] 192 | y_ = item[1] 193 | lenghts = item[2] 194 | 195 | assert (len(pairs) == len(y_)) 196 | n_samples = len(pairs) 197 | self.batch_dim = n_samples 198 | self.num_pairs += n_samples 199 | 200 | # Unpack the list 201 | graph1, graph2 = zip(*pairs) 202 | len1, len2 = zip(*lenghts) 203 
| adj1, nodes1 = zip(*graph1) 204 | adj2, nodes2 = zip(*graph2) 205 | 206 | if shuffle: 207 | shuffle_indices = np.random.permutation(np.arange(n_samples)) 208 | adj1 = np.array(adj1)[shuffle_indices] 209 | nodes1 = np.array(nodes1)[shuffle_indices] 210 | adj2 = np.array(adj2)[shuffle_indices] 211 | nodes2 = np.array(nodes2)[shuffle_indices] 212 | y_ = np.array(y_)[shuffle_indices] 213 | 214 | for i in range(0, n_samples, number_of_pairs): 215 | upper_bound = min(i + number_of_pairs, n_samples) 216 | 217 | ret_adj1 = adj1[i:upper_bound] 218 | ret_nodes1 = nodes1[i:upper_bound] 219 | ret_len1=len1[i:upper_bound] 220 | ret_adj2 = adj2[i:upper_bound] 221 | ret_nodes2 = nodes2[i:upper_bound] 222 | ret_len2 = len2[i:upper_bound] 223 | ret_y = y_[i:upper_bound] 224 | 225 | return ret_adj1, ret_nodes1, ret_adj2, ret_nodes2, ret_y, ret_len1, ret_len2 -------------------------------------------------------------------------------- /binary_similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # -------------------------------------------------------------------------------- /binary_similarity/parameters.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import argparse 8 | import time 9 | import os 10 | import logging 11 | 12 | def getLogger(logfile): 13 | logger = logging.getLogger(__name__) 14 | hdlr = logging.FileHandler(logfile) 15 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 16 | hdlr.setFormatter(formatter) 17 | logger.addHandler(hdlr) 18 | logger.setLevel(logging.INFO) 19 | return logger, hdlr 20 | 21 | class Flags: 22 | 23 | def __init__(self): 24 | parser = argparse.ArgumentParser(description=' cryptoarb.') 25 | 26 | parser.add_argument("-o", "--output", dest="output_file", help="output directory for logging and models", required=False) 27 | parser.add_argument("-e", "--embedding_matrix", dest="embedding_matrix", help="file with the embedding matrix for the instructions",required=False) 28 | parser.add_argument("-j", "--json_asm2id", dest="json_asm2id",help="file with the dictionary of instructions ids", required=False) 29 | parser.add_argument("-n", "--dbName", dest="db_name", help="Name of the database", required=False) 30 | parser.add_argument("-ld","--load_dir", dest="load_dir", help="Load the model from directory load_dir", required=False) 31 | parser.add_argument("-nn","--network_type", help="network type: Arith_Mean, Weighted_Mean, RNN, CCS", required=True, dest="network_type") 32 | parser.add_argument("-r", "--random", help="if present the network use random embedder", default=False, action="store_true", dest="random_embedding", required=False) 33 | parser.add_argument("-te","--trainable_embedding", help="if present the network consider the embedding as trainable", action="store_true", dest="trainable_embeddings", default=False) 34 | parser.add_argument("-cv","--cross_val", help="if present the training is done with cross validiation", default=False, action="store_true", dest="cross_val") 35 | 36 | args = parser.parse_args() 37 | self.network_type = args.network_type 38 | 39 | if self.network_type == "Annotations": 40 | self.feature_type = 'acfg' 41 | elif 
self.network_type in ["Arith_Mean", "Attention_Mean", "RNN"]: 42 | self.feature_type = 'lstm_cfg' 43 | else: 44 | print("ERROR NETWORK NOT FOUND") 45 | exit(0) 46 | 47 | self.batch_size = 250 # minibatch size (-1 = whole dataset) 48 | self.num_epochs = 50 # number of epochs 49 | self.embedding_size = 64 # dimension of latent layers 50 | self.learning_rate = 0.001 # init learning_rate 51 | self.max_lv = 2 # embedd depth 52 | self.T_iterations= 2 # max rounds of message passing 53 | self.l2_reg_lambda = 0 # 0.002 #0.002 # regularization coefficient 54 | self.num_checkpoints = 1 # max number of checkpoints 55 | self.out_dir = args.output_file # directory for logging 56 | self.db_name = args.db_name 57 | self.load_dir=str(args.load_dir) 58 | self.random_embedding = args.random_embedding 59 | self.trainable_embeddings = args.trainable_embeddings 60 | self.cross_val = args.cross_val 61 | self.cross_val_fold = 5 62 | 63 | self.rnn_depth = 2 # depth of the rnn 64 | self.max_instructions = 150 # number of instructions 65 | self.rnn_kind = 0 #kind of rnn cell 0: lstm cell 1: GRU cell 66 | 67 | self.seed = 2 # random seed 68 | 69 | self.reset_logdir() 70 | 71 | self.file_embedding_matrix = args.embedding_matrix 72 | self.json_asm2id = args.json_asm2id 73 | 74 | self.MAX_NUM_VERTICES = 150 75 | self.MIN_NUM_VERTICES = 1 76 | 77 | def reset_logdir(self): 78 | # create logdir 79 | timestamp = str(int(time.time())) 80 | self.logdir = os.path.abspath(os.path.join(self.out_dir, "runs", timestamp)) 81 | os.makedirs(self.logdir, exist_ok=True) 82 | 83 | # create logger 84 | self.log_file = str(self.logdir)+'/console.log' 85 | self.logger, self.hdlr = getLogger(self.log_file) 86 | 87 | # create symlink for last_run 88 | sym_path_logdir = str(self.out_dir)+"/last_run" 89 | try: 90 | os.unlink(sym_path_logdir) 91 | except: 92 | pass 93 | try: 94 | os.symlink(self.logdir, sym_path_logdir) 95 | except: 96 | print("\nfailed to create symlink!\n") 97 | 98 | def close_log(self): 99 | self.hdlr.close() 100 | self.logger.removeHandler(self.hdlr) 101 | handlers = self.logger.handlers[:] 102 | for handler in handlers: 103 | handler.close() 104 | self.logger.removeHandler(handler) 105 | 106 | def __str__(self): 107 | msg = "" 108 | msg +="\n Parameters:\n" 109 | msg +="\tNetwork_Type: {}\n".format(self.network_type) 110 | msg +="\tRandom embedding: {}\n".format(self.random_embedding) 111 | msg +="\tTrainable embedding: {}\n".format(self.trainable_embeddings) 112 | msg +="\tFeature Type: {}\n".format(self.feature_type) 113 | msg +="\tlogdir: {}\n".format(self.logdir) 114 | msg +="\tbatch_size: {}\n".format(self.batch_size) 115 | msg +="\tnum_epochs: {}\n".format(self.num_epochs) 116 | msg +="\tembedding_size: {}\n".format(self.embedding_size) 117 | msg +="\tlearning_rate: {}\n".format(self.learning_rate) 118 | msg +="\tmax_lv: {}\n".format(self.max_lv) 119 | msg +="\tT_iterations: {}\n".format(self.T_iterations) 120 | msg +="\tl2_reg_lambda: {}\n".format(self.l2_reg_lambda) 121 | msg +="\tnum_checkpoints: {}\n".format(self.num_checkpoints) 122 | msg +="\tseed: {}\n".format(self.seed) 123 | msg +="\tMAX_NUM_VERTICES: {}\n".format(self.MAX_NUM_VERTICES) 124 | msg += "\tMax Instructions per cfg node: {}\n".format(self.max_instructions) 125 | if self.network_type == "RNN" or self.network_type=="Attention": 126 | msg += "\tRNN type (0, lstm; 1, GRU): {}\n".format(self.rnn_kind) 127 | msg += "\tRNN Depth: {}\n".format(self.rnn_depth) 128 | if self.network_type== "Attention": 129 | msg += "\tAttention 
hops:{}\n".format(self.attention_hops) 130 | msg += "\tAttention depth:{}\n".format(self.attention_detph) 131 | if self.network_type=="RNN_SINGLE": 132 | msg += "\tAttention hops:{}\n".format(self.attention_hops) 133 | msg += "\tAttention depth:{}\n".format(self.attention_detph) 134 | msg += "\tDense Layer Size:{}\n".format(self.dense_layer_size) 135 | return msg 136 | -------------------------------------------------------------------------------- /binary_similarity/s2v_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | # structure2vec 5 | # DE-MF : discriminative embedding using Mean Field 6 | # SAFE TEAM 7 | # 8 | # 9 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 10 | # 11 | 12 | class Network: 13 | 14 | def __init__(self, 15 | features_size, 16 | embedding_size, 17 | max_lv, 18 | T_iterations, 19 | learning_rate, 20 | l2_reg_lambda 21 | ): 22 | self.features_size = features_size 23 | self.embedding_size = embedding_size 24 | self.max_lv = max_lv 25 | self.T_iterations = T_iterations 26 | self.learning_rate=learning_rate 27 | self.l2_reg_lambda = l2_reg_lambda 28 | self.generateGraphClassificationNetwork() 29 | 30 | def meanField(self, input_x, input_adj, name): 31 | 32 | W1_tiled = tf.tile(tf.expand_dims(self.W1,0), [tf.shape(input_x)[0],1,1], name=name + "_W1_tiled") 33 | W2_tiled = tf.tile(tf.expand_dims(self.W2,0), [tf.shape(input_x)[0],1,1], name=name + "_W2_tiled") 34 | 35 | CONV_PARAMS_tiled = [] 36 | for lv in range(self.max_lv): 37 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv],0), [tf.shape(input_x)[0],1,1], name=name + "_CONV_PARAMS_tiled_" + str(lv))) 38 | 39 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 40 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 41 | out=w1xv 42 | for i in range(self.T_iterations-1): 43 | ol = l 44 | lv = self.max_lv -1 45 | while lv >= 0 : 46 | with tf.name_scope('cell_' + str(lv)) as scope: 47 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 48 | if lv > 0: 49 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 50 | else: 51 | ol = node_linear 52 | lv -= 1 53 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 54 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 55 | 56 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, name=name + "_y_potential_expand_dims") 57 | 58 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 59 | return graph_embedding 60 | 61 | 62 | def generateGraphClassificationNetwork(self): 63 | 64 | # Placeholders for input, output 65 | self.x_1 = tf.placeholder(tf.float32,[None, None,self.features_size], name = "x_1") # Vettore del nodo in input 1 66 | self.adj_1 = tf.placeholder(tf.float32,[None, None, None],name="adj_1") # Matrice di adiacenza 1 67 | self.x_2 = tf.placeholder(tf.float32,[None, None,self.features_size], name = "x_2") # Vettore del nodo in input 2 68 | self.adj_2 = tf.placeholder(tf.float32,[None, None, None],name="adj_2") # Matrice di adiacenza 2 69 | self.y = tf.placeholder(tf.float32, [None], name='y_') 70 | 71 | self.lenghts_1 = tf.placeholder(tf.float32, [None], name="len1") 72 | self.lenghts_2 = tf.placeholder(tf.float32, [None], name="len2") 73 | 74 | self.norms = [] 75 | 76 | l2_loss = tf.constant(0.0) 77 | 78 | # 
------------------------------- 79 | # 1. MEAN FIELD COMPONENT 80 | # ------------------------------- 81 | 82 | #1. parameters for MeanField 83 | with tf.name_scope('parameters_MeanField'): 84 | 85 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 86 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size,self.embedding_size], stddev=0.1), name="W1") 87 | self.norms.append(tf.norm(self.W1)) 88 | 89 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 90 | self.CONV_PARAMS = [] 91 | for lv in range(self.max_lv): 92 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="CONV_PARAMS_"+str(lv)) 93 | self.CONV_PARAMS.append(v) 94 | self.norms.append(tf.norm(v)) 95 | 96 | # W2 is another [p,p] matrix to transform the embedding vector 97 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="W2") 98 | self.norms.append(tf.norm(self.W2)) 99 | 100 | # Mean Field 101 | with tf.name_scope('MeanField1'): 102 | self.graph_embedding_1 = tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_1,self.adj_1,"MeanField1"), axis=1), axis=1,name="embedding1") # tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_1,self.adj_1), axis=1), 1) 103 | 104 | with tf.name_scope('MeanField2'): 105 | self.graph_embedding_2 = tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_2,self.adj_2,"MeanField2"), axis=1), axis=1,name="embedding2") # tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_2,self.adj_2), axis=1), 1) 106 | 107 | with tf.name_scope('Siamese'): 108 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1,name="cosSimilarity") 109 | 110 | # Regularization 111 | with tf.name_scope("Regularization"): 112 | l2_loss += tf.nn.l2_loss(self.W1) 113 | for lv in range(self.max_lv): 114 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 115 | l2_loss += tf.nn.l2_loss(self.W2) 116 | 117 | # CalculateMean cross-entropy loss 118 | with tf.name_scope("Loss"): 119 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 120 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss 121 | 122 | # Train step 123 | with tf.name_scope("Train_Step"): 124 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 125 | -------------------------------------------------------------------------------- /binary_similarity/s2v_network_arith_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import tensorflow as tf 7 | 8 | 9 | # structure2vec 10 | # DE-MF : discriminative embedding using Mean Field 11 | 12 | 13 | class NetworkLSTM: 14 | 15 | def __init__(self, 16 | features_size, 17 | embedding_size, 18 | max_lv, 19 | T_iterations, 20 | learning_rate, 21 | l2_reg_lambda, 22 | batch_size, 23 | max_instructions, 24 | max_nodes, 25 | rnn_depth, 26 | rnn_kind, 27 | embedding_matrix, 28 | trainable_embeddings 29 | ): 30 | print("Features size"+str(features_size)) 31 | self.features_size = features_size 32 | self.embedding_size = embedding_size 33 | self.max_lv = max_lv 34 | self.T_iterations = T_iterations 35 | self.learning_rate = learning_rate 36 | self.l2_reg_lambda = l2_reg_lambda 37 | self.RRN_HIDDEN = features_size 38 | self.batch_size = batch_size 
39 | self.max_instructions = max_instructions 40 | self.max_nodes = max_nodes 41 | self.rnn_depth = rnn_depth 42 | self.rnn_kind=rnn_kind 43 | self.embedding_matrix = embedding_matrix 44 | self.trainable_embeddings = trainable_embeddings 45 | self.generateGraphClassificationNetwork() 46 | 47 | def extract_axis_1(self, data, ind): 48 | """ 49 | Get specified elements along the first axis of tensor. 50 | :param data: Tensorflow tensor that will be subsetted. 51 | :param ind: Indices to take (one for each element along axis 0 of data). 52 | :return: Subsetted tensor. 53 | """ 54 | ind=tf.nn.relu(ind-1) 55 | batch_range = tf.range(tf.shape(data)[0]) 56 | indices = tf.stack([batch_range, ind], axis=1) 57 | res = tf.gather_nd(data, indices) 58 | 59 | return res 60 | 61 | def create_flattening_array(self, max_nodes, batch_size): 62 | shape_array = [] 63 | for p in range(0, batch_size): 64 | for i in range(0, max_nodes): 65 | shape_array.append([p, i]) 66 | return shape_array 67 | 68 | def create_gather_array(self, max_nodes, batch_size): 69 | shape_array = [] 70 | for p in range(0, batch_size): 71 | x = [] 72 | for i in range(0, max_nodes): 73 | x.append([0, i + p * max_nodes]) 74 | shape_array.append(x) 75 | return shape_array 76 | 77 | def lstmFeatures(self, input_x, lengths): 78 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 79 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 80 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.reduce_mean(flattened_embedded, name='arith_mean', axis=1), axis=1)) 81 | print("shape: " + str(tf.shape(last_outputs))) 82 | gather_output2 = tf.reshape(last_outputs, [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 83 | output = tf.identity(gather_output2, name="LSTMOutput") 84 | output = tf.nn.l2_normalize(output) 85 | return output 86 | 87 | def meanField(self, input_x, input_adj, name): 88 | 89 | # for batch processing 90 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 91 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 92 | 93 | CONV_PARAMS_tiled = [] 94 | for lv in range(self.max_lv): 95 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 96 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 97 | 98 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 99 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 100 | out = w1xv 101 | for i in range(self.T_iterations - 1): 102 | ol = l 103 | lv = self.max_lv - 1 104 | while lv >= 0: 105 | with tf.name_scope('cell_' + str(lv)) as scope: 106 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 107 | if lv > 0: 108 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 109 | else: 110 | ol = node_linear 111 | lv -= 1 112 | 113 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 114 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 115 | 116 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 117 | name=name + "_y_potential_expand_dims") 118 | 119 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 120 | return graph_embedding 121 | 122 | def generateGraphClassificationNetwork(self): 123 | print("Features size:"+str(self.features_size)) 124 | 125 | self.instruction_embeddings_t = 
tf.Variable(initial_value=tf.constant(self.embedding_matrix), 126 | trainable=self.trainable_embeddings, 127 | name="instruction_embedding", dtype=tf.float32) 128 | 129 | self.x_1 = tf.placeholder(tf.int32, [None, None, self.max_instructions],name="x_1") 130 | self.adj_1 = tf.placeholder(tf.float32, [None, None, None], name="adj_1") # 131 | self.lenghts_1 = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 132 | self.x_2 = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_2") 133 | self.adj_2 = tf.placeholder(tf.float32, [None, None, None], name="adj_2") 134 | self.lenghts_2 = tf.placeholder(tf.int32, [None,None], name='lenghts_2') 135 | self.y = tf.placeholder(tf.float32, [None], name='y_') 136 | 137 | # Euclidean norms; p = 2 138 | self.norms = [] 139 | 140 | l2_loss = tf.constant(0.0) 141 | 142 | # ------------------------------- 143 | # 1. MEAN FIELD COMPONENT 144 | # ------------------------------- 145 | 146 | # 1. parameters for MeanField 147 | with tf.name_scope('parameters_MeanField'): 148 | 149 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 150 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 151 | self.norms.append(tf.norm(self.W1)) 152 | 153 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 154 | self.CONV_PARAMS = [] 155 | for lv in range(self.max_lv): 156 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 157 | name="CONV_PARAMS_" + str(lv)) 158 | self.CONV_PARAMS.append(v) 159 | self.norms.append(tf.norm(v)) 160 | 161 | # W2 is another [p,p] matrix to transform the embedding vector 162 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 163 | name="W2") 164 | self.norms.append(tf.norm(self.W2)) 165 | 166 | # LSTMExtraction 167 | with tf.name_scope('LSTMExtraction1'): 168 | with tf.variable_scope('lstm1'): 169 | self.x_1_after_lstm = self.lstmFeatures(self.x_1, self.lenghts_1) 170 | with tf.name_scope('LSTMExtraction2'): 171 | with tf.variable_scope('lstm2'): 172 | self.x2_after_lstm = self.lstmFeatures(self.x_2, self.lenghts_2) 173 | 174 | # Mean Field 175 | with tf.name_scope('MeanField1'): 176 | self.graph_embedding_1 = tf.nn.l2_normalize( 177 | tf.squeeze(self.meanField(self.x_1_after_lstm, self.adj_1, "MeanField1"), axis=1), axis=1, 178 | name="embedding1") 179 | 180 | with tf.name_scope('MeanField2'): 181 | self.graph_embedding_2 = tf.nn.l2_normalize( 182 | tf.squeeze(self.meanField(self.x2_after_lstm, self.adj_2, "MeanField2"), axis=1), axis=1, 183 | name="embedding2") 184 | 185 | with tf.name_scope('Siamese'): 186 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1, 187 | name="cosSimilarity") 188 | 189 | # Regularization 190 | with tf.name_scope("Regularization"): 191 | l2_loss += tf.nn.l2_loss(self.W1) 192 | for lv in range(self.max_lv): 193 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 194 | l2_loss += tf.nn.l2_loss(self.W2) 195 | 196 | # CalculateMean cross-entropy loss 197 | with tf.name_scope("Loss"): 198 | 199 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 200 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 201 | 202 | # Train step 203 | with tf.name_scope("Train_Step"): 204 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 205 | 
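Note on the file above: lstmFeatures/meanField implement the arithmetic-mean variant of the structure2vec embedder — each basic block is the l2-normalised mean of its instruction embeddings, and meanField then runs T_iterations rounds of message passing over the CFG adjacency matrix before sum-pooling the node vectors into a graph embedding. The NumPy sketch below is illustration only: the helper name mean_field_numpy and the toy sizes are assumptions made for the example, but the update it performs mirrors meanField for a single, un-batched graph.

import numpy as np

def mean_field_numpy(nodes, adj, W1, conv_params, W2, T_iterations):
    # nodes: [num_nodes, d] block features, adj: [num_nodes, num_nodes] adjacency
    w1xv = nodes @ W1                     # per-node linear map -> [num_nodes, p]
    out = w1xv
    l = adj @ w1xv                        # aggregate messages from neighbours
    for _ in range(T_iterations - 1):
        ol = l
        for lv in reversed(range(len(conv_params))):
            ol = ol @ conv_params[lv]     # stacked CONV_PARAMS layers
            if lv > 0:
                ol = np.maximum(ol, 0.0)  # ReLU between levels, linear at level 0
        out = np.tanh(w1xv + ol)          # updated node embeddings mu_v
        l = adj @ out                     # propagate for the next round
    graph_vec = out.sum(axis=0) @ W2      # sum-pool the nodes, transform with W2
    return graph_vec / np.linalg.norm(graph_vec)  # l2-normalised graph embedding

# toy usage: d=8 node features, p=64 embedding size, 5 nodes, max_lv=2, T_iterations=2
rng = np.random.default_rng(0)
emb = mean_field_numpy(rng.normal(size=(5, 8)),
                       (rng.random((5, 5)) < 0.3).astype(float),
                       rng.normal(scale=0.1, size=(8, 64)),
                       [rng.normal(scale=0.1, size=(64, 64)) for _ in range(2)],
                       rng.normal(scale=0.1, size=(64, 64)),
                       T_iterations=2)
print(emb.shape)  # (64,)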
-------------------------------------------------------------------------------- /binary_similarity/s2v_network_attention_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | 10 | # structure2vec 11 | # DE-MF : discriminative embedding using Mean Field 12 | 13 | 14 | class NetworkLSTM: 15 | 16 | def __init__(self, 17 | features_size, 18 | embedding_size, 19 | max_lv, 20 | T_iterations, 21 | learning_rate, 22 | l2_reg_lambda, 23 | batch_size, 24 | max_instructions, 25 | max_nodes, 26 | rnn_depth, 27 | rnn_kind, 28 | embedding_matrix, 29 | trainable_embeddings 30 | ): 31 | print("Features size"+str(features_size)) 32 | self.features_size = features_size 33 | self.embedding_size = embedding_size 34 | self.max_lv = max_lv 35 | self.T_iterations = T_iterations 36 | self.learning_rate = learning_rate 37 | self.l2_reg_lambda = l2_reg_lambda 38 | self.RRN_HIDDEN = features_size 39 | self.batch_size = batch_size 40 | self.max_instructions = max_instructions 41 | self.max_nodes = max_nodes 42 | self.rnn_depth = rnn_depth 43 | self.rnn_kind=rnn_kind 44 | self.embedding_matrix = embedding_matrix 45 | self.trainable_embeddings = trainable_embeddings 46 | self.generateGraphClassificationNetwork() 47 | 48 | def extract_axis_1(self, data, ind): 49 | """ 50 | Get specified elements along the first axis of tensor. 51 | :param data: Tensorflow tensor that will be subsetted. 52 | :param ind: Indices to take (one for each element along axis 0 of data). 53 | :return: Subsetted tensor. 54 | """ 55 | ind=tf.nn.relu(ind-1) 56 | batch_range = tf.range(tf.shape(data)[0]) 57 | indices = tf.stack([batch_range, ind], axis=1) 58 | res = tf.gather_nd(data, indices) 59 | 60 | return res 61 | 62 | def create_flattening_array(self, max_nodes, batch_size): 63 | shape_array = [] 64 | for p in range(0, batch_size): 65 | for i in range(0, max_nodes): 66 | shape_array.append([p, i]) 67 | return shape_array 68 | 69 | def create_gather_array(self, max_nodes, batch_size): 70 | shape_array = [] 71 | for p in range(0, batch_size): 72 | x = [] 73 | for i in range(0, max_nodes): 74 | x.append([0, i + p * max_nodes]) 75 | shape_array.append(x) 76 | return shape_array 77 | 78 | def lstmFeatures(self, input_x, lengths): 79 | 80 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 81 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 82 | self.W0 = tf.Variable(tf.constant(1.0 / self.max_instructions, shape=[1, self.max_instructions]), name="W0") 83 | w0_tiled = tf.tile(tf.expand_dims(self.W0, 0), [tf.shape(flattened_inputs)[0], 1, 1], name="W0_tiled") 84 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.matmul(w0_tiled, flattened_embedded, 85 | name='features_weighted_mean')), axis=1) 86 | gather_output2 = tf.reshape(last_outputs, [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 87 | output = tf.identity(gather_output2, name="LSTMOutput") 88 | output=tf.nn.l2_normalize(output) 89 | return output 90 | 91 | def meanField(self, input_x, input_adj, name): 92 | 93 | # for batch processing 94 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 95 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 96 | 97 | 
CONV_PARAMS_tiled = [] 98 | for lv in range(self.max_lv): 99 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 100 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 101 | 102 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 103 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 104 | out = w1xv 105 | for i in range(self.T_iterations - 1): 106 | ol = l 107 | lv = self.max_lv - 1 108 | while lv >= 0: 109 | with tf.name_scope('cell_' + str(lv)) as scope: 110 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 111 | if lv > 0: 112 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 113 | else: 114 | ol = node_linear 115 | lv -= 1 116 | 117 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 118 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 119 | 120 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 121 | name=name + "_y_potential_expand_dims") 122 | 123 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 124 | return graph_embedding 125 | 126 | def generateGraphClassificationNetwork(self): 127 | print("Features size:"+str(self.features_size)) 128 | 129 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 130 | trainable=self.trainable_embeddings, 131 | name="instruction_embedding", dtype=tf.float32) 132 | 133 | self.x_1 = tf.placeholder(tf.int32, [None, None, self.max_instructions],name="x_1") 134 | self.adj_1 = tf.placeholder(tf.float32, [None, None, None], name="adj_1") # 135 | self.lenghts_1 = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 136 | self.x_2 = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_2") 137 | self.adj_2 = tf.placeholder(tf.float32, [None, None, None], name="adj_2") 138 | self.lenghts_2 = tf.placeholder(tf.int32, [None,None], name='lenghts_2') 139 | self.y = tf.placeholder(tf.float32, [None], name='y_') 140 | 141 | # Euclidean norms; p = 2 142 | self.norms = [] 143 | 144 | l2_loss = tf.constant(0.0) 145 | 146 | # ------------------------------- 147 | # 1. MEAN FIELD COMPONENT 148 | # ------------------------------- 149 | 150 | # 1. parameters for MeanField 151 | with tf.name_scope('parameters_MeanField'): 152 | 153 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 154 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 155 | self.norms.append(tf.norm(self.W1)) 156 | 157 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. 
We refer to n as the embedding depth (self.max_lv) 158 | self.CONV_PARAMS = [] 159 | for lv in range(self.max_lv): 160 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 161 | name="CONV_PARAMS_" + str(lv)) 162 | self.CONV_PARAMS.append(v) 163 | self.norms.append(tf.norm(v)) 164 | 165 | # W2 is another [p,p] matrix to transform the embedding vector 166 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 167 | name="W2") 168 | self.norms.append(tf.norm(self.W2)) 169 | 170 | # LSTMExtraction 171 | with tf.name_scope('LSTMExtraction1'): 172 | with tf.variable_scope('lstm1'): 173 | self.x_1_after_lstm = self.lstmFeatures(self.x_1, self.lenghts_1) 174 | with tf.name_scope('LSTMExtraction2'): 175 | with tf.variable_scope('lstm2'): 176 | self.x2_after_lstm = self.lstmFeatures(self.x_2, self.lenghts_2) 177 | 178 | # Mean Field 179 | with tf.name_scope('MeanField1'): 180 | self.graph_embedding_1 = tf.nn.l2_normalize( 181 | tf.squeeze(self.meanField(self.x_1_after_lstm, self.adj_1, "MeanField1"), axis=1), axis=1, 182 | name="embedding1") 183 | 184 | with tf.name_scope('MeanField2'): 185 | self.graph_embedding_2 = tf.nn.l2_normalize( 186 | tf.squeeze(self.meanField(self.x2_after_lstm, self.adj_2, "MeanField2"), axis=1), axis=1, 187 | name="embedding2") 188 | 189 | with tf.name_scope('Siamese'): 190 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1, 191 | name="cosSimilarity") 192 | 193 | # Regularization 194 | with tf.name_scope("Regularization"): 195 | l2_loss += tf.nn.l2_loss(self.W1) 196 | for lv in range(self.max_lv): 197 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 198 | l2_loss += tf.nn.l2_loss(self.W2) 199 | 200 | # CalculateMean cross-entropy loss 201 | with tf.name_scope("Loss"): 202 | 203 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 204 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 205 | 206 | # Train step 207 | with tf.name_scope("Train_Step"): 208 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) -------------------------------------------------------------------------------- /binary_similarity/s2v_network_rnn.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | class NetworkLSTM: 10 | 11 | def __init__(self, 12 | features_size, 13 | embedding_size, 14 | max_lv, 15 | T_iterations, 16 | learning_rate, 17 | l2_reg_lambda, 18 | batch_size, 19 | max_instructions, 20 | max_nodes, 21 | rnn_depth, 22 | rnn_kind, 23 | embedding_matrix, 24 | trainable_embeddings 25 | ): 26 | print("Features size"+str(features_size)) 27 | self.features_size = features_size 28 | self.embedding_size = embedding_size 29 | self.max_lv = max_lv 30 | self.T_iterations = T_iterations 31 | self.learning_rate = learning_rate 32 | self.l2_reg_lambda = l2_reg_lambda 33 | self.RRN_HIDDEN = features_size 34 | self.batch_size = batch_size 35 | self.max_instructions = max_instructions 36 | self.max_nodes = max_nodes 37 | self.rnn_depth = rnn_depth 38 | self.rnn_kind=rnn_kind 39 | self.embedding_matrix = embedding_matrix 40 | self.trainable_embeddings = trainable_embeddings 41 | self.generateGraphClassificationNetwork() 42 | 43 | def 
extract_axis_1(self, data, ind): 44 | """ 45 | Get specified elements along the first axis of tensor. 46 | :param data: Tensorflow tensor that will be subsetted. 47 | :param ind: Indices to take (one for each element along axis 0 of data). 48 | :return: Subsetted tensor. 49 | """ 50 | ind=tf.nn.relu(ind-1) 51 | batch_range = tf.range(tf.shape(data)[0]) 52 | indices = tf.stack([batch_range, ind], axis=1) 53 | res = tf.gather_nd(data, indices) 54 | 55 | return res 56 | 57 | def lstmFeatures(self, input_x, lengths): 58 | 59 | flattened_inputs=tf.reshape(input_x,[-1,tf.shape(input_x)[2]],name="Flattening") 60 | 61 | flattened_lenghts = tf.reshape(lengths, [-1]) 62 | max = tf.reduce_max(flattened_lenghts) 63 | flattened_inputs=flattened_inputs[:,:max] 64 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 65 | 66 | zeros = tf.zeros(tf.shape(flattened_lenghts)[0], dtype=tf.int32) 67 | mask = tf.not_equal(flattened_lenghts, zeros) 68 | int_mask = tf.cast(mask, tf.int32) 69 | fake_output = tf.zeros([self.features_size], dtype=tf.float32) 70 | partitions = tf.dynamic_partition(flattened_embedded, int_mask, 2) 71 | real_nodes=partitions[1] 72 | real_lenghts=tf.boolean_mask(flattened_lenghts,mask) 73 | fake_zero = tf.tile([fake_output], [tf.shape(flattened_embedded)[0] - tf.shape(partitions[1])[0], 1]) 74 | 75 | if self.rnn_kind==0: 76 | rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in ([self.features_size] * self.rnn_depth)] 77 | else: 78 | rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in ([self.features_size] * self.rnn_depth)] 79 | cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) 80 | 81 | rnn_outputs, _ = tf.nn.dynamic_rnn(cell, real_nodes, sequence_length=real_lenghts, 82 | dtype=tf.float32, 83 | time_major=False, 84 | parallel_iterations=88) 85 | 86 | last_outputs = self.extract_axis_1(rnn_outputs, real_lenghts) 87 | 88 | condition_indices = tf.dynamic_partition( 89 | tf.range(tf.shape(flattened_embedded)[0]), int_mask, 2) 90 | last_outputs = tf.dynamic_stitch(condition_indices, [fake_zero, last_outputs]) 91 | 92 | gather_output2 = tf.reshape(last_outputs, 93 | [-1,tf.shape(input_x)[1],self.features_size], name="Deflattening") 94 | 95 | output = tf.identity(gather_output2, name="LSTMOutput") 96 | output=tf.nn.l2_normalize(output) 97 | return output 98 | 99 | def meanField(self, input_x, input_adj, name): 100 | 101 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 102 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 103 | 104 | CONV_PARAMS_tiled = [] 105 | for lv in range(self.max_lv): 106 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 107 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 108 | 109 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 110 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 111 | out = w1xv 112 | for i in range(self.T_iterations - 1): 113 | ol = l 114 | lv = self.max_lv - 1 115 | while lv >= 0: 116 | with tf.name_scope('cell_' + str(lv)) as scope: 117 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 118 | if lv > 0: 119 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 120 | else: 121 | ol = node_linear 122 | lv -= 1 123 | 124 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 125 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 
2)) 126 | 127 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 128 | name=name + "_y_potential_expand_dims") 129 | 130 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 131 | return graph_embedding 132 | 133 | def generateGraphClassificationNetwork(self): 134 | print("Features size:"+str(self.features_size)) 135 | 136 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 137 | trainable=self.trainable_embeddings, 138 | name="instruction_embedding", dtype=tf.float32) 139 | 140 | self.x_1 = tf.placeholder(tf.int32, [None, None, None], name="x_1") 141 | self.adj_1 = tf.placeholder(tf.float32, [None, None, None], name="adj_1") 142 | self.lenghts_1 = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 143 | self.x_2 = tf.placeholder(tf.int32, [None, None, None], name="x_2") 144 | self.adj_2 = tf.placeholder(tf.float32, [None, None, None], name="adj_2") 145 | self.lenghts_2 = tf.placeholder(tf.int32, [None,None], name='lenghts_2') 146 | self.y = tf.placeholder(tf.float32, [None], name='y_') 147 | 148 | self.norms = [] 149 | 150 | l2_loss = tf.constant(0.0) 151 | 152 | # 1. parameters for MeanField 153 | with tf.name_scope('parameters_MeanField'): 154 | 155 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 156 | self.norms.append(tf.norm(self.W1)) 157 | 158 | self.CONV_PARAMS = [] 159 | for lv in range(self.max_lv): 160 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 161 | name="CONV_PARAMS_" + str(lv)) 162 | self.CONV_PARAMS.append(v) 163 | self.norms.append(tf.norm(v)) 164 | 165 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 166 | name="W2") 167 | self.norms.append(tf.norm(self.W2)) 168 | 169 | with tf.name_scope('LSTMExtraction1'): 170 | with tf.variable_scope('lstm1'): 171 | self.x_1_after_lstm = self.lstmFeatures(self.x_1, self.lenghts_1) 172 | with tf.name_scope('LSTMExtraction2'): 173 | with tf.variable_scope('lstm2'): 174 | self.x2_after_lstm = self.lstmFeatures(self.x_2, self.lenghts_2) 175 | 176 | with tf.name_scope('MeanField1'): 177 | self.graph_embedding_1 = tf.nn.l2_normalize( 178 | tf.squeeze(self.meanField(self.x_1_after_lstm, self.adj_1, "MeanField1"), axis=1), axis=1, 179 | name="embedding1") 180 | 181 | with tf.name_scope('MeanField2'): 182 | self.graph_embedding_2 = tf.nn.l2_normalize( 183 | tf.squeeze(self.meanField(self.x2_after_lstm, self.adj_2, "MeanField2"), axis=1), axis=1, 184 | name="embedding2") 185 | 186 | with tf.name_scope('Siamese'): 187 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1, 188 | name="cosSimilarity") 189 | 190 | # Regularization 191 | with tf.name_scope("Regularization"): 192 | l2_loss += tf.nn.l2_loss(self.W1) 193 | for lv in range(self.max_lv): 194 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 195 | l2_loss += tf.nn.l2_loss(self.W2) 196 | 197 | # CalculateMean cross-entropy loss 198 | with tf.name_scope("Loss"): 199 | 200 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 201 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss 202 | 203 | # Train step 204 | with tf.name_scope("Train_Step"): 205 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 206 | 
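For reference, a minimal usage sketch of the siamese network defined in this file, assuming TensorFlow 1.x (graph/session API, as used throughout the repository) and that the NetworkLSTM class above is in scope. Only the constructor arguments and the placeholder/attribute names come from the code; the toy sizes and the random batch stand in for a real disassembled-function batch and are assumptions made for illustration.

import numpy as np
import tensorflow as tf

# toy sizes: 1000-instruction vocabulary, 100-dim instruction embeddings,
# 10 CFG nodes per function, up to 50 instructions per node, 4 pairs per batch
vocab, feat, nodes, instr, batch = 1000, 100, 10, 50, 4
embedding_matrix = np.random.rand(vocab, feat).astype(np.float32)

net = NetworkLSTM(features_size=feat, embedding_size=64, max_lv=2, T_iterations=2,
                  learning_rate=0.001, l2_reg_lambda=0.0, batch_size=batch,
                  max_instructions=instr, max_nodes=nodes, rnn_depth=1, rnn_kind=1,
                  embedding_matrix=embedding_matrix, trainable_embeddings=False)

feed = {
    # x_*: padded instruction ids, adj_*: CFG adjacency, lenghts_*: instructions per node
    net.x_1: np.random.randint(0, vocab, (batch, nodes, instr)),
    net.adj_1: np.random.randint(0, 2, (batch, nodes, nodes)).astype(np.float32),
    net.lenghts_1: np.full((batch, nodes), instr, dtype=np.int32),
    net.x_2: np.random.randint(0, vocab, (batch, nodes, instr)),
    net.adj_2: np.random.randint(0, 2, (batch, nodes, nodes)).astype(np.float32),
    net.lenghts_2: np.full((batch, nodes), instr, dtype=np.int32),
    # +1 for pairs of similar functions, -1 for dissimilar pairs
    net.y: np.random.choice([-1.0, 1.0], batch).astype(np.float32),
}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss, sim = sess.run([net.train_step, net.loss, net.cos_similarity], feed_dict=feed)
    print(loss, sim.shape)  # scalar squared-error loss, (batch,) cosine similarities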
-------------------------------------------------------------------------------- /binary_similarity/train.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import numpy as np 8 | from binary_similarity.s2v_trainer import S2VTrainer 9 | from binary_similarity.parameters import Flags 10 | import pickle 11 | 12 | def run_test(): 13 | flags = Flags() 14 | flags.logger.info("\n{}\n".format(flags)) 15 | 16 | print(str(flags)) 17 | 18 | file_embedding_matrix = flags.file_embedding_matrix 19 | 20 | embedding_matrix = np.float32(np.load(file_embedding_matrix)) 21 | if flags.random_embedding: 22 | embedding_matrix = np.random.rand(*np.shape(embedding_matrix)).astype(np.float32) 23 | embedding_matrix[0, :] = np.zeros(np.shape(embedding_matrix)[1]).astype(np.float32) 24 | 25 | if flags.cross_val: 26 | print("STARTING CROSS VALIDATION") 27 | res = [] 28 | mean = 0 29 | for i in range(0, flags.cross_val_fold): 30 | print("CROSS VALIDATION STARTING FOLD: " + str(i)) 31 | if i > 0: 32 | flags.close_log() 33 | flags.reset_logdir() 34 | del flags 35 | flags = Flags() 36 | flags.logger.info("\n{}\n".format(flags)) 37 | 38 | flags.logger.info("Starting cross validation fold: {}".format(i)) 39 | 40 | flags.db_name = flags.db_name + "_val_" + str(i+1) + ".db" 41 | flags.logger.info("Cross validation db name: {}".format(flags.db_name)) 42 | 43 | trainer = S2VTrainer(flags, embedding_matrix) 44 | best_val_auc = trainer.train() 45 | 46 | mean += best_val_auc 47 | res.append(best_val_auc) 48 | 49 | flags.logger.info("Cross validation fold {} finished best auc: {}".format(i, best_val_auc)) 50 | print("FINISH FOLD: " + str(i) + " BEST VAL AUC: " + str(best_val_auc)) 51 | 52 | print("CROSS VALIDATION ENDED") 53 | print("Result: " + str(res)) 54 | print("") 55 | 56 | flags.logger.info("Cross validation finished results: {}".format(res)) 57 | flags.logger.info(" mean: {}".format(mean / flags.cross_val_fold)) 58 | flags.close_log() 59 | 60 | flags.close_log() 61 | 62 | else: 63 | trainer = S2VTrainer(flags, embedding_matrix) 64 | trainer.train() 65 | flags.close_log() 66 | 67 | 68 | if __name__ == '__main__': 69 | run_test() 70 | -------------------------------------------------------------------------------- /binary_similarity/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Type of the network to use 4 | 5 | NETWORK_TYPE="Attention_Mean" 6 | #NETWORK_TYPE="Arith_Mean" 7 | #NETWORK_TYPE="RNN" 8 | #NETWORK_TYPE="Annotations" 9 | 10 | # Root path for the experiment 11 | MODEL_PATH=experiments/ 12 | 13 | # Path to the sqlite db with diassembled functions 14 | DB_PATH=../data/OpenSSL_dataset.db 15 | 16 | # Path to embedding matrix 17 | EMBEDDING_MATRIX=../data/i2v/embedding_matrix.npy 18 | 19 | # Path to instruction2id dictionary 20 | INS2ID=../data/i2v/word2id.json 21 | 22 | # Add this argument to train.py to use random instructions embeddings 23 | RANDOM_EMBEDDINGS="-r" 24 | 25 | # Add this argument to train.py to use trainable instructions embeddings 26 | TRAINABLE_EMBEDDINGS="-te" 27 | 28 | python3 train.py --o $MODEL_PATH -n $DB_PATH -nn $NETWORK_TYPE -e $EMBEDDING_MATRIX -j $INS2ID 29 | -------------------------------------------------------------------------------- /binary_similarity/utils.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | # SAFE TEAM 4 | # 5 | # 6 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 7 | # 8 | 9 | 10 | def __padAndFilter(input_pairs, input_labels, max_num_vertices): 11 | 12 | output_pairs = [] 13 | output_labels = [] 14 | output_len = [] 15 | 16 | for pair, label in zip(input_pairs, input_labels): 17 | g1 = pair[0] 18 | g2 = pair[1] 19 | 20 | # graph 1 21 | adj1 = g1[0] 22 | nodes1 = g1[1] 23 | 24 | # graph 2 25 | adj2 = g2[0] 26 | nodes2 = g2[1] 27 | 28 | if (len(nodes1) <= max_num_vertices) and (len(nodes2) <= max_num_vertices): 29 | pad_lenght1 = max_num_vertices - len(nodes1) 30 | new_node1 = np.pad(nodes1, [(0, pad_lenght1), (0, 0)], mode='constant') 31 | pad_lenght1 = max_num_vertices - adj1.shape[0] 32 | adj1_dense = np.pad(adj1.todense(), [(0, pad_lenght1), (0, pad_lenght1)], mode='constant') 33 | g1 = (adj1_dense, new_node1) 34 | pad_lenght2 = max_num_vertices - len(nodes2) 35 | new_node2 = np.pad(nodes2, [(0, pad_lenght2), (0, 0)], mode='constant') 36 | pad_lenght2 = max_num_vertices - adj2.shape[0] 37 | adj2_dense = np.pad(adj2.todense(), [(0, pad_lenght2), (0, pad_lenght2)], mode='constant') 38 | g2 = (adj2_dense, new_node2) 39 | output_pairs.append([g1, g2]) 40 | output_labels.append(label) 41 | output_len.append([8,8]) 42 | 43 | return output_pairs, output_labels, output_len 44 | 45 | def __padAndFilterLSTM(input_pairs, input_labels, input_len, max_num_vertices): 46 | 47 | 48 | output_pairs = [] 49 | output_labels = [] 50 | output_len=[] 51 | 52 | for pair, label, lens in zip(input_pairs, input_labels, input_len): 53 | 54 | try: 55 | 56 | g1 = pair[0] 57 | g2 = pair[1] 58 | 59 | # graph 1 60 | adj1 = g1[0] 61 | nodes1 = g1[1] 62 | 63 | # graph 2 64 | adj2 = g2[0] 65 | nodes2 = g2[1] 66 | if (len(nodes1) <= max_num_vertices) and (len(nodes2) <= max_num_vertices): 67 | 68 | pad_lenght1 = max_num_vertices - len(nodes1) 69 | new_node1 = np.pad(nodes1, [(0, pad_lenght1), (0, 0)], mode='constant') 70 | 71 | pad_lenght1 = max_num_vertices - adj1.shape[0] 72 | adj1_dense = np.pad(adj1.todense(), [(0, pad_lenght1), (0, pad_lenght1)], mode='constant') 73 | g1 = (adj1_dense, new_node1) 74 | 75 | pad_lenght2 = max_num_vertices - len(nodes2) 76 | new_node2 = np.pad(nodes2, [(0, pad_lenght2), (0, 0)], mode='constant') 77 | pad_lenght2 = max_num_vertices - adj2.shape[0] 78 | adj2_dense = np.pad(adj2.todense(), [(0, pad_lenght2), (0, pad_lenght2)], mode='constant') 79 | g2 = (adj2_dense, new_node2) 80 | 81 | output_pairs.append([g1, g2]) 82 | output_labels.append(label) 83 | new_lens_0 = lens[0]+[0]*(max_num_vertices-len(lens[0])) 84 | new_lens_1 = lens[1]+[0]*(max_num_vertices-len(lens[1])) 85 | output_len.append([new_lens_0, new_lens_1]) 86 | 87 | except: 88 | pass 89 | 90 | return output_pairs, output_labels, output_len 91 | -------------------------------------------------------------------------------- /compiler_provenance/FunctionFactory.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | from compiler_provenance.utils import __padAndFilter as padAndFilter 8 | from asm_embedding.InstructionsConverter import InstructionsConverter 9 | from asm_embedding.FunctionNormalizer import 
FunctionNormalizer 10 | import json 11 | from multiprocessing import Queue 12 | from multiprocessing import Process 13 | import networkx as nx 14 | from networkx.readwrite import json_graph 15 | import numpy as np 16 | import random 17 | from scipy import sparse 18 | import sqlite3 19 | 20 | 21 | 22 | class DatasetGenerator: 23 | 24 | def get_dataset(self, epoch_number): 25 | pass 26 | 27 | 28 | class PairFactory(DatasetGenerator): 29 | 30 | def __init__(self, db_name, feature_type, dataset_type, json_asm2id, max_instructions, max_num_vertices, encoder, batch_size, flags=None): 31 | self.db_name = db_name 32 | self.feature_type = feature_type 33 | self.dataset_type = dataset_type 34 | self.encoder = encoder 35 | self.max_instructions = max_instructions 36 | self.max_num_vertices = max_num_vertices 37 | self.batch_dim = 0 38 | self.num_pairs = 0 39 | self.num_batches = 0 40 | self.flags=flags 41 | 42 | self.converter = InstructionsConverter(json_asm2id) 43 | self.normalizer = FunctionNormalizer(self.max_instructions) 44 | 45 | conn = sqlite3.connect(self.db_name) 46 | cur = conn.cursor() 47 | q = cur.execute("SELECT count(*) from " + self.dataset_type) 48 | count=int(q.fetchone()[0]) 49 | n_chunk = int(count / batch_size) - 1 50 | 51 | self.num_batches = n_chunk 52 | conn.close() 53 | 54 | def remove_bad_acfg_node(self, g): 55 | nodeToRemove = [] 56 | for n in g.nodes(data=True): 57 | f = n[1]['features'] 58 | if len(f.keys()) == 0: 59 | nodeToRemove.append(n[0]) 60 | for n in nodeToRemove: 61 | g.remove_node(n) 62 | return g 63 | 64 | def split(self,a, n): 65 | return [a[i::n] for i in range(n)] 66 | 67 | def get_node_matrix(self, nodes): 68 | 69 | if isinstance(nodes, int): 70 | print(nodes) 71 | 72 | num_node = len(nodes) 73 | node_matrix = np.zeros([num_node, 8]) 74 | for i, n in enumerate(nodes): 75 | f = n[1]['features'] 76 | 77 | if isinstance(f['constant'], int): 78 | node_matrix[i, 0] = f['constant'] 79 | else: 80 | node_matrix[i, 0] = len(f['constant']) 81 | 82 | if isinstance(f['string'], int): 83 | node_matrix[i, 1] = f['string'] 84 | else: 85 | node_matrix[i, 1] = len(f['string']) 86 | 87 | node_matrix[i, 2] = f['transfer'] 88 | node_matrix[i, 3] = f['call'] 89 | node_matrix[i, 4] = f['instruction'] 90 | node_matrix[i, 5] = f['arith'] 91 | node_matrix[i, 6] = f['offspring'] 92 | node_matrix[i, 7] = f['betweenness'] 93 | return node_matrix 94 | 95 | def get_data_from_acfg(self, g): 96 | g = self.remove_bad_acfg_node(g) 97 | if len(g.nodes) > 0: 98 | adj = nx.adjacency_matrix(g) 99 | node_matrix = self.get_node_matrix(g.nodes(data=True)) 100 | else: 101 | adj = sparse.bsr_matrix(np.zeros([1, 1])) 102 | node_matrix = np.zeros([1, 8]) 103 | lenghts = [8] * len(node_matrix) 104 | return adj, node_matrix, lenghts 105 | 106 | def get_data_from_cfg(self, cfg): 107 | adj = sparse.csr_matrix([1, 1]) 108 | lenghts = [] 109 | node_matrix = [] 110 | 111 | try: 112 | adj = nx.adjacency_matrix(cfg) 113 | nodes = cfg.nodes(data=True) 114 | for i, n in enumerate(nodes): 115 | filtered = self.converter.convert_to_ids(n[1]['features']) 116 | lenghts.append(len(filtered)) 117 | node_matrix.append(self.normalizer.normalize(filtered)[0]) 118 | except: 119 | pass 120 | return adj, node_matrix, lenghts 121 | 122 | def async_chunker(self, epoch, batch_size, shuffle=True): 123 | self.num_pairs = 0 124 | 125 | conn = sqlite3.connect(self.db_name) 126 | cur = conn.cursor() 127 | q = cur.execute("SELECT id from " + self.dataset_type) 128 | ids = q.fetchall() 129 | ids = [ii[0] for ii in ids] 130 | 131 | 
data_len = len(ids) 132 | 133 | n_chunk = int(data_len / batch_size) - 1 134 | random.seed(17) 135 | self.num_batches = n_chunk 136 | lista_chunk=range(0,n_chunk) 137 | coda = Queue(maxsize=50) 138 | n_proc = 10 139 | listone = self.split(lista_chunk, n_proc) 140 | for i in range(0,n_proc): 141 | l = list(listone[i]) 142 | p = Process(target=self.async_create_pair,args=((epoch, l, batch_size, coda, shuffle, self.encoder))) 143 | p.start() 144 | 145 | while coda.empty(): 146 | pass 147 | for i in range(0, n_chunk): 148 | yield self.async_get_dataset(i, n_chunk, batch_size, coda, shuffle) 149 | 150 | def get_pair_from_db(self, epoch_number, chunk, number_of_functions, label_encoder): 151 | 152 | conn = sqlite3.connect(self.db_name) 153 | cur = conn.cursor() 154 | 155 | functions = [] 156 | labels = [] 157 | lenghts = [] 158 | 159 | q = cur.execute("SELECT id FROM " + self.dataset_type) 160 | ids = q.fetchall() 161 | rng = random.Random(epoch_number) 162 | rng.shuffle(ids) 163 | data_len = len(ids) 164 | i = 0 165 | 166 | while i < number_of_functions: 167 | if chunk * int(number_of_functions) + i > data_len: 168 | break 169 | 170 | ii = ids[chunk * int(number_of_functions) + i] 171 | q = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", ii) 172 | 173 | if self.feature_type == 'acfg': 174 | adj, node, lenghts0 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q.fetchone()[0]))) 175 | elif self.feature_type == 'lstm_cfg': 176 | adj, node, lenghts0 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q.fetchone()[0]))) 177 | 178 | functions.append([(adj, node)]) 179 | lenghts.append(lenghts0) 180 | 181 | if self.flags is None or self.flags.class_kind == "CMP" or self.flags.class_kind == "FML": 182 | query_str = "SELECT compiler FROM functions WHERE id=?" 183 | elif self.flags.class_kind == "CMPOPT": 184 | query_str = "SELECT compiler,optimization FROM functions WHERE id=?" 185 | elif self.flags.class_kind == "OPT": 186 | query_str = "SELECT optimization FROM functions WHERE id=?" 
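            # Depending on flags.class_kind, the label for this function is the full
            # compiler string (CMP), the compiler family only, i.e. the part before the
            # first '-' (FML), the concatenation "compiler-optimization" (CMPOPT), or the
            # optimization level alone (OPT); label_encoder then maps it to a class id.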
187 | 188 | q = cur.execute(query_str, ii) 189 | q_compiler = q.fetchone() 190 | 191 | if self.flags.class_kind == "CMPOPT": 192 | compiler = q_compiler[0] + '-' + q_compiler[1] 193 | elif self.flags.class_kind == "FML": 194 | compiler = str(q_compiler[0]).split('-')[0] 195 | else: 196 | compiler = q_compiler[0] 197 | 198 | encoded = label_encoder.transform([compiler]) 199 | labels.append(encoded) 200 | i += 1 201 | 202 | if self.feature_type == 'acfg': 203 | pairs, labels, output_len = padAndFilter(functions, labels, [[[1]]]*len(functions), self.max_num_vertices) 204 | output_len = [[1]] 205 | 206 | elif self.feature_type == 'lstm_cfg': 207 | pairs, labels, output_len = padAndFilter(functions, labels, lenghts, self.max_num_vertices) 208 | 209 | return pairs, labels, output_len 210 | 211 | def async_create_pair(self, epoch, n_chunk, number_of_functions, q, shuffle, encoder): 212 | 213 | for i in n_chunk: 214 | pairs, y, lenghts = self.get_pair_from_db(epoch, i, number_of_functions, encoder) 215 | assert (len(pairs) == len(y)) 216 | n_samples=len(pairs) 217 | len1 = [] 218 | for l in lenghts: 219 | len1.append(l[0]) 220 | adj1 = [] 221 | nodes1 = [] 222 | for p in pairs: 223 | adj1.append(p[0][0]) 224 | nodes1.append(p[0][1]) 225 | y_ = [] 226 | for yy in y: 227 | y_.append(yy[0]) 228 | 229 | for i in range(0, n_samples, number_of_functions): 230 | upper_bound = min(i + number_of_functions, n_samples) 231 | 232 | ret_adj = adj1[i:upper_bound] 233 | ret_nodes = nodes1[i:upper_bound] 234 | ret_len = len1[i:upper_bound] 235 | ret_y = y_[i:upper_bound] 236 | 237 | q.put((ret_adj,ret_nodes,ret_y,ret_len), block=True) 238 | 239 | def async_get_dataset(self, chunk, n_chunk, number_of_pairs, q, shuffle): 240 | item = q.get() 241 | n_samples = len(item[0]) 242 | self.batch_dim = n_samples 243 | self.num_pairs += n_samples 244 | return item[0], item[1], item[2], item[3] 245 | 246 | -------------------------------------------------------------------------------- /compiler_provenance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucamassarelli/Unsupervised-Features-Learning-For-Binary-Similarity/fe37c4aa0d1ac8d14488e096a5f6deb7aea929fe/compiler_provenance/__init__.py -------------------------------------------------------------------------------- /compiler_provenance/parameters.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import argparse 8 | import time 9 | import sys, os 10 | import logging 11 | 12 | def getLogger(logfile): 13 | logger = logging.getLogger(__name__) 14 | hdlr = logging.FileHandler(logfile) 15 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 16 | hdlr.setFormatter(formatter) 17 | logger.addHandler(hdlr) 18 | logger.setLevel(logging.INFO) 19 | return logger, hdlr 20 | 21 | class Flags: 22 | 23 | def __init__(self): 24 | parser = argparse.ArgumentParser(description=' cryptoarb.') 25 | 26 | parser.add_argument("-o", "--output", dest="output_file", help="output directory for logging and models", required=False) 27 | parser.add_argument("-e", "--embedding_matrix", dest="embedding_matrix", help="file with the embedding matrix for the instructions",required=False) 28 | parser.add_argument("-j", "--json_asm2id", dest="json_asm2id",help="file with the dictionary of instructions ids", 
required=False) 29 | parser.add_argument("-n", "--dbName", dest="db_name", help="Name of the database", required=False) 30 | parser.add_argument("-ld","--load_dir", dest="load_dir", help="Load the model from directory load_dir", required=False) 31 | parser.add_argument("-nn","--network_type", help="network type: Arith_Mean, Weighted_Mean, RNN, CCS", required=True, dest="network_type") 32 | parser.add_argument("-r", "--random", help="if present the network use random embedder", default=False, action="store_true", dest="random_embedding", required=False) 33 | parser.add_argument("-te","--trainable_embedding", help="if present the network consider the embedding as trainable", action="store_true", dest="trainable_embeddings", default=False) 34 | parser.add_argument("-cv","--cross_val", help="if present the training is done with cross validiation", default=False, action="store_true", dest="cross_val") 35 | parser.add_argument("-cl", "--classification_kind", help="classification kind: Compiler, Compiler+Opt, Opt",default="Compiler", required=False, dest="classification_kind") 36 | 37 | args = parser.parse_args() 38 | self.network_type = args.network_type 39 | 40 | if self.network_type == "Annotations": 41 | self.feature_type = 'acfg' 42 | elif self.network_type in ["Arith_Mean", "Attention_Mean", "RNN"]: 43 | self.feature_type = 'lstm_cfg' 44 | else: 45 | print("ERROR NETWORK NOT FOUND") 46 | exit(0) 47 | 48 | if args.classification_kind == "Family": 49 | self.class_kind="FML" 50 | elif args.classification_kind == "Compiler": 51 | self.class_kind="CMP" 52 | elif args.classification_kind == "Compiler+Opt": 53 | self.class_kind="CMPOPT" 54 | elif args.classification_kind == "Opt": 55 | self.class_kind = "OPT" 56 | else: 57 | print("Classification option unkown") 58 | exit(0) 59 | 60 | # mode = mean_field 61 | self.batch_size = 250 # minibatch size (-1 = whole dataset) 62 | self.num_epochs = 50 # number of epochs 63 | self.embedding_size = 64 # dimension of latent layers 64 | self.learning_rate = 0.001 # init learning_rate 65 | self.max_lv = 2 # embedd depth 66 | self.T_iterations= 2 # max rounds of message passing 67 | self.l2_reg_lambda = 0 # 0.002 #0.002 # regularization coefficient 68 | self.num_checkpoints = 1 # max number of checkpoints 69 | self.out_dir = args.output_file # directory for logging 70 | self.db_name = args.db_name 71 | self.load_dir=str(args.load_dir) 72 | self.random_embedding = args.random_embedding 73 | self.trainable_embeddings = args.trainable_embeddings 74 | self.cross_val = args.cross_val 75 | self.cross_val_fold = 5 76 | self.dense_layer_size = 3000 77 | self.rnn_depth = 1 # depth of the rnn 78 | self.max_instructions = 50 # number of instructions 79 | self.rnn_kind = 1 #kind of rnn cell 0: lstm cell 1: GRU cell 80 | 81 | 82 | self.seed = 2 # random seed 83 | 84 | # create logdir and logger 85 | self.reset_logdir() 86 | 87 | self.file_embedding_matrix = args.embedding_matrix 88 | self.json_asm2id = args.json_asm2id 89 | 90 | self.MAX_NUM_VERTICES = 150 91 | self.MIN_NUM_VERTICES = 1 92 | 93 | def reset_logdir(self): 94 | # create logdir 95 | timestamp = str(int(time.time())) 96 | self.logdir = os.path.abspath(os.path.join(self.out_dir, "runs", timestamp)) 97 | os.makedirs(self.logdir, exist_ok=True) 98 | 99 | # create logger 100 | self.log_file = str(self.logdir)+'/console.log' 101 | self.logger, self.hdlr = getLogger(self.log_file) 102 | 103 | # create symlink for last_run 104 | sym_path_logdir = str(self.out_dir)+"/last_run" 105 | try: 106 | os.unlink(sym_path_logdir) 
107 | except: 108 | pass 109 | try: 110 | os.symlink(self.logdir, sym_path_logdir) 111 | except: 112 | print("\nfailed to create symlink!\n") 113 | 114 | def close_log(self): 115 | self.hdlr.close() 116 | self.logger.removeHandler(self.hdlr) 117 | handlers = self.logger.handlers[:] 118 | for handler in handlers: 119 | handler.close() 120 | self.logger.removeHandler(handler) 121 | 122 | def __str__(self): 123 | msg = "" 124 | msg +="\n Parameters:\n" 125 | msg +="\tNetwork_Type: {}\n".format(self.network_type) 126 | msg +="\tRandom embedding: {}\n".format(self.random_embedding) 127 | msg +="\tTrainable embedding: {}\n".format(self.trainable_embeddings) 128 | msg +="\tFeature Type: {}\n".format(self.feature_type) 129 | msg +="\tlogdir: {}\n".format(self.logdir) 130 | msg +="\tbatch_size: {}\n".format(self.batch_size) 131 | msg +="\tnum_epochs: {}\n".format(self.num_epochs) 132 | msg +="\tembedding_size: {}\n".format(self.embedding_size) 133 | msg +="\tlearning_rate: {}\n".format(self.learning_rate) 134 | msg +="\tmax_lv: {}\n".format(self.max_lv) 135 | msg +="\tT_iterations: {}\n".format(self.T_iterations) 136 | msg +="\tl2_reg_lambda: {}\n".format(self.l2_reg_lambda) 137 | msg +="\tnum_checkpoints: {}\n".format(self.num_checkpoints) 138 | msg +="\tseed: {}\n".format(self.seed) 139 | msg +="\tMAX_NUM_VERTICES: {}\n".format(self.MAX_NUM_VERTICES) 140 | msg +="\tMax Instructions per cfg node: {}\n".format(self.max_instructions) 141 | msg +="\tDense Layer Size: {}\n".format(self.dense_layer_size) 142 | msg += "\tClasses kind: {}\n".format(self.class_kind) 143 | if self.network_type == "RNN": 144 | msg += "\tRNN type (0, lstm; 1, GRU): {}\n".format(self.rnn_kind) 145 | msg += "\tRNN Depth: {}\n".format(self.rnn_depth) 146 | if self.network_type == "Attention": 147 | msg += "\tRNN type (0, lstm; 1, GRU): {}\n".format(self.rnn_kind) 148 | msg += "\tRNN Depth: {}\n".format(self.rnn_depth) 149 | msg += "\tAttention hops: {}\n".format(self.attention_hops) 150 | msg += "\tAttention depth: {}\n".format(self.attention_detph) 151 | return msg 152 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_annotations.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | class Network: 10 | 11 | def __init__(self, 12 | features_size, 13 | embedding_size, 14 | max_lv, 15 | T_iterations, 16 | learning_rate, 17 | l2_reg_lambda, 18 | dense_layer_size, 19 | num_classes 20 | ): 21 | self.features_size = features_size 22 | self.embedding_size = embedding_size 23 | self.max_lv = max_lv 24 | self.T_iterations = T_iterations 25 | self.learning_rate=learning_rate 26 | self.l2_reg_lambda = l2_reg_lambda 27 | self.dense_layer_size = dense_layer_size 28 | self.number_of_classes = num_classes 29 | self.generateGraphClassificationNetwork() 30 | 31 | 32 | def meanField(self, input_x, input_adj, name): 33 | 34 | W1_tiled = tf.tile(tf.expand_dims(self.W1,0), [tf.shape(input_x)[0],1,1], name=name + "_W1_tiled") 35 | W2_tiled = tf.tile(tf.expand_dims(self.W2,0), [tf.shape(input_x)[0],1,1], name=name + "_W2_tiled") 36 | 37 | CONV_PARAMS_tiled = [] 38 | for lv in range(self.max_lv): 39 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv],0), [tf.shape(input_x)[0],1,1], name=name + "_CONV_PARAMS_tiled_" + str(lv))) 40 | 41 
| w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 42 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 43 | out=w1xv 44 | for i in range(self.T_iterations-1): 45 | ol = l 46 | lv = self.max_lv - 1 47 | while lv >= 0 : 48 | with tf.name_scope('cell_' + str(lv)) as scope: 49 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 50 | if lv > 0: 51 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 52 | else: 53 | ol = node_linear 54 | lv -= 1 55 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 56 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 57 | 58 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, name=name + "_y_potential_expand_dims") 59 | 60 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 61 | return graph_embedding 62 | 63 | 64 | def generateGraphClassificationNetwork(self): 65 | 66 | self.x = tf.placeholder(tf.float32,[None, None,self.features_size], name = "x_1") # Vettore del nodo in input 1 67 | self.adj = tf.placeholder(tf.float32,[None, None, None],name="adj_1") # Matrice di adiacenza 1 68 | self.y = tf.placeholder(tf.int32, [None], name='y_') 69 | 70 | self.lenghts = tf.placeholder(tf.float32, [None], name="len1") 71 | 72 | self.norms = [] 73 | l2_loss = tf.constant(0.0) 74 | 75 | # ------------------------------- 76 | # 1. MEAN FIELD COMPONENT 77 | # ------------------------------- 78 | 79 | #1. parameters for MeanField 80 | with tf.name_scope('parameters_MeanField'): 81 | 82 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 83 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size,self.embedding_size], stddev=0.1), name="W1") 84 | self.norms.append(tf.norm(self.W1)) 85 | 86 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. 
We refer to n as the embedding depth (self.max_lv) 87 | self.CONV_PARAMS = [] 88 | for lv in range(self.max_lv): 89 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="CONV_PARAMS_"+str(lv)) 90 | self.CONV_PARAMS.append(v) 91 | self.norms.append(tf.norm(v)) 92 | 93 | # W2 is another [p,p] matrix to transform the embedding vector 94 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="W2") 95 | self.norms.append(tf.norm(self.W2)) 96 | 97 | # Mean Field 98 | with tf.name_scope('MeanField1'): 99 | self.graph_embedding = tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x,self.adj,"MeanField1"), axis=1), axis=1,name="embedding1") 100 | 101 | with tf.name_scope('Hidden_Layer'): 102 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 103 | 104 | with tf.name_scope('Output_Layer'): 105 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 106 | 107 | with tf.name_scope('Prediction'): 108 | self.pred_classes = tf.argmax(self.logits, axis=1) 109 | self.pred_probab = tf.nn.softmax(self.logits) 110 | 111 | # Regularization 112 | with tf.name_scope("Regularization"): 113 | l2_loss += tf.nn.l2_loss(self.W1) 114 | for lv in range(self.max_lv): 115 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 116 | l2_loss += tf.nn.l2_loss(self.W2) 117 | 118 | # CalculateMean cross-entropy loss 119 | with tf.name_scope("Loss"): 120 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 121 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 122 | 123 | # Train step 124 | with tf.name_scope("Train_Step"): 125 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 126 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_arith_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | class NetworkLSTM: 10 | 11 | def __init__(self, 12 | features_size, 13 | embedding_size, 14 | max_lv, 15 | T_iterations, 16 | learning_rate, 17 | l2_reg_lambda, 18 | batch_size, 19 | max_instructions, 20 | max_nodes, 21 | rnn_depth, 22 | rnn_kind, 23 | embedding_matrix, 24 | trainable_embeddings, 25 | dense_layer_size, 26 | num_classes 27 | ): 28 | print("Features size"+str(features_size)) 29 | self.features_size = features_size 30 | self.embedding_size = embedding_size 31 | self.max_lv = max_lv 32 | self.T_iterations = T_iterations 33 | self.learning_rate = learning_rate 34 | self.l2_reg_lambda = l2_reg_lambda 35 | self.RRN_HIDDEN = features_size 36 | self.batch_size = batch_size 37 | self.max_instructions = max_instructions 38 | self.max_nodes = max_nodes 39 | self.rnn_depth = rnn_depth 40 | self.rnn_kind=rnn_kind 41 | self.embedding_matrix = embedding_matrix 42 | self.trainable_embeddings = trainable_embeddings 43 | self.dense_layer_size = dense_layer_size 44 | self.number_of_classes = num_classes 45 | self.generateGraphClassificationNetwork() 46 | 47 | def extract_axis_1(self, data, ind): 48 | """ 49 | Get specified elements along the first axis of tensor. 50 | :param data: Tensorflow tensor that will be subsetted. 
51 | :param ind: Indices to take (one for each element along axis 0 of data). 52 | :return: Subsetted tensor. 53 | """ 54 | ind=tf.nn.relu(ind-1) 55 | batch_range = tf.range(tf.shape(data)[0]) 56 | indices = tf.stack([batch_range, ind], axis=1) 57 | res = tf.gather_nd(data, indices) 58 | 59 | return res 60 | 61 | def create_flattening_array(self, max_nodes, batch_size): 62 | shape_array = [] 63 | for p in range(0, batch_size): 64 | for i in range(0, max_nodes): 65 | shape_array.append([p, i]) 66 | return shape_array 67 | 68 | def create_gather_array(self, max_nodes, batch_size): 69 | shape_array = [] 70 | for p in range(0, batch_size): 71 | x = [] 72 | for i in range(0, max_nodes): 73 | x.append([0, i + p * max_nodes]) 74 | shape_array.append(x) 75 | return shape_array 76 | 77 | def lstmFeatures(self, input_x, lengths): 78 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 79 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 80 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.reduce_mean(flattened_embedded, name='features_arith_mean', axis=1), axis=1)) 81 | gather_output2 = tf.reshape(last_outputs, 82 | [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 83 | output = tf.identity(gather_output2, name="LSTMOutput") 84 | output=tf.nn.l2_normalize(output) 85 | return output 86 | 87 | def meanField(self, input_x, input_adj, name): 88 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 89 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 90 | CONV_PARAMS_tiled = [] 91 | for lv in range(self.max_lv): 92 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 93 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 94 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 95 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 96 | out = w1xv 97 | for i in range(self.T_iterations - 1): 98 | ol = l 99 | lv = self.max_lv - 1 100 | while lv >= 0: 101 | with tf.name_scope('cell_' + str(lv)) as scope: 102 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 103 | if lv > 0: 104 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 105 | else: 106 | ol = node_linear 107 | lv -= 1 108 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 109 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 110 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 111 | name=name + "_y_potential_expand_dims") 112 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 113 | return graph_embedding 114 | 115 | def generateGraphClassificationNetwork(self): 116 | print("Features size:"+str(self.features_size)) 117 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 118 | trainable=self.trainable_embeddings, 119 | name="instruction_embedding", dtype=tf.float32) 120 | self.x = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_1") # Vettore del nodo in input 121 | self.adj = tf.placeholder(tf.float32, [None, None, None], name="adj_1") # Matrice di adiacenza 122 | self.lenghts = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 123 | self.y = tf.placeholder(tf.int32, [None], name='y_') 124 | # Euclidean norms; p = 2 125 | self.norms = [] 126 | 
# Keeping track of l2 regularization loss (optional) 127 | l2_loss = tf.constant(0.0) 128 | 129 | # ------------------------------- 130 | # 1. MEAN FIELD COMPONENT 131 | # ------------------------------- 132 | 133 | # 1. parameters for MeanField 134 | with tf.name_scope('parameters_MeanField'): 135 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 136 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 137 | self.norms.append(tf.norm(self.W1)) 138 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 139 | self.CONV_PARAMS = [] 140 | for lv in range(self.max_lv): 141 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 142 | name="CONV_PARAMS_" + str(lv)) 143 | self.CONV_PARAMS.append(v) 144 | self.norms.append(tf.norm(v)) 145 | # W2 is another [p,p] matrix to transform the embedding vector 146 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 147 | name="W2") 148 | self.norms.append(tf.norm(self.W2)) 149 | 150 | # LSTMExtraction 151 | with tf.name_scope('BlockFeatureExtraction'): 152 | with tf.variable_scope('arithmetic_mean'): 153 | self.x_after_lstm = self.lstmFeatures(self.x, self.lenghts) 154 | 155 | # Mean Field 156 | with tf.name_scope('MeanField'): 157 | self.graph_embedding = tf.nn.l2_normalize( 158 | tf.squeeze(self.meanField(self.x_after_lstm, self.adj, "MeanField"), axis=1), axis=1, 159 | name="embedding1") # tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_1,self.adj_1), axis=1), 1) 160 | 161 | with tf.name_scope('Hidden_Layer'): 162 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 163 | 164 | with tf.name_scope('Output_Layer'): 165 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 166 | 167 | with tf.name_scope('Prediction'): 168 | self.pred_classes = tf.argmax(self.logits, axis=1) 169 | self.pred_probab = tf.nn.softmax(self.logits) 170 | 171 | # Regularization 172 | with tf.name_scope("Regularization"): 173 | l2_loss += tf.nn.l2_loss(self.W1) 174 | for lv in range(self.max_lv): 175 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 176 | l2_loss += tf.nn.l2_loss(self.W2) 177 | 178 | # CalculateMean cross-entropy loss 179 | with tf.name_scope("Loss"): 180 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 181 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 182 | 183 | # Train step 184 | with tf.name_scope("Train_Step"): 185 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 186 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_attention_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | 10 | # structure2vec 11 | # DE-MF : discriminative embedding using Mean Field 12 | 13 | 14 | class Network: 15 | 16 | def __init__(self, 17 | features_size, 18 | embedding_size, 19 | max_lv, 20 | T_iterations, 21 | learning_rate, 22 | l2_reg_lambda, 23 | batch_size, 24 | max_instructions, 25 | max_nodes, 26 | rnn_depth, 27 | rnn_kind, 28 | embedding_matrix, 29 | 
trainable_embeddings, 30 | dense_layer_size, 31 | num_classes 32 | ): 33 | print("Features size"+str(features_size)) 34 | self.features_size = features_size 35 | self.embedding_size = embedding_size 36 | self.max_lv = max_lv 37 | self.T_iterations = T_iterations 38 | self.learning_rate = learning_rate 39 | self.l2_reg_lambda = l2_reg_lambda 40 | self.RRN_HIDDEN = features_size 41 | self.batch_size = batch_size 42 | self.max_instructions = max_instructions 43 | self.max_nodes = max_nodes 44 | self.rnn_depth = rnn_depth 45 | self.rnn_kind=rnn_kind 46 | self.embedding_matrix = embedding_matrix 47 | self.trainable_embeddings = trainable_embeddings 48 | self.dense_layer_size = dense_layer_size 49 | self.number_of_classes = num_classes 50 | self.generateGraphClassificationNetwork() 51 | 52 | def extract_axis_1(self, data, ind): 53 | """ 54 | Get specified elements along the first axis of tensor. 55 | :param data: Tensorflow tensor that will be subsetted. 56 | :param ind: Indices to take (one for each element along axis 0 of data). 57 | :return: Subsetted tensor. 58 | """ 59 | ind=tf.nn.relu(ind-1) 60 | batch_range = tf.range(tf.shape(data)[0]) 61 | indices = tf.stack([batch_range, ind], axis=1) 62 | res = tf.gather_nd(data, indices) 63 | 64 | return res 65 | 66 | def create_flattening_array(self, max_nodes, batch_size): 67 | shape_array = [] 68 | for p in range(0, batch_size): 69 | for i in range(0, max_nodes): 70 | shape_array.append([p, i]) 71 | return shape_array 72 | 73 | def create_gather_array(self, max_nodes, batch_size): 74 | shape_array = [] 75 | for p in range(0, batch_size): 76 | x = [] 77 | for i in range(0, max_nodes): 78 | x.append([0, i + p * max_nodes]) 79 | shape_array.append(x) 80 | return shape_array 81 | 82 | def lstmFeatures(self, input_x, lengths): 83 | 84 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 85 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 86 | self.W0 = tf.Variable(tf.constant(1.0 / self.max_instructions, shape=[1, self.max_instructions]), name="W0") 87 | w0_tiled = tf.tile(tf.expand_dims(self.W0, 0), [tf.shape(flattened_inputs)[0], 1, 1], name="W0_tiled") 88 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.matmul(w0_tiled, flattened_embedded, name='features_weighted_mean')), axis=1) 89 | gather_output2 = tf.reshape(last_outputs, 90 | [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 91 | output = tf.identity(gather_output2, name="LSTMOutput") 92 | output=tf.nn.l2_normalize(output) 93 | return output 94 | 95 | def meanField(self, input_x, input_adj, name): 96 | 97 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 98 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 99 | 100 | CONV_PARAMS_tiled = [] 101 | for lv in range(self.max_lv): 102 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 103 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 104 | 105 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 106 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 107 | out = w1xv 108 | for i in range(self.T_iterations - 1): 109 | ol = l 110 | lv = self.max_lv - 1 111 | while lv >= 0: 112 | with tf.name_scope('cell_' + str(lv)) as scope: 113 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 114 | if lv > 0: 115 | ol = 
tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 116 | else: 117 | ol = node_linear 118 | lv -= 1 119 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 120 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 121 | 122 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 123 | name=name + "_y_potential_expand_dims") 124 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 125 | return graph_embedding 126 | 127 | def generateGraphClassificationNetwork(self): 128 | print("Features size:"+str(self.features_size)) 129 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 130 | trainable=self.trainable_embeddings, 131 | name="instruction_embedding", dtype=tf.float32) 132 | self.x = tf.placeholder(tf.int32, [None, None, self.max_instructions],name="x_1") 133 | self.adj = tf.placeholder(tf.float32, [None, None, None], name="adj_1") 134 | self.lenghts = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 135 | self.y = tf.placeholder(tf.int32, [None], name='y_') 136 | self.norms = [] 137 | 138 | l2_loss = tf.constant(0.0) 139 | 140 | # ------------------------------- 141 | # 1. MEAN FIELD COMPONENT 142 | # ------------------------------- 143 | 144 | # 1. parameters for MeanField 145 | with tf.name_scope('parameters_MeanField'): 146 | 147 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 148 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 149 | self.norms.append(tf.norm(self.W1)) 150 | 151 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 152 | self.CONV_PARAMS = [] 153 | for lv in range(self.max_lv): 154 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 155 | name="CONV_PARAMS_" + str(lv)) 156 | self.CONV_PARAMS.append(v) 157 | self.norms.append(tf.norm(v)) 158 | 159 | # W2 is another [p,p] matrix to transform the embedding vector 160 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1),name="W2") 161 | self.norms.append(tf.norm(self.W2)) 162 | 163 | # 164 | # LSTMExtraction 165 | with tf.name_scope('BlockFeatureExtraction'): 166 | self.block_features = self.lstmFeatures(self.x, self.lenghts) 167 | 168 | # Mean Field 169 | with tf.name_scope('MeanField'): 170 | self.graph_embedding = tf.nn.l2_normalize( 171 | tf.squeeze(self.meanField(self.block_features, self.adj, "MeanField1"), axis=1), axis=1, name="embedding") 172 | 173 | with tf.name_scope('Hidden_Layer'): 174 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 175 | 176 | with tf.name_scope('Output_Layer'): 177 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 178 | 179 | with tf.name_scope('Prediction'): 180 | self.pred_classes = tf.argmax(self.logits, axis=1) 181 | self.pred_probab = tf.nn.softmax(self.logits) 182 | 183 | # Regularization 184 | with tf.name_scope("Regularization"): 185 | l2_loss += tf.nn.l2_loss(self.W1) 186 | for lv in range(self.max_lv): 187 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 188 | l2_loss += tf.nn.l2_loss(self.W2) 189 | 190 | # CalculateMean cross-entropy loss 191 | with tf.name_scope("Loss"): 192 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 193 | self.regularized_loss = self.loss + 
self.l2_reg_lambda * l2_loss # regularization 194 | 195 | # Train step 196 | with tf.name_scope("Train_Step"): 197 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 198 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_rnn.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | 10 | # structure2vec 11 | # DE-MF : discriminative embedding using Mean Field 12 | 13 | 14 | class NetworkLSTM: 15 | 16 | def __init__(self, 17 | features_size, # Dimensione delle features del nodo 18 | embedding_size, # Dimensione dell'embedding del vettore 19 | max_lv, 20 | T_iterations, 21 | learning_rate, # Learning rate 22 | l2_reg_lambda, 23 | batch_size, 24 | max_instructions, 25 | max_nodes, 26 | rnn_depth, 27 | rnn_kind, 28 | embedding_matrix, 29 | trainable_embeddings, 30 | dense_layer_size, 31 | num_classes 32 | ): 33 | print("Features size"+str(features_size)) 34 | self.features_size = features_size 35 | self.embedding_size = embedding_size 36 | self.max_lv = max_lv 37 | self.T_iterations = T_iterations 38 | self.learning_rate = learning_rate 39 | self.l2_reg_lambda = l2_reg_lambda 40 | self.RRN_HIDDEN = features_size 41 | self.batch_size = batch_size 42 | self.max_instructions = max_instructions 43 | self.max_nodes = max_nodes 44 | self.rnn_depth = rnn_depth 45 | self.rnn_kind=rnn_kind 46 | self.embedding_matrix = embedding_matrix 47 | self.trainable_embeddings = trainable_embeddings 48 | self.dense_layer_size = dense_layer_size 49 | self.number_of_classes = num_classes 50 | self.generateGraphClassificationNetwork() 51 | 52 | def extract_axis_1(self, data, ind): 53 | """ 54 | Get specified elements along the first axis of tensor. 55 | :param data: Tensorflow tensor that will be subsetted. 56 | :param ind: Indices to take (one for each element along axis 0 of data). 57 | :return: Subsetted tensor. 
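        Note: callers pass per-block sequence lengths as `ind`; the body applies
        tf.nn.relu(ind - 1) so that a length of n selects the RNN output at index n-1,
        and lengths of zero (if any) are clamped to index 0 rather than -1.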
58 | """ 59 | ind=tf.nn.relu(ind-1) 60 | batch_range = tf.range(tf.shape(data)[0]) 61 | indices = tf.stack([batch_range, ind], axis=1) 62 | res = tf.gather_nd(data, indices) 63 | 64 | return res 65 | 66 | def lstmFeatures(self, input_x, lengths): 67 | flattened_inputs=tf.reshape(input_x,[-1,tf.shape(input_x)[2]],name="Flattening") 68 | flattened_lenghts = tf.reshape(lengths, [-1]) 69 | max = tf.reduce_max(flattened_lenghts) 70 | flattened_inputs=flattened_inputs[:,:max] 71 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 72 | zeros = tf.zeros(tf.shape(flattened_lenghts)[0], dtype=tf.int32) 73 | mask = tf.not_equal(flattened_lenghts, zeros) 74 | int_mask = tf.cast(mask, tf.int32) 75 | fake_output = tf.zeros([self.features_size], dtype=tf.float32) 76 | partitions = tf.dynamic_partition(flattened_embedded, int_mask, 2) 77 | real_nodes=partitions[1] 78 | real_lenghts=tf.boolean_mask(flattened_lenghts,mask) 79 | fake_zero = tf.tile([fake_output], [tf.shape(flattened_embedded)[0] - tf.shape(partitions[1])[0], 1]) 80 | 81 | if self.rnn_kind==0: 82 | rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in ([self.features_size] * self.rnn_depth)] 83 | else: 84 | rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in ([self.features_size] * self.rnn_depth)] 85 | cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) 86 | rnn_outputs, _ = tf.nn.dynamic_rnn(cell, real_nodes, sequence_length=real_lenghts, dtype=tf.float32, 87 | time_major=False, parallel_iterations=88) 88 | last_outputs = self.extract_axis_1(rnn_outputs, real_lenghts) 89 | condition_indices = tf.dynamic_partition( 90 | tf.range(tf.shape(flattened_embedded)[0]), int_mask, 2) 91 | last_outputs = tf.dynamic_stitch(condition_indices, [fake_zero, last_outputs]) 92 | gather_output2 = tf.reshape(last_outputs, [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 93 | 94 | output = tf.identity(gather_output2, name="LSTMOutput") 95 | output=tf.nn.l2_normalize(output) 96 | return output 97 | 98 | def meanField(self, input_x, input_adj, name): 99 | 100 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 101 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 102 | 103 | CONV_PARAMS_tiled = [] 104 | for lv in range(self.max_lv): 105 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 106 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 107 | 108 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 109 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 110 | out = w1xv 111 | for i in range(self.T_iterations - 1): 112 | ol = l 113 | lv = self.max_lv - 1 114 | while lv >= 0: 115 | with tf.name_scope('cell_' + str(lv)) as scope: 116 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 117 | if lv > 0: 118 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 119 | else: 120 | ol = node_linear 121 | lv -= 1 122 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 123 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 124 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 125 | name=name + "_y_potential_expand_dims") 126 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 127 | return graph_embedding 128 | 129 | def generateGraphClassificationNetwork(self): 130 | 131 
| print("Features size:" + str(self.features_size)) 132 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 133 | trainable=self.trainable_embeddings, 134 | name="instruction_embedding", dtype=tf.float32) 135 | 136 | self.x = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_1") 137 | self.adj = tf.placeholder(tf.float32, [None, None, None], name="adj_1") 138 | self.lenghts = tf.placeholder(tf.int32, [None, None], name='lenghts_1') 139 | self.y = tf.placeholder(tf.int32, [None], name='y_') 140 | 141 | self.norms = [] 142 | 143 | l2_loss = tf.constant(0.0) 144 | 145 | # ------------------------------- 146 | # 1. MEAN FIELD COMPONENT 147 | # ------------------------------- 148 | 149 | # 1. parameters for MeanField 150 | with tf.name_scope('parameters_MeanField'): 151 | 152 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 153 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 154 | self.norms.append(tf.norm(self.W1)) 155 | 156 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 157 | self.CONV_PARAMS = [] 158 | for lv in range(self.max_lv): 159 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 160 | name="CONV_PARAMS_" + str(lv)) 161 | self.CONV_PARAMS.append(v) 162 | self.norms.append(tf.norm(v)) 163 | 164 | # W2 is another [p,p] matrix to transform the embedding vector 165 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="W2") 166 | self.norms.append(tf.norm(self.W2)) 167 | 168 | # 169 | # LSTMExtraction 170 | with tf.name_scope('BlockFeatureExtraction'): 171 | self.block_features = self.lstmFeatures(self.x, self.lenghts) 172 | 173 | # Mean Field 174 | with tf.name_scope('MeanField'): 175 | self.graph_embedding = tf.nn.l2_normalize( 176 | tf.squeeze(self.meanField(self.block_features, self.adj, "MeanField1"), axis=1), axis=1, name="embedding") 177 | 178 | with tf.name_scope('Hidden_Layer'): 179 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 180 | 181 | with tf.name_scope('Output_Layer'): 182 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 183 | 184 | with tf.name_scope('Prediction'): 185 | self.pred_classes = tf.argmax(self.logits, axis=1) 186 | self.pred_probab = tf.nn.softmax(self.logits) 187 | 188 | # Regularization 189 | with tf.name_scope("Regularization"): 190 | l2_loss += tf.nn.l2_loss(self.W1) 191 | for lv in range(self.max_lv): 192 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 193 | l2_loss += tf.nn.l2_loss(self.W2) 194 | 195 | # CalculateMean cross-entropy loss 196 | with tf.name_scope("Loss"): 197 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 198 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 199 | 200 | # Train step 201 | with tf.name_scope("Train_Step"): 202 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 203 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_trainer.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | 8 
| from compiler_provenance.s2v_classification_network_arith_mean import NetworkLSTM as arithMeanNetwork 9 | from compiler_provenance.s2v_classification_network_rnn import NetworkLSTM as rrnFastMeanNetwork 10 | from compiler_provenance.s2v_classification_network_annotations import Network as annotationNetwork 11 | from compiler_provenance.s2v_classification_network_attention_mean import Network as weightedMeanNetwork 12 | 13 | from compiler_provenance.FunctionFactory import PairFactory as FunctionFactory 14 | 15 | import tensorflow as tf 16 | import random 17 | import sys, os 18 | import numpy as np 19 | from sklearn import metrics 20 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 21 | import matplotlib 22 | import sqlite3 23 | import pickle 24 | matplotlib.use('Agg') 25 | import matplotlib.pyplot as plt 26 | import itertools 27 | import tqdm 28 | 29 | class S2VTrainerLSTM: 30 | 31 | def __init__(self, flags, embedding_matrix): 32 | self.embedding_size = flags.embedding_size 33 | self.max_lv = flags.max_lv 34 | self.num_epochs = flags.num_epochs 35 | self.learning_rate = flags.learning_rate 36 | self.l2_reg_lambda = flags.l2_reg_lambda 37 | self.num_checkpoints = flags.num_checkpoints 38 | self.logdir = flags.logdir 39 | self.logger = flags.logger 40 | self.T_iterations = flags.T_iterations 41 | self.seed = flags.seed 42 | self.batch_size = flags.batch_size 43 | self.max_instructions = flags.max_instructions 44 | self.rnn_depth = flags.rnn_depth 45 | self.rnn_kind = flags.rnn_kind 46 | self.max_nodes = flags.MAX_NUM_VERTICES 47 | self.embeddings_matrix = embedding_matrix 48 | self.session = None 49 | self.db_name = flags.db_name 50 | self.feature_type = flags.feature_type 51 | self.json_asm2id = flags.json_asm2id 52 | self.trainable_embeddings = flags.trainable_embeddings 53 | self.network_type = flags.network_type 54 | self.cross_val = flags.cross_val 55 | self.dense_layer_size = flags.dense_layer_size 56 | self.flags = flags 57 | self.functions = False 58 | 59 | if flags.class_kind == "CMP" or flags.class_kind=="FML": 60 | query_str="SELECT DISTINCT compiler FROM functions" 61 | elif flags.class_kind == "CMPOPT": 62 | query_str = "SELECT DISTINCT compiler,optimization FROM functions" 63 | elif flags.class_kind == "OPT": 64 | query_str = "SELECT DISTINCT optimization FROM functions" 65 | 66 | conn = sqlite3.connect(self.db_name) 67 | cur = conn.cursor() 68 | print("Looking in db for classes") 69 | q = cur.execute(query_str) 70 | q_compilers = q.fetchall() 71 | #q_compilers = [c[0] for c in compilers] 72 | compilers = [] 73 | 74 | for c in q_compilers: 75 | if flags.class_kind == "CMPOPT": 76 | compiler = c[0] + '-' + c[1] 77 | elif flags.class_kind == "FML": 78 | compiler = str(c[0]).split('-')[0] 79 | 80 | else: 81 | compiler = c[0] 82 | compilers.append(compiler) 83 | 84 | print(compilers) 85 | 86 | 87 | compilers = list(set(compilers)) 88 | conn.close() 89 | 90 | self.encoder = LabelEncoder() 91 | self.encoder.fit(compilers) 92 | self.num_classes = len(self.encoder.classes_) 93 | 94 | print("Num classes = " + str(self.num_classes)) 95 | 96 | random.seed(self.seed) 97 | np.random.seed(self.seed) 98 | 99 | print(self.db_name) 100 | 101 | def plot_confusion_matrix(self, cm, classes, normalize=False, 102 | title='Confusion matrix', 103 | cmap=plt.cm.Blues): 104 | """ 105 | This function prints and plots the confusion matrix. 106 | Normalization can be applied by setting `normalize=True`. 
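        The matrix is drawn onto the current matplotlib figure; the caller (train)
        wraps this call in plt.figure()/plt.savefig() to produce
        best_test_confusion_matrix.png.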
107 | """ 108 | if normalize: 109 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 110 | 111 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 112 | plt.title(title) 113 | plt.colorbar() 114 | tick_marks = np.arange(len(classes)) 115 | plt.xticks(tick_marks, classes, rotation=45) 116 | plt.yticks(tick_marks, classes) 117 | 118 | fmt = '.2f' if normalize else 'd' 119 | thresh = cm.max() / 2. 120 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 121 | plt.text(j, i, format(cm[i, j], fmt), 122 | horizontalalignment="center", 123 | color="white" if cm[i, j] > thresh else "black") 124 | 125 | plt.tight_layout() 126 | plt.ylabel('True label') 127 | plt.xlabel('Predicted label') 128 | 129 | def loadmodel(self): 130 | tf.reset_default_graph() 131 | with tf.Graph().as_default() as g: 132 | session_conf = tf.ConfigProto( 133 | allow_soft_placement=True, 134 | log_device_placement=False 135 | ) 136 | sess = tf.Session(config=session_conf) 137 | 138 | # Sets the graph-level random seed. 139 | tf.set_random_seed(self.seed) 140 | 141 | self.createNetwork() 142 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints) 143 | checkpoint_dir = os.path.abspath(os.path.join(self.logdir, "checkpoints")) 144 | saver.restore(sess, os.path.join(checkpoint_dir, "model")) 145 | self.session = sess 146 | return 147 | 148 | 149 | def createNetwork(self): 150 | self.features_size = np.shape(self.embeddings_matrix)[1] 151 | if self.network_type == "Arith_Mean": 152 | 153 | self.network = arithMeanNetwork( 154 | features_size=self.features_size, 155 | embedding_size=self.embedding_size, 156 | max_lv=self.max_lv, 157 | T_iterations=self.T_iterations, 158 | learning_rate=self.learning_rate, 159 | l2_reg_lambda=self.l2_reg_lambda, 160 | batch_size=self.batch_size, 161 | max_instructions=self.max_instructions, 162 | max_nodes=self.max_nodes, 163 | rnn_depth=self.rnn_depth, 164 | rnn_kind=self.rnn_kind, 165 | embedding_matrix=self.embeddings_matrix, 166 | trainable_embeddings=self.trainable_embeddings, 167 | num_classes=self.num_classes, 168 | dense_layer_size=self.dense_layer_size 169 | ) 170 | 171 | if self.network_type == "RNN": 172 | 173 | self.network = rrnFastMeanNetwork( 174 | features_size=self.features_size, 175 | embedding_size=self.embedding_size, 176 | max_lv=self.max_lv, 177 | T_iterations=self.T_iterations, 178 | learning_rate=self.learning_rate, 179 | l2_reg_lambda=self.l2_reg_lambda, 180 | batch_size=self.batch_size, 181 | max_instructions = self.max_instructions, 182 | max_nodes = self.max_nodes, 183 | rnn_depth = self.rnn_depth, 184 | rnn_kind=self.rnn_kind, 185 | embedding_matrix=self.embeddings_matrix, 186 | trainable_embeddings=self.trainable_embeddings, 187 | dense_layer_size=self.dense_layer_size, 188 | num_classes=self.num_classes 189 | ) 190 | 191 | if self.network_type == "Attention_Mean": 192 | 193 | self.network = weightedMeanNetwork( 194 | features_size=self.features_size, 195 | embedding_size=self.embedding_size, 196 | max_lv=self.max_lv, 197 | T_iterations=self.T_iterations, 198 | learning_rate=self.learning_rate, 199 | l2_reg_lambda=self.l2_reg_lambda, 200 | batch_size=self.batch_size, 201 | max_instructions = self.max_instructions, 202 | max_nodes = self.max_nodes, 203 | rnn_depth = self.rnn_depth, 204 | rnn_kind=self.rnn_kind, 205 | embedding_matrix=self.embeddings_matrix, 206 | trainable_embeddings=self.trainable_embeddings, 207 | dense_layer_size=self.dense_layer_size, 208 | num_classes = self.num_classes 209 | ) 210 | 211 | if 
self.network_type == "Annotations": 212 | self.features_size = 8 213 | self.network = annotationNetwork( 214 | features_size=self.features_size, 215 | embedding_size=self.embedding_size, 216 | max_lv=self.max_lv, 217 | T_iterations=self.T_iterations, 218 | learning_rate=self.learning_rate, 219 | l2_reg_lambda=self.l2_reg_lambda, 220 | dense_layer_size=self.dense_layer_size, 221 | num_classes=self.num_classes 222 | ) 223 | 224 | def read_weight(self): 225 | a = self.session.run(self.session.graph.get_tensor_by_name('LSTMExtraction1/lstm1/W0:0')) 226 | plt.bar(range(0, 150), a[0]) 227 | plt.show() 228 | plt.savefig('/home/massarelli/weight.pdf') 229 | 230 | 231 | def train(self): 232 | tf.reset_default_graph() 233 | with tf.Graph().as_default() as g: 234 | session_conf = tf.ConfigProto( 235 | allow_soft_placement=True, 236 | log_device_placement=False 237 | ) 238 | sess = tf.Session(config=session_conf) 239 | 240 | # Sets the graph-level random seed. 241 | tf.set_random_seed(self.seed) 242 | 243 | self.createNetwork() 244 | 245 | print("Network created") 246 | 247 | # Initialize all variables 248 | sess.run(tf.global_variables_initializer()) 249 | 250 | # TensorBoard 251 | # Summaries for loss and accuracy 252 | loss_summary = tf.summary.scalar("loss", self.network.loss) 253 | 254 | # Train Summaries 255 | train_summary_op = tf.summary.merge([loss_summary]) 256 | train_summary_dir = os.path.join(self.logdir, "summaries", "train") 257 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 258 | 259 | # Validation summaries 260 | val_summary_op = tf.summary.merge([loss_summary]) 261 | val_summary_dir = os.path.join(self.logdir, "summaries", "validation") 262 | val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph) 263 | 264 | # Test summaries 265 | test_summary_op = tf.summary.merge([loss_summary]) 266 | test_summary_dir = os.path.join(self.logdir, "summaries", "test") 267 | test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph) 268 | 269 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 270 | checkpoint_dir = os.path.abspath(os.path.join(self.logdir, "checkpoints")) 271 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 272 | if not os.path.exists(checkpoint_dir): 273 | os.makedirs(checkpoint_dir) 274 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints) 275 | 276 | BEST_ACCURACY = 0 277 | stat_file = open(str(self.logdir) + "/epoch_stats.tsv", "w") 278 | stat_file.write("#epoch\ttrain_loss\tval_loss\tval_auc\ttest_loss\ttest_auc\n") 279 | 280 | print("Creating functions factories...") 281 | sys.stdout.flush() 282 | 283 | p_train = FunctionFactory(self.db_name, self.feature_type, 'train', self.json_asm2id, self.max_instructions, self.max_nodes, self.encoder,self.batch_size,self.flags) 284 | p_validation = FunctionFactory(self.db_name, self.feature_type, 'validation', self.json_asm2id, self.max_instructions, self.max_nodes, self.encoder,self.batch_size,self.flags) 285 | p_test = FunctionFactory(self.db_name, self.feature_type, 'test', self.json_asm2id,self.max_instructions, self.max_nodes, self.encoder,self.batch_size,self.flags) 286 | 287 | print("Starting train!") 288 | sys.stdout.flush() 289 | 290 | step = 0 291 | for epoch in range(0, self.num_epochs): 292 | epoch_msg = "" 293 | epoch_msg += " epoch: {}\n".format(epoch) 294 | 295 | epoch_loss = 0 296 | 297 | # ----------------------# 298 | # TRAIN # 299 | # ----------------------# 300 | n_batch=0 301 | for adj_batch, nodes_batch, y_batch, len_batch in tqdm.tqdm(p_train.async_chunker(epoch%25, self.batch_size, shuffle=True), total=p_train.num_batches): 302 | 303 | assert len(adj_batch) 304 | 305 | feed_dict = { 306 | self.network.x: nodes_batch, 307 | self.network.adj: adj_batch, 308 | self.network.lenghts: len_batch, 309 | self.network.y: y_batch, 310 | } 311 | 312 | summaries, _, loss, norms = sess.run( 313 | [train_summary_op, self.network.train_step, self.network.loss, self.network.norms], 314 | feed_dict=feed_dict) 315 | 316 | n_batch=n_batch+1 317 | 318 | # tensorboard 319 | train_summary_writer.add_summary(summaries, step) 320 | epoch_loss += loss * p_train.batch_dim # ??? 
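                # self.network.loss is a mean over the batch, so scaling it by batch_dim
                # (presumably the number of samples the factory put in this batch) turns it
                # back into a sum; dividing by p_train.num_pairs after the loop
                # (epoch_loss /= p_train.num_pairs) then gives a per-sample mean for the
                # epoch, which stays correct even when the last batch is smaller.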
321 | step += 1 322 | 323 | # recap epoch 324 | epoch_loss /= p_train.num_pairs 325 | 326 | # ----------------------# 327 | # VALIDATION # 328 | # ----------------------# 329 | val_loss = 0 330 | epoch_msg += "\n" 331 | val_y = [] 332 | val_pred = [] 333 | print("Validating") 334 | for adj_batch, nodes_batch, y_batch, len_batch in tqdm.tqdm(p_validation.async_chunker(0, self.batch_size),total=p_validation.num_batches): 335 | feed_dict = { 336 | self.network.x: nodes_batch, 337 | self.network.adj: adj_batch, 338 | self.network.lenghts: len_batch, 339 | self.network.y: y_batch, 340 | } 341 | 342 | summaries, loss, pred_probab, pred_classes = sess.run( 343 | [val_summary_op, self.network.loss, self.network.pred_probab, self.network.pred_classes], feed_dict=feed_dict) 344 | val_loss += loss * p_validation.batch_dim 345 | val_summary_writer.add_summary(summaries, step) 346 | val_y.extend(y_batch) 347 | val_pred.extend(pred_classes) 348 | val_loss /= p_validation.num_pairs 349 | 350 | val_accuracy = metrics.accuracy_score(val_y, val_pred) 351 | 352 | val_report = metrics.classification_report(val_y, val_pred, target_names=self.encoder.classes_) 353 | 354 | tmp = val_report.split("\n") 355 | val_report = "" 356 | for l in tmp: 357 | val_report += "\t\t" + l + "\n" 358 | 359 | stri = "\tepoch {} / {}, train loss {:g}, val loss {:g}, val accuracy {:g}\n".format(epoch, self.num_epochs, epoch_loss, val_loss, val_accuracy) 360 | 361 | epoch_msg += stri 362 | 363 | sys.stdout.write(stri) 364 | 365 | sys.stdout.flush() 366 | 367 | # execute test only if validation auc increased 368 | test_loss = "-" 369 | test_auc = "-" 370 | 371 | if val_accuracy > BEST_ACCURACY and self.cross_val: 372 | BEST_ACCURACY = val_accuracy 373 | saver.save(sess, checkpoint_prefix) 374 | print("\nNEW BEST_VAL_ACCURACY: {} !\n".format(BEST_ACCURACY)) 375 | 376 | if val_accuracy > BEST_ACCURACY and not self.cross_val: 377 | BEST_ACCURACY = val_accuracy 378 | 379 | sys.stdout.write("\t" + "-*"*40 + "\n") 380 | 381 | stri = "\tNEW BEST_ACCURACY: {} !\n\tVal Classification Report: \n {} \n".format(BEST_ACCURACY, val_report) 382 | 383 | epoch_msg += stri 384 | sys.stdout.write(stri) 385 | 386 | # save best model 387 | saver.save(sess, checkpoint_prefix) 388 | 389 | # ----------------------# 390 | # TEST # 391 | # ----------------------# 392 | 393 | # TEST 394 | test_loss = 0 395 | epoch_msg += "\n" 396 | test_y = [] 397 | test_pred = [] 398 | print("Testing") 399 | for adj_batch, nodes_batch, y_batch, len_batch in tqdm.tqdm(p_test.async_chunker(0, self.batch_size),total=p_test.num_batches): 400 | 401 | feed_dict = { 402 | self.network.x: nodes_batch, 403 | self.network.adj: adj_batch, 404 | self.network.lenghts: len_batch, 405 | self.network.y: y_batch, 406 | } 407 | 408 | summaries, loss, pred_probab, pred_classes = sess.run( 409 | [val_summary_op, self.network.loss, self.network.pred_probab, self.network.pred_classes], 410 | feed_dict=feed_dict) 411 | test_loss += loss * p_test.batch_dim 412 | test_summary_writer.add_summary(summaries, step) 413 | test_y.extend(y_batch) 414 | test_pred.extend(pred_classes) 415 | test_loss /= p_test.num_pairs 416 | 417 | test_accuracy = metrics.accuracy_score(test_y, test_pred) 418 | 419 | test_report = metrics.classification_report(test_y, test_pred, target_names=self.encoder.classes_) 420 | 421 | tmp = test_report.split("\n") 422 | test_report = "" 423 | for l in tmp: 424 | test_report += "\t\t" + l + "\n" 425 | 426 | # Compute confusion matrix 427 | cnf_matrix = metrics.confusion_matrix(test_y, 
test_pred) 428 | np.set_printoptions(precision=2) 429 | np.savetxt(str(self.logdir) + "/best_test_confusion_matrix.csv", cnf_matrix, delimiter=',') 430 | 431 | fig=plt.figure() 432 | self.plot_confusion_matrix(cnf_matrix, self.encoder.classes_) 433 | plt.savefig(str(self.logdir) + "/best_test_confusion_matrix.png") 434 | plt.close(fig) 435 | 436 | tmp = str(cnf_matrix).split('\n') 437 | scnf = "" 438 | for l in tmp: 439 | scnf += "\t\t" + l + "\n" 440 | 441 | stri = "\tTest_loss : {}\n\tTest Accuracy: {}\n\tTest Classification Report:\n {} \tTest Confusion Matrix : \n {} \n".format(test_loss, test_accuracy, test_report, scnf) 442 | epoch_msg += stri 443 | 444 | sys.stdout.write(stri) 445 | 446 | sys.stdout.write("\t" + "-*"*40 + "\n") 447 | 448 | stat_file.write( 449 | "{}\t{}\t{}\t{}\t{}\t{}\n".format(epoch, epoch_loss, val_loss, val_accuracy, test_loss, test_accuracy)) 450 | self.logger.info("\n *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-\n {} \n".format(epoch_msg)) 451 | stat_file.close() 452 | sess.close() 453 | return BEST_ACCURACY 454 | -------------------------------------------------------------------------------- /compiler_provenance/train.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | from compiler_provenance.s2v_trainer import S2VTrainerLSTM 8 | from compiler_provenance.parameters import Flags 9 | import numpy as np 10 | 11 | 12 | def run_test(): 13 | flags = Flags() 14 | flags.logger.info("\n{}\n".format(flags)) 15 | 16 | print(str(flags)) 17 | 18 | file_embedding_matrix = flags.file_embedding_matrix 19 | 20 | embedding_matrix = np.float32(np.load(file_embedding_matrix)) 21 | if flags.random_embedding: 22 | embedding_matrix = np.random.rand(*np.shape(embedding_matrix)).astype(np.float32) 23 | embedding_matrix[0, :] = np.zeros(np.shape(embedding_matrix)[1]).astype(np.float32) 24 | 25 | if flags.cross_val: 26 | print("STARTING CROSS VALIDATION") 27 | res = [] 28 | mean = 0 29 | for i in range(0, flags.cross_val_fold): 30 | print("CROSS VALIDATION STARTING FOLD: " + str(i)) 31 | if i > 0: 32 | flags.close_log() 33 | flags.reset_logdir() 34 | del flags 35 | flags = Flags() 36 | flags.logger.info("\n{}\n".format(flags)) 37 | 38 | flags.logger.info("Starting cross validation fold: {}".format(i)) 39 | 40 | flags.db_name = flags.db_name + "_val_" + str(i+1) + ".db" 41 | flags.logger.info("Cross validation db name: {}".format(flags.db_name)) 42 | 43 | trainer = S2VTrainerLSTM(flags, embedding_matrix) 44 | best_val_auc = trainer.train() 45 | 46 | mean += best_val_auc 47 | res.append(best_val_auc) 48 | 49 | flags.logger.info("Cross validation fold {} finished best auc: {}".format(i, best_val_auc)) 50 | print("FINISH FOLD: " + str(i) + " BEST VAL AUC: " + str(best_val_auc)) 51 | 52 | print("CROSS VALIDATION ENDED") 53 | print("Result: " + str(res)) 54 | print("") 55 | 56 | flags.logger.info("Cross validation finished results: {}".format(res)) 57 | flags.logger.info(" mean: {}".format(mean / flags.cross_val_fold)) 58 | flags.close_log() 59 | 60 | flags.close_log() 61 | 62 | else: 63 | trainer = S2VTrainerLSTM(flags, embedding_matrix) 64 | trainer.train() 65 | flags.close_log() 66 | 67 | 68 | if __name__ == '__main__': 69 | run_test() 70 | -------------------------------------------------------------------------------- /compiler_provenance/train.sh: 
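Note on the script below: it selects one network type and one classification target, points train.py at the dataset, the instruction-embedding matrix and the instruction-to-id dictionary, then launches training. The -r (random embeddings) and -te (trainable embeddings) switches are declared but not passed in the default command; as a sketch, assuming the same paths, trainable embeddings would be enabled by appending the corresponding variable:

python3 train.py --o $MODEL_PATH -n $DB_PATH -nn $NETWORK_TYPE -e $EMBEDDING_MATRIX -j $INS2ID -cl $CLASSIFICATION_KIND $TRAINABLE_EMBEDDINGS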
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Type of the network to use 4 | 5 | NETWORK_TYPE="Attention_Mean" 6 | # NETWORK_TYPE="Arith_Mean" 7 | # NETWORK_TYPE="RNN" 8 | # NETWORK_TYPE="Annotations" 9 | 10 | # What to classify: 11 | CLASSIFICATION_KIND="Family" # Compiler Family 12 | # CLASSIFICATION_KIND="Compiler" # Compiler Family + Version 13 | # CLASSIFICATION_KIND="Compiler+Opt" # Compiler Familt + Version + Optimization 14 | # CLASSIFICATION_KIND="Opt" # Optimization 15 | 16 | 17 | # Root path for the experiment 18 | MODEL_PATH=experiments/ 19 | 20 | # Path to the sqlite db with diassembled functions 21 | DB_PATH=../data/restricted_compilers_dataset.db 22 | 23 | # Path to embedding matrix 24 | EMBEDDING_MATRIX=../data/i2v/embedding_matrix.npy 25 | 26 | # Path to instruction2id dictionary 27 | INS2ID=../data/i2v/word2id.json 28 | 29 | # Add this argument to train.py to use random instructions embeddings 30 | RANDOM_EMBEDDINGS="-r" 31 | 32 | # Add this argument to train.py to use trainable instructions embeddings 33 | TRAINABLE_EMBEDDINGS="-te" 34 | 35 | python3 train.py --o $MODEL_PATH -n $DB_PATH -nn $NETWORK_TYPE -e $EMBEDDING_MATRIX -j $INS2ID -cl $CLASSIFICATION_KIND 36 | 37 | -------------------------------------------------------------------------------- /compiler_provenance/utils.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import numpy as np 8 | 9 | 10 | def __padAndFilter(input_pairs, input_labels, input_len, max_num_vertices): 11 | 12 | output_pairs = [] 13 | output_labels = [] 14 | output_len = [] 15 | 16 | for pair, label, lens in zip(input_pairs, input_labels, input_len): 17 | try: 18 | g1 = pair[0] 19 | 20 | # graph 1 21 | adj1 = g1[0] 22 | nodes1 = g1[1] 23 | 24 | if len(nodes1) <= max_num_vertices: 25 | # graph 1 26 | pad_lenght1 = max_num_vertices - len(nodes1) 27 | new_node1 = np.pad(nodes1, [(0, pad_lenght1), (0, 0)], mode='constant') 28 | pad_lenght1 = max_num_vertices - adj1.shape[0] 29 | 30 | # pass to dense for padding 31 | adj1_dense = np.pad(adj1.todense(), [(0, pad_lenght1), (0, pad_lenght1)], mode='constant') 32 | 33 | g1 = (adj1_dense, new_node1) 34 | output_pairs.append([g1]) 35 | output_labels.append(label) 36 | 37 | new_lens_0 = lens + [0]*(max_num_vertices-len(lens)) 38 | output_len.append([new_lens_0]) 39 | except: 40 | pass 41 | 42 | return output_pairs, output_labels, output_len 43 | 44 | -------------------------------------------------------------------------------- /dataset_creation/BlockFeaturesExtractor.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | 8 | # Questa classe estra da un blocco di codice assembler le features utilizzate nell'articolo ccs17 9 | 10 | class BlockFeaturesExtractor: 11 | x86_ARIT = 0 12 | x86_MOV = 0 13 | string = [] 14 | dyn_string = [] 15 | constants = [] 16 | num_transfer = 0 17 | num_instructions = 0 18 | num_calls = 0 19 | num_arith = 0 20 | 21 | def __init__(self, architecture, instructions, r2_disasm, string_addr): 22 | self.architecture = architecture 23 | self.instructions = instructions 24 | self.r2_disasm = r2_disasm 25 | self.string_addr = string_addr 26 | 27 | 
self.string = [] 28 | self.constant = [] 29 | self.num_transfer = 0 30 | self.num_instructions = 0 31 | self.num_calls = 0 32 | self.num_arith = 0 33 | 34 | def getFeatures(self): 35 | if len(self.instructions) != 0: 36 | self.num_instructions = len(self.instructions) 37 | self.constant, self.string = self.extractConstansStrings() 38 | self.num_transfer = self.countTransfer() 39 | self.num_calls = self.countCalls() 40 | self.num_arith = self.countArith() 41 | 42 | return ({'string': self.string, 'constant': self.constant, 43 | 'transfer': self.num_transfer, 'instruction': self.num_instructions, 44 | 'call': self.num_calls, 'arith': self.num_arith}) 45 | 46 | def countCalls(self): 47 | x86_mnemonics = ['call', 'int'] 48 | arm_mnemonics = ['bl', 'blx'] 49 | mips_mnemonics = ['jal', 'jalr', 'syscall'] 50 | 51 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 52 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 53 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 54 | 55 | count = 0 56 | for i in list(self.instructions): 57 | if self.architecture == 'x86': 58 | if str(i['mnemonic']) in x86_mnemonics: 59 | count = count + 1 60 | elif self.architecture == 'mips': 61 | if str(i['mnemonic']) in mips_mnemonics: 62 | count = count + 1 63 | elif self.architecture == 'arm': 64 | if str(i['mnemonic']) in arm_mnemonics: 65 | count = count + 1 66 | return count 67 | 68 | # Questa funzione conta le istruzione aritmetiche all'interno del blocco 69 | def countArith(self): 70 | x86_mnemonics = ['add', 'sub', 'div', 'imul', 'idiv', 'mul', 'shl', 'dec', 'adc', 'adcx', 'addpd', 'addps', 71 | 'addsd', 'addss', 'addsubpd', 'ADDSUBPS', 'adox', 'divpd', 'divps' 72 | , 'divsd', 'divss', 'dppd', 'dpps', 'f2xm1', 'fabs', 'fadd', 'faddp', 'fcos', 'fdiv', 'fdivp', 'fiadd', 73 | 'fidiv', 'fimul', 'fisub', 'fisubr', 'fmul', 'fmulp', 'FPATAN', 'FPREM', 'FPREM1', 'FPTAN', 74 | 'FRNDINT', 'FSCALE' 75 | , 'FSIN', 'FSINCOS', 'FSQRT', 'FSUB', 'FSUBP', 'FSUBR', 'FSUBRP', 'FYL2X', 'FYL2XP1', 'HADDPD', 'HADDPS', 76 | 'HSUBPD', 'HSUBPS', 'KADDB', 'KADDD', 'KADDD', 'KADDW', 'KSHIFTLB', 'KSHIFTLD', 'KSHIFTLQ', 77 | 'KSHIFTLW', 'KSHIFTRB', 'KSHIFTRD', 'KSHIFTRQ', 'KSHIFTRW' 78 | , 'MAXPD', 'MAXPS', 'MAXSD', 'MAXSS', 'MINPD', 'MINPS', 'MINSD', 'MINSS', 'MULPD' 79 | , 'MULPS', 'MULSS', 'MULSD', 'MULX', 'PADDB', 'PADDD', 'PADDQ', 'PADDSB', 'PADDSW', 'PADDUSB', 'PADDUSW' 80 | , 'PADDW', 'PAVGB', 'PAVGW', 'PHADDD', 'PHADDSW', 'PHADDW', 'PHMINPOSUW', 'PHSUBD', 'PHSUBSW', 'PHSUBW' 81 | , 'PMADDUBSW', 'PMADDWD', 'PMAXSB', 'PMAXSD', 'PMAXSQ', 'PMAXSW', 'PMAXUB', 'PMAXUD', 'PMAXUQ', 'PMAXUW', 82 | 'PMINSB' 83 | , 'PMINSD', 'PMINSQ', 'PMINSW', 'PMINUB', 'PMINUD', 'PMINUQ', 'PMINUW', 'PMULDQ', 'PMULHRSW', 'PMULHUW', 84 | 'PMULHW', 'PMULLD', 'PMULLQ' 85 | , 'PMULLW', 'PMULUDQ', 'PSADBW', 'PSLLD', 'PSLLW', 'PSRAD', 'PSLLQ', 'PSRAQ', 'PSRLQ', 'PSRLW' 86 | , 'PSUBB', 'PSUBD', 'PSUBQ', 'PSUBSB', 'PSUBSW', 'PSUBUSB', 'PSUBUSW', 'RCL', 'RCR' 87 | , 'ROL', 'ROR', 'ROUNDPD', 'ROUNDPS', 'ROUNDSD', 'ROUNDSS', 'RSQRTPS' 88 | , 'RSQRTSS', 'SAL', 'SAR', 'SARX', 'SBB', 'inc', 'SHLD', 'SHLX', 'SHR', 'SHRD', 'SHRX', 'SQRTPD', 'SQRTPS', 89 | 'SQRTSD', 'SQRTSS' 90 | , 'SUBPD', 'SUBPS', 'SUBSD', 'SUBSS', 'VFMADD132PD', 'VPSLLVD', 'VPSLLVQ', 'VPSLLVW', 'VPSRAVD', 'VPSRAVQ' 91 | , 'VPSRAVW', 'VPSRLVD', 'VPSRLVQ', 'VPSRLVW', 'VRNDSCALEPD', 'VRNDSCALEPS', 'XADD'] 92 | 93 | arm_mnemonics = ['add', 'adc', 'qadd', 'dadd', 'sub', 'SBC', 'RSB', 'RSC', 'subs', 'qsub', 94 | 'add16', 'SUB16', 'add8', 'sub8', 'ASX', 'sax', 'usad8', 'SSAT', 'MUL' 95 | , 'smul', 'MLA', 
'MLs', 'UMULL', 'UMLAL', 'UMaAL', 'SMULL', 'smlal' 96 | , 'SMULxy', 'SMULWy', 'SMLAxy', 'SMLAWy', 'SMLALxy', 'SMUAD' 97 | , 'SMLAD', 'SMLALD', 'SMUSD', 'SMLSD', 'SMLSLD', 'SMMUL' 98 | , 'SMMLA', 'MIA', 'MIAPH', 'MIAxy', 'SDIV', 'udiv' 99 | , 'ASR', 'LSL', 'LSR', 'ROR', 'RRX'] 100 | 101 | mips_mnemonics = ['add', 'addu', 'addi', 'addiu', 'mult', 'multu', 'div', 'divu' 102 | , 'AUI', 'DAUI', 'DAHI', 'DATI', 'CLO', 'CLZ', 'DADD', 'DADDI' 103 | , 'DADDIU', 'DADDU', 'DCLO', 'DCLZ', 'DDIV', 'DDIVU', 'MOD' 104 | , 'MODU', 'DMOD', 'DMODU', 'DMULTU', 'DROTR', 'DROTR32', 'DSLLV' 105 | , 'DSRA', 'DSRA32', 'DSRAV', 'DSRL', 'DSRL32' 106 | , 'DSRLV', 'DSUB', 'DSUBU', 'DSRL', 'FLOOR', 'MAX', 'MIN', 'MINA', 'MAXA' 107 | , 'MSUB', 'MSUBU', 'MUL', 'MUH', 'MULU', 'MUHU', 'DMUL', 'DMUH' 108 | , 'DMULU', 'DMUHU', 'DMUL', 'NEG' 109 | , 'NMADD', 'NMSUB', 'RECIP', 'RINT', 'ROTR', 'ROUND', 'RSQRT' 110 | , 'SLL', 'SLLV', 'SQRT', 'SRA', 'SRAV', 'SRL', 'SRLV' 111 | , 'SUB', 'SUBU', 'madd', 'maddu', 'msub', 'msubu', 'sll' 112 | , 'srl', 'sra', 'sllv', 'srla', 'srlv'] 113 | 114 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 115 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 116 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 117 | 118 | count = 0 119 | for i in list(self.instructions): 120 | if self.architecture == 'x86': 121 | if str(i['mnemonic']).lower() in x86_mnemonics: 122 | count = count + 1 123 | elif self.architecture == 'mips': 124 | if str(i['mnemonic']).lower() in mips_mnemonics: 125 | count = count + 1 126 | elif self.architecture == 'arm': 127 | if str(i['mnemonic']).lower() in arm_mnemonics: 128 | count = count + 1 129 | elif self.architecture == 'arm': 130 | if str(i['mnemonic']).lower() in arm_mnemonics: 131 | count = count + 1 132 | nop = 0 133 | return count 134 | 135 | # Questa funzione conta le istruzioni logiche all'interno del blocco 136 | def countLogic(self): 137 | x86_mnemonics = ['and', 'andn', 'andnpd', 'andpd', 'andps', 'andnps', 'test', 'xor', 'xorpd', 'pslld' 138 | , 'ANDNPD', 'ANDNPS', 'ANDPD', 'ANDPS', 'KANDB', 'KANDD', 'KANDNB', 'KANDND', 'KANDNQ', 'KANDNW', 'KANDQ', 139 | 'KANDW' 140 | , 'KNOTB', 'KNOTq', 'KNOTD', 'KNOTw', 'korq', 'korb', 'korw', 'kord', 'KTESTB', 'ktestd', 'ktestq', 'ktestw' 141 | , 'KXNORB', 'KXNORd', 'KXNORq', 'KXORB', 'KXORq', 'KXORd', 'KXORw', 'NOT', 'OR', 'ORPD', 'ORPS', 'PAND', 142 | 'PAND' 143 | , 'PCMPEQB', 'PCMPEQD', 'PCMPEQQ', 'PCMPGTB', 'PTEST', 'pxor', 'VPCMPB', 'VPCMPD', 'VPCMPQ', 144 | 'VPTESTMB', 'VPTESTMD', 'VPTESTMQ', 'VPTESTMW', 'VPTESTNMB', 'VPTESTNMD', 'VPTESTNMQ', 145 | 'VPTESTNMW' 146 | , 'XORPD', 'XORPS'] 147 | arm_mnemonics = ['AND', 'EOR', 'ORR', 'ORN', 'BIC'] 148 | mips_mnemonics = ['and', 'andi', 'or', 'ori', 'xor', 'nor', 'slt', 'slti', 'sltu'] 149 | 150 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 151 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 152 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 153 | 154 | count = 0 155 | for i in list(self.instructions): 156 | if self.architecture == 'x86': 157 | if str(i['mnemonic']).lower() in x86_mnemonics: 158 | count = count + 1 159 | elif self.architecture == 'mips': 160 | if str(i['mnemonic']).lower() in mips_mnemonics: 161 | count = count + 1 162 | elif self.architecture == 'arm': 163 | if str(i['mnemonic']).lower() in arm_mnemonics: 164 | count = count + 1 165 | return count 166 | 167 | def countTransfer(self): 168 | x86_mnemonics = ['BNDLDX', 'BNDMK', 'BNDMOV', 'BNDSTX' 169 | , 'CMOVA', 'CMOVZ', 'CMOVPO', 'CMOVPE', 'CMOVP', 'CMOVO', 'CMOVNZ', 
'CMOVNP', 'CMOVNO', 'CMOVNG', 'CMOVL' 170 | , 'FIST', 'FISTP', 'FISTTP', 'FSAVE', 'KMOVB', 'KMOVD', 'KMOVQ', 'KMOVW' 171 | , 'LDDQU', 'LDS', 'LEA', 'LODS', 'LODSB', 'LODSD', 'LODSQ', 'LODSW' 172 | , 'LSS', 'LSL', 'MOV', 'MOVAPD', 'MOVAPS', 'MOVBE', 'MOVD', 'MOVDDUP', 'MOVDQ2Q', 'MOVDQA', 'MOVDQU' 173 | , 'MOVHLPS', 'MOVHPD', 'MOVHPS', 'MOVLHPS', 'MOVLPD', 'MOVLPS', 'MOVQ', 'MOVS', 'MOVSB', 'MOVSD', 'MOVNTQ' 174 | , 'MOVNTDQ', 'MOVMSKPS', 'MOVSQ', 'MOVSS', 'MOVSW', 'MOVSX', 'MOVSXD', 'MOVUPD', 'MOVUPS', 'MOVZX', 175 | 'PMOVMSKB' 176 | , 'PMOVSX', 'PMOVZX', 'PUSH', 'PUSHA', 'PUSHAD', 'PUSHF', 'STOS', 'STOSB', 'STOSD', 'STOSQ', 'STOSW' 177 | , 'VBROADCAST', 'VEXPANDPD', 'VEXPANDPS', 'VMOVDQA32', 'VMOVDQA64', 'VMOVDQU16', 'VMOVDQU32', 'VMOVDQU64', 178 | 'VMOVDQU8' 179 | , 'VPBROADCAST', 'VPBROADCASTB', 'VPEXPANDD', 'VPEXPANDQ', 'movb', 'movq'] 180 | arm_mnemonics = ['MOV', 'MVN', 'MOVT', 'MRA', 'MAR', 'LDR', 'STR', 'PLD', 'PLI', 'PLDW', 'LDM', 'LDREX', 181 | 'LDREXD', 'STM', 'STREX', 'STREXD'] 182 | mips_mnemonics = ['LB', 'LBE', 'LBU', 'LBUE', 'LD', 'LDE', 'LDU', 'LDUE', 'LDC1', 'LDC2' 183 | , 'LDL', 'LDPC', 'LDR', 'LDXC1', 'LH', 'LHE', 'LHU', 'LHUE', 'LL' 184 | , 'LLD', 'LLE', 'LLDP', 'LLWP', 'LLWPE', 'LSA', 'LUXC1', 'LW' 185 | , 'LWC1', 'LWC2', 'LWL', 'LWLE', 'LWPC' 186 | , 'LWR', 'LWRE', 'LWU', 'MOV', 'SB', 'SBE', 'SC' 187 | , 'SCD', 'SCDP', 'SCE', 'SCWP', 'SCWPE' 188 | , 'SD', 'SDBBP', 'SDC1', 'SDC2', 'SDL', 'SDR', 'SDXC1', 'SH', 'SHU', 'SHE' 189 | , 'SW', 'SWE', 'SWC1', 'SWC2', 'SWL', 'SWR', 'SWLE', 'SWRE', 'SWXC1'] 190 | 191 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 192 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 193 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 194 | 195 | count = 0 196 | for i in list(self.instructions): 197 | if self.architecture == 'x86': 198 | if str(i['mnemonic']).lower() in x86_mnemonics: 199 | count = count + 1 200 | elif self.architecture == 'mips': 201 | if str(i['mnemonic']).lower() in mips_mnemonics: 202 | count = count + 1 203 | elif self.architecture == 'arm': 204 | if str(i['mnemonic']).lower() in arm_mnemonics: 205 | count = count + 1 206 | return count 207 | 208 | def extractConstansStrings(self): 209 | constants = 0 210 | strings = 0 211 | for i, ins in enumerate(self.instructions): 212 | if 'opex' not in ins: 213 | continue 214 | for operand in ins['opex']['operands']: 215 | if operand['type'] == 'imm': 216 | if 'disasm' in self.r2_disasm[i] and 'str.' 
in self.r2_disasm[i]['disasm']: 217 | strings += 1 218 | elif operand['value'] in self.string_addr: 219 | strings += 1 220 | else: 221 | constants += 1 222 | 223 | return (constants, strings) 224 | -------------------------------------------------------------------------------- /dataset_creation/DataSplitter.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import json 7 | import random 8 | import sqlite3 9 | from tqdm import tqdm 10 | 11 | 12 | class DataSplitter: 13 | 14 | def __init__(self, db_name): 15 | self.db_name = db_name 16 | 17 | def create_pair_table(self, table_name): 18 | conn = sqlite3.connect(self.db_name) 19 | c = conn.cursor() 20 | c.executescript("DROP TABLE IF EXISTS {} ".format(table_name)) 21 | c.execute("CREATE TABLE {} (id INTEGER PRIMARY KEY, true_pair TEXT, false_pair TEXT)".format(table_name)) 22 | conn.commit() 23 | conn.close() 24 | 25 | def get_ids(self, set_type): 26 | conn = sqlite3.connect(self.db_name) 27 | cur = conn.cursor() 28 | q = cur.execute("SELECT id FROM {}".format(set_type)) 29 | ids = q.fetchall() 30 | conn.close() 31 | return ids 32 | 33 | @staticmethod 34 | def select_similar_cfg(id, provenance, ids, cursor): 35 | q1 = cursor.execute('SELECT id FROM functions WHERE project=? AND file_name=? and function_name=?', provenance) 36 | candidates = [i[0] for i in q1.fetchall() if (i[0] != id and i[0] in ids)] 37 | if len(candidates) == 0: 38 | return None 39 | id_similar = random.choice(candidates) 40 | return id_similar 41 | 42 | @staticmethod 43 | def select_dissimilar_cfg(ids, provenance, cursor): 44 | while True: 45 | id_dissimilar = random.choice(ids) 46 | q2 = cursor.execute('SELECT project, file_name, function_name FROM functions WHERE id=?', (id_dissimilar,)) 47 | res = q2.fetchone() 48 | if res != provenance: 49 | break 50 | return id_dissimilar 51 | 52 | def create_epoch_pairs(self, epoch_number, pairs_table,id_table): 53 | random.seed = epoch_number 54 | 55 | conn = sqlite3.connect(self.db_name) 56 | cur = conn.cursor() 57 | ids = cur.execute("SELECT id FROM "+id_table).fetchall() 58 | ids = [i[0] for i in ids] 59 | id_set = set(ids) 60 | true_pair = [] 61 | false_pair = [] 62 | 63 | for my_id in tqdm(ids): 64 | q = cur.execute('SELECT project, file_name, function_name FROM functions WHERE id =?', (my_id,)) 65 | cfg_0_provenance = q.fetchone() 66 | id_sim = DataSplitter.select_similar_cfg(my_id, cfg_0_provenance, id_set, cur) 67 | id_dissim = DataSplitter.select_dissimilar_cfg(ids, cfg_0_provenance, cur) 68 | if id_sim is not None and id_dissim is not None: 69 | true_pair.append((my_id, id_sim)) 70 | false_pair.append((my_id, id_dissim)) 71 | 72 | true_pair = str(json.dumps(true_pair)) 73 | false_pair = str(json.dumps(false_pair)) 74 | 75 | cur.execute("INSERT INTO {} VALUES (?,?,?)".format(pairs_table), (epoch_number, true_pair, false_pair)) 76 | conn.commit() 77 | conn.close() 78 | 79 | def create_pairs(self, total_epochs): 80 | 81 | self.create_pair_table('train_pairs') 82 | self.create_pair_table('validation_pairs') 83 | self.create_pair_table('test_pairs') 84 | 85 | for i in range(0, total_epochs): 86 | print("Creating training pairs for epoch {} of {}".format(i, total_epochs)) 87 | self.create_epoch_pairs(i, 'train_pairs','train') 88 | 89 | print("Creating validation pairs") 90 | self.create_epoch_pairs(0, 'validation_pairs','validation') 91 | 92 
| print("Creating test pairs") 93 | self.create_epoch_pairs(0, "test_pairs",'test') 94 | 95 | 96 | @staticmethod 97 | def prepare_set(data_to_include, table_name, file_list, cur): 98 | i = 0 99 | while i < data_to_include and len(file_list) > 0: 100 | choice = random.choice(file_list) 101 | file_list.remove(choice) 102 | q = cur.execute("SELECT id FROM functions where project=? AND file_name=?", choice) 103 | data = q.fetchall() 104 | cur.executemany("INSERT INTO {} VALUES (?)".format(table_name), data) 105 | i += len(data) 106 | return file_list, i 107 | 108 | def split_data(self, validation_dim, test_dim): 109 | random.seed = 12345 110 | conn = sqlite3.connect(self.db_name) 111 | c = conn.cursor() 112 | 113 | q = c.execute('''SELECT project, file_name FROM functions ''') 114 | data = q.fetchall() 115 | conn.commit() 116 | 117 | num_data = len(data) 118 | num_test = int(num_data * test_dim) 119 | num_validation = int(num_data * validation_dim) 120 | 121 | filename = list(set(data)) 122 | 123 | c.execute("DROP TABLE IF EXISTS train") 124 | c.execute("DROP TABLE IF EXISTS test") 125 | c.execute("DROP TABLE IF EXISTS validation") 126 | 127 | c.execute("CREATE TABLE IF NOT EXISTS train (id INTEGER PRIMARY KEY)") 128 | c.execute("CREATE TABLE IF NOT EXISTS validation (id INTEGER PRIMARY KEY)") 129 | c.execute("CREATE TABLE IF NOT EXISTS test (id INTEGER PRIMARY KEY)") 130 | 131 | c.execute('''CREATE INDEX IF NOT EXISTS my_index ON functions(project, file_name, function_name)''') 132 | c.execute('''CREATE INDEX IF NOT EXISTS my_index_2 ON functions(project, file_name)''') 133 | 134 | filename, test_num = DataSplitter.prepare_set(num_test, 'test', filename, conn.cursor()) 135 | conn.commit() 136 | assert len(filename) > 0 137 | filename, val_num = self.prepare_set(num_validation, 'validation', filename, conn.cursor()) 138 | conn.commit() 139 | assert len(filename) > 0 140 | _, train_num = self.prepare_set(num_data - num_test - num_validation, 'train', filename, conn.cursor()) 141 | conn.commit() 142 | 143 | print("Train Size: {}".format(train_num)) 144 | print("Validation Size: {}".format(val_num)) 145 | print("Test Size: {}".format(test_num)) 146 | -------------------------------------------------------------------------------- /dataset_creation/DatabaseFactory.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | from FunctionAnalyzerRadare import RadareFunctionAnalyzer 7 | import json 8 | import multiprocessing 9 | from multiprocessing import Pool 10 | from multiprocessing.dummy import Pool as ThreadPool 11 | import os 12 | import random 13 | import signal 14 | import sqlite3 15 | from tqdm import tqdm 16 | from networkx.readwrite import json_graph 17 | 18 | 19 | class DatabaseFactory: 20 | 21 | def __init__(self, db_name, root_path): 22 | self.db_name = db_name 23 | self.root_path = root_path 24 | 25 | @staticmethod 26 | def worker(item): 27 | DatabaseFactory.analyze_file(item) 28 | return 0 29 | 30 | @staticmethod 31 | def extract_function(graph_analyzer): 32 | return graph_analyzer.extractAll() 33 | 34 | @staticmethod 35 | def to_jsongraph(graph): 36 | return json.dumps(json_graph.adjacency_data(graph)) 37 | 38 | @staticmethod 39 | def insert_in_db(db_name, pool_sem, func, filename, function_name): 40 | path = filename.split(os.sep) 41 | if len(path) < 4: 42 | return 43 | pool_sem.acquire() 44 | conn = 
sqlite3.connect(db_name) 45 | cfg = DatabaseFactory.to_jsongraph(func["cfg"]) 46 | cur = conn.cursor() 47 | cur.execute('''INSERT INTO functions VALUES (?,?,?,?,?,?,?)''', (None, # id 48 | path[-4], # project 49 | path[-3], # compiler 50 | path[-2], # optimization 51 | path[-1], # file_name 52 | function_name, # function_name 53 | cfg)) 54 | 55 | inserted_id = cur.lastrowid 56 | acfg = DatabaseFactory.to_jsongraph(func["acfg"]) 57 | lstm_cfg = DatabaseFactory.to_jsongraph(func["lstm_cfg"]) 58 | 59 | cur.execute('''INSERT INTO acfg VALUES (?,?)''', (inserted_id, acfg)) 60 | conn.commit() 61 | cur.execute('''INSERT INTO lstm_cfg VALUES (?,?)''', (inserted_id, lstm_cfg)) 62 | conn.commit() 63 | 64 | conn.close() 65 | pool_sem.release() 66 | 67 | @staticmethod 68 | def analyze_file(item): 69 | global pool_sem 70 | os.setpgrp() 71 | 72 | filename = item[0] 73 | db = item[1] 74 | use_symbol = item[2] 75 | 76 | analyzer = RadareFunctionAnalyzer(filename, use_symbol) 77 | p = ThreadPool(1) 78 | res = p.apply_async(analyzer.analyze) 79 | 80 | try: 81 | result = res.get(120) 82 | except multiprocessing.TimeoutError: 83 | print("Aborting due to timeout:" + str(filename)) 84 | print('Try to modify the timeout value in DatabaseFactory instruction result = res.get(TIMEOUT)') 85 | os.killpg(0, signal.SIGKILL) 86 | except Exception: 87 | print("Aborting due to error:" + str(filename)) 88 | os.killpg(0, signal.SIGKILL) 89 | 90 | for func in result: 91 | DatabaseFactory.insert_in_db(db, pool_sem, result[func], filename, func) 92 | 93 | analyzer.close() 94 | 95 | return 0 96 | 97 | # Create the db where data are stored 98 | def create_db(self): 99 | print('Database creation...') 100 | conn = sqlite3.connect(self.db_name) 101 | conn.execute(''' CREATE TABLE IF NOT EXISTS functions (id INTEGER PRIMARY KEY, 102 | project text, 103 | compiler text, 104 | optimization text, 105 | file_name text, 106 | function_name text, 107 | cfg text)''') 108 | 109 | conn.execute('''CREATE TABLE IF NOT EXISTS acfg (id INTEGER PRIMARY KEY, acfg text)''') 110 | conn.execute('''CREATE TABLE IF NOT EXISTS lstm_cfg (id INTEGER PRIMARY KEY, lstm_cfg text)''') 111 | 112 | conn.commit() 113 | conn.close() 114 | 115 | # Scan the root directory to find all the file to analyze, 116 | # query also the db for already analyzed files. 
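    # scan_for_file below walks the tree recursively and keeps only object files ending
    # in '.o'; the check against already-analyzed files is actually performed afterwards
    # by remove_override, which drops every path whose
    # (project, compiler, optimization, file_name) tuple is already in the functions table.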
117 | def scan_for_file(self, start): 118 | file_list = [] 119 | # Scan recursively all the subdirectory 120 | directories = os.listdir(start) 121 | for item in directories: 122 | item = os.path.join(start,item) 123 | if os.path.isdir(item): 124 | file_list.extend(self.scan_for_file(item + os.sep)) 125 | elif os.path.isfile(item) and item.endswith('.o'): 126 | file_list.append(item) 127 | return file_list 128 | 129 | # Looks for already existing files in the database 130 | # It returns a list of files that are not in the database 131 | def remove_override(self, file_list): 132 | conn = sqlite3.connect(self.db_name) 133 | cur = conn.cursor() 134 | q = cur.execute('''SELECT project, compiler, optimization, file_name FROM functions''') 135 | names = q.fetchall() 136 | names = [os.path.join(self.root_path, n[0], n[1], n[2], n[3]) for n in names] 137 | names = set(names) 138 | # If some files is already in the db remove it from the file list 139 | if len(names) > 0: 140 | print(str(len(names)) + ' Already in the database') 141 | cleaned_file_list = [] 142 | for f in file_list: 143 | if not(f in names): 144 | cleaned_file_list.append(f) 145 | 146 | return cleaned_file_list 147 | 148 | # root function to create the db 149 | def build_db(self, use_symbol): 150 | global pool_sem 151 | 152 | pool_sem = multiprocessing.BoundedSemaphore(value=1) 153 | 154 | self.create_db() 155 | file_list = self.scan_for_file(self.root_path) 156 | 157 | print('Found ' + str(len(file_list)) + ' during the scan') 158 | file_list = self.remove_override(file_list) 159 | print('Find ' + str(len(file_list)) + ' files to analyze') 160 | random.shuffle(file_list) 161 | 162 | t_args = [(f, self.db_name, use_symbol) for f in file_list] 163 | 164 | # Start a parallel pool to analyze files 165 | p = Pool(processes=None, maxtasksperchild=20) 166 | for _ in tqdm(p.imap_unordered(DatabaseFactory.worker, t_args), total=len(file_list)): 167 | pass 168 | 169 | p.close() 170 | p.join() 171 | 172 | 173 | -------------------------------------------------------------------------------- /dataset_creation/ExperimentUtil.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import argparse 7 | from dataset_creation import DatabaseFactory, DataSplitter 8 | 9 | def debug_msg(): 10 | msg = " DATABASE UTILITY" 11 | msg += "-------------------------------------------------\n" 12 | msg += "This program is an utility to save data into an sqlite database with SAFE \n\n" 13 | msg += "There are three main command: \n" 14 | msg += "BUILD: It create a db with two tables: functions, filtered_functions. \n" 15 | msg += " In the first table there are all the functions extracted from the executable with their hex code.\n" 16 | msg += " In the second table functions are converted to i2v representation. \n" 17 | msg += "SPLIT: Data are splitted into train validation and test set. 
" \ 18 | " Then it generate the pairs for the training of the network.\n" 19 | msg += "EMBEDD: Generate the embeddings of each function in the database using a trained SAFE model\n\n" 20 | msg += "If you want to train the network use build + split" 21 | msg += "If you want to create a knowledge base for the binary code search engine use build + embedd" 22 | msg += "This program has been written by the SAFE team.\n" 23 | msg += "-------------------------------------------------" 24 | return msg 25 | 26 | 27 | def build_configuration(db_name, root_dir, use_symbols): 28 | msg = "Database creation options: \n" 29 | msg += " - Database Name: {} \n".format(db_name) 30 | msg += " - Root dir: {} \n".format(root_dir) 31 | msg += " - Use symbols: {} \n".format(use_symbols) 32 | return msg 33 | 34 | 35 | def split_configuration(db_name, val_split, test_split, epochs): 36 | msg = "Splitting options: \n" 37 | msg += " - Database Name: {} \n".format(db_name) 38 | msg += " - Validation Size: {} \n".format(val_split) 39 | msg += " - Test Size: {} \n".format(test_split) 40 | msg += " - Epochs: {} \n".format(epochs) 41 | return msg 42 | 43 | 44 | if __name__ == '__main__': 45 | 46 | parser = argparse.ArgumentParser(description=debug_msg) 47 | 48 | parser.add_argument("-db", "--db", help="Name of the database to create", required=True) 49 | 50 | parser.add_argument("-b", "--build", help="Build db disassebling executables", action="store_true") 51 | parser.add_argument("-s", "--split", help="Perform data splitting for training", action="store_true") 52 | 53 | parser.add_argument("-dir", "--dir", help="Root path of the directory to scan") 54 | parser.add_argument("-sym", "--symbols", help="Use it if you want to use symbols", action="store_true") 55 | 56 | parser.add_argument("-test", "--test_size", help="Test set size [0-1]", type=float, default=0.2) 57 | parser.add_argument("-val", "--val_size", help="Validation set size [0-1]", type=float, default=0.2) 58 | parser.add_argument("-epo", "--epochs", help="# Epochs to generate pairs for", type=int, default=25) 59 | 60 | try: 61 | args = parser.parse_args() 62 | except: 63 | parser.print_help() 64 | print(debug_msg()) 65 | exit(0) 66 | 67 | if args.build: 68 | print("Disassemblying files and creating dataset") 69 | print(build_configuration(args.db, args.dir, args.symbols)) 70 | factory = DatabaseFactory.DatabaseFactory(args.db, args.dir) 71 | factory.build_db(args.symbols) 72 | 73 | if args.split: 74 | print("Splitting data and generating epoch pairs") 75 | print(split_configuration(args.db, args.val_size, args.test_size, args.epochs)) 76 | splitter = DataSplitter.DataSplitter(args.db) 77 | splitter.split_data(args.val_size, args.test_size) 78 | splitter.create_pairs(args.epochs) 79 | 80 | exit(0) 81 | -------------------------------------------------------------------------------- /dataset_creation/FunctionAnalyzerRadare.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import json 7 | import r2pipe 8 | import networkx as nx 9 | from dataset_creation.BlockFeaturesExtractor import BlockFeaturesExtractor 10 | 11 | 12 | class Dict2Obj(object): 13 | """ 14 | Turns a dictionary into a class 15 | """ 16 | 17 | # ---------------------------------------------------------------------- 18 | def __init__(self, dictionary): 19 | """Constructor""" 20 | for key in dictionary: 21 | 
--------------------------------------------------------------------------------
/dataset_creation/FunctionAnalyzerRadare.py:
--------------------------------------------------------------------------------
1 | # SAFE TEAM
2 | #
3 | #
4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) #
5 | #
6 | import json
7 | import r2pipe
8 | import networkx as nx
9 | from dataset_creation.BlockFeaturesExtractor import BlockFeaturesExtractor
10 | 
11 | 
12 | class Dict2Obj(object):
13 |     """
14 |     Turns a dictionary into an object whose attributes are the dictionary keys
15 |     """
16 | 
17 |     # ----------------------------------------------------------------------
18 |     def __init__(self, dictionary):
19 |         """Constructor"""
20 |         for key in dictionary:
21 |             setattr(self, key, dictionary[key])
22 | 
23 | class RadareFunctionAnalyzer:
24 | 
25 |     def __init__(self, filename, use_symbol):
26 |         self.r2 = r2pipe.open(filename, flags=['-2'])
27 |         self.filename = filename
28 |         self.arch, _ = self.get_arch()
29 |         self.use_symbol = use_symbol
30 | 
31 |     def __enter__(self):
32 |         return self
33 | 
34 |     @staticmethod
35 |     def filter_reg(op):
36 |         return op["value"]
37 | 
38 |     @staticmethod
39 |     def filter_imm(op):
40 |         imm = int(op["value"])
41 |         if -5000 <= imm <= 5000:
42 |             ret = hex(imm)
43 |         else:
44 |             ret = 'HIMM'
45 |         return ret
46 | 
47 |     @staticmethod
48 |     def filter_mem(op):
49 |         if "base" not in op:
50 |             op["base"] = 0
51 | 
52 |         if op["base"] == 0:
53 |             r = "[" + "MEM" + "]"
54 |         else:
55 |             reg_base = str(op["base"])
56 |             disp = str(op["disp"])
57 |             scale = str(op["scale"])
58 |             r = '[' + reg_base + "*" + scale + "+" + disp + ']'
59 |         return r
60 | 
61 |     @staticmethod
62 |     def filter_memory_references(i):
63 |         inst = "" + i["mnemonic"]
64 | 
65 |         for op in i["opex"]["operands"]:
66 |             if op["type"] == 'reg':
67 |                 inst += " " + RadareFunctionAnalyzer.filter_reg(op)
68 |             elif op["type"] == 'imm':
69 |                 inst += " " + RadareFunctionAnalyzer.filter_imm(op)
70 |             elif op["type"] == 'mem':
71 |                 inst += " " + RadareFunctionAnalyzer.filter_mem(op)
72 |             if len(i["opex"]["operands"]) > 1:
73 |                 inst = inst + ","
74 | 
75 |         if "," in inst:
76 |             inst = inst[:-1]
77 |         inst = inst.replace(" ", "_")
78 | 
79 |         return str(inst)
80 | 
81 |     @staticmethod
82 |     def get_callref(my_function, depth):
83 |         calls = {}
84 |         if 'callrefs' in my_function and depth > 0:
85 |             for cc in my_function['callrefs']:
86 |                 if cc["type"] == "C":
87 |                     calls[cc['at']] = cc['addr']
88 |         return calls
89 | 
90 | 
91 |     def process_instructions(self, instructions):
92 |         filtered_instructions = []
93 |         for insn in instructions:
94 |             #operands = []
95 |             if 'opex' not in insn:
96 |                 continue
97 |             #for op in insn['opex']['operands']:
98 |             #    operands.append(Dict2Obj(op))
99 |             #insn['operands'] = operands
100 |             stringized = RadareFunctionAnalyzer.filter_memory_references(insn)
101 |             if "x86" in self.arch:
102 |                 stringized = "X_" + stringized
103 |             elif "arm" in self.arch:
104 |                 stringized = "A_" + stringized
105 |             else:
106 |                 stringized = "UNK_" + stringized
107 |             filtered_instructions.append(stringized)
108 |         return filtered_instructions
109 | 
110 |     def process_block(self, block):
111 |         block_bytes = ""
112 |         disasm = []
113 |         for op in block['ops']:
114 |             if 'disasm' in op:
115 |                 disasm.append(op['disasm'])
116 |                 block_bytes += str(op['bytes'])
117 | 
118 |         self.r2.cmd("s " + str(block['offset']))
119 |         instructions = json.loads(self.r2.cmd("aoj " + str(len(block['ops']))))
120 |         string_addresses = [s['vaddr'] for s in json.loads(self.r2.cmd("izzj"))]
121 |         bfe = BlockFeaturesExtractor(self.arch, instructions, block['ops'], string_addresses)
122 |         annotations = bfe.getFeatures()
123 |         filtered_instructions = self.process_instructions(instructions)
124 | 
125 |         return disasm, block_bytes, annotations, filtered_instructions
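To make the instruction normalization above concrete, here is a small, self-contained illustration. The operand dictionary is fabricated to mimic the shape of radare2's aoj output; it is not data taken from the repository:

from dataset_creation.FunctionAnalyzerRadare import RadareFunctionAnalyzer

# A made-up x86 "mov rax, 7" in the dictionary shape the static filters expect.
sample = {
    "mnemonic": "mov",
    "opex": {"operands": [
        {"type": "reg", "value": "rax"},
        {"type": "imm", "value": 7},
    ]},
}

print(RadareFunctionAnalyzer.filter_memory_references(sample))
# -> mov_rax,_0x7
# Registers are kept, small immediates are rendered as hex, large ones become HIMM, and
# memory operands collapse to [MEM] or [base*scale+disp]; process_instructions then adds
# the X_/A_/UNK_ architecture prefix.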
126 | 
127 |     def function_to_cfg(self, func):
128 |         if self.use_symbol:
129 |             s = 'vaddr'
130 |         else:
131 |             s = 'offset'
132 | 
133 |         self.r2.cmd('s ' + str(func[s]))
134 |         try:
135 |             cfg = json.loads(self.r2.cmd('agfj ' + str(func[s])))
136 |         except Exception:
137 |             cfg = []
138 | 
139 |         my_cfg = nx.DiGraph()
140 |         acfg = nx.DiGraph()
141 |         lstm_cfg = nx.DiGraph()
142 | 
143 |         if len(cfg) == 0:
144 |             return my_cfg, acfg, lstm_cfg
145 |         else:
146 |             cfg = cfg[0]
147 | 
148 |         for block in cfg['blocks']:
149 |             disasm, block_bytes, annotations, filtered_instructions = self.process_block(block)
150 |             my_cfg.add_node(block['offset'], asm=block_bytes, label=disasm)
151 |             acfg.add_node(block['offset'], features=annotations)
152 |             lstm_cfg.add_node(block['offset'], features=filtered_instructions)
153 | 
154 |         for block in cfg['blocks']:
155 |             if 'jump' in block:
156 |                 if block['jump'] in my_cfg.nodes:
157 |                     my_cfg.add_edge(block['offset'], block['jump'])
158 |                     acfg.add_edge(block['offset'], block['jump'])
159 |                     lstm_cfg.add_edge(block['offset'], block['jump'])
160 |             if 'fail' in block:
161 |                 if block['fail'] in my_cfg.nodes:
162 |                     my_cfg.add_edge(block['offset'], block['fail'])
163 |                     acfg.add_edge(block['offset'], block['fail'])
164 |                     lstm_cfg.add_edge(block['offset'], block['fail'])
165 | 
166 |         between = nx.betweenness_centrality(acfg)
167 |         for n in acfg.nodes(data=True):
168 |             d = n[1]['features']
169 |             d['offspring'] = len(nx.descendants(acfg, n[0]))
170 |             d['betweenness'] = between[n[0]]
171 |             n[1]['features'] = d
172 | 
173 |         return my_cfg, acfg, lstm_cfg
174 | 
175 |     def get_arch(self):
176 |         arch = None
177 |         bits = None
178 |         try:
179 |             info = json.loads(self.r2.cmd('ij'))
180 |             if 'bin' in info:
181 |                 arch = info['bin']['arch']
182 |                 bits = info['bin']['bits']
183 |         except Exception:
184 |             print("Error loading file")
185 |         return arch, bits
186 | 
187 |     def find_functions(self):
188 |         self.r2.cmd('aaa')
189 |         try:
190 |             function_list = json.loads(self.r2.cmd('aflj'))
191 |         except Exception:
192 |             function_list = []
193 |         return function_list
194 | 
195 |     def find_functions_by_symbols(self):
196 |         self.r2.cmd('aa')
197 |         try:
198 |             symbols = json.loads(self.r2.cmd('isj'))
199 |             fcn_symb = [s for s in symbols if s['type'] == 'FUNC']
200 |         except Exception:
201 |             fcn_symb = []
202 |         return fcn_symb
203 | 
204 |     def analyze(self):
205 |         if self.use_symbol:
206 |             function_list = self.find_functions_by_symbols()
207 |         else:
208 |             function_list = self.find_functions()
209 | 
210 |         result = {}
211 |         for my_function in function_list:
212 |             if self.use_symbol:
213 |                 address = my_function['vaddr']
214 |             else:
215 |                 address = my_function['offset']
216 | 
217 |             try:
218 |                 cfg, acfg, lstm_cfg = self.function_to_cfg(my_function)
219 |                 result[my_function['name']] = {'cfg': cfg, "acfg": acfg, "lstm_cfg": lstm_cfg, "address": address}
220 |             except Exception:
221 |                 print("Error in function: {} from {}".format(my_function['name'], self.filename))
222 |                 pass
223 |         return result
224 | 
225 |     def close(self):
226 |         self.r2.quit()
227 | 
228 |     def __exit__(self, exc_type, exc_value, traceback):
229 |         self.r2.quit()
230 | 
231 | 
232 | 
233 | 
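A minimal usage sketch for the analyzer above. The target path is purely illustrative, and radare2 must be installed for r2pipe to work:

from dataset_creation.FunctionAnalyzerRadare import RadareFunctionAnalyzer

# Open a binary, lift every function to its three graph views, then release the r2 session.
with RadareFunctionAnalyzer("/usr/bin/true", use_symbol=False) as analyzer:
    functions = analyzer.analyze()
    for name, data in functions.items():
        # data holds the plain cfg, the annotated acfg and the instruction-level lstm_cfg.
        print(name, hex(data['address']), data['acfg'].number_of_nodes(), 'blocks')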
--------------------------------------------------------------------------------
/dataset_creation/__init__.py:
--------------------------------------------------------------------------------
1 | # SAFE TEAM
2 | #
3 | #
4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) #
5 | #
6 | 
7 | 
--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
1 | # SAFE TEAM
2 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode)
3 | 
4 | import argparse
5 | import os
6 | import sys
7 | from subprocess import call
8 | 
9 | class Downloader:
10 | 
11 |     def __init__(self):
12 |         parser = argparse.ArgumentParser(description='SAFE downloader')
13 | 
14 |         parser.add_argument("-i2v", "--i2v", dest="i2v", help="Download the i2v dictionary and embedding matrix",
15 |                             action="store_true",
16 |                             required=False)
17 | 
18 |         parser.add_argument("-op", "--openSSL", dest="openSSL",
19 |                             help="Download the OpenSSL dataset",
20 |                             action="store_true",
21 |                             required=False)
22 | 
23 |         parser.add_argument("-rc", "--restricted_compiler", dest="restricted_compiler",
24 |                             help="Download the restricted compiler dataset",
25 |                             action="store_true",
26 |                             required=False)
27 | 
28 |         parser.add_argument("-c", "--compiler", dest="compiler",
29 |                             help="Download the compiler dataset. Be careful, it is very large (about 30 GB).",
30 |                             action="store_true",
31 |                             required=False)
32 | 
33 |         args = parser.parse_args()
34 | 
35 |         self.i2v = args.i2v
36 |         self.openSSL = args.openSSL
37 |         self.restricted_compiler = args.restricted_compiler
38 |         self.compiler = args.compiler
39 | 
40 |         if not (self.i2v or self.openSSL or self.restricted_compiler or self.compiler):
41 |             parser.print_help(sys.__stdout__)
42 | 
43 |         self.url_i2v = "https://drive.google.com/file/d/1ndKVrot5lBPklGGFn-olEt-rCtzjv69z/view?usp=sharing"
44 |         self.url_openSSL = "https://drive.google.com/file/d/1NnC4qCtZUDdb32Yfeq2toa94jvCKTBxZ/view?usp=sharing"
45 |         self.url_restricted_compiler = "https://drive.google.com/file/d/15VUJ3iwj5VHCqAXiUcr4zJgVWSCbaU_d/view?usp=sharing"
46 |         self.url_compiler = "https://drive.google.com/file/d/1fEr9N97fTsAS2NXYpYI3GRTxadaJwhTe/view?usp=sharing"
47 | 
48 |         self.base_path = "data"
49 |         self.path_i2v = os.path.join(self.base_path, "")
50 |         self.path_openSSL = os.path.join(self.base_path, "")
51 |         self.path_restricted_compiler = os.path.join(self.base_path, "")
52 |         self.path_compiler = os.path.join(self.base_path, "")
53 | 
54 |         self.i2v_compress_name = 'i2v.tar.bz2'
55 |         self.openSSL_compress_name = 'openSSL_dataset.tar.bz2'
56 |         self.restricted_compiler_compress_name = 'restricted_compiler_dataset.tar.bz2'
57 |         self.compiler_compress_name = 'compiler_dataset.bz2'
58 | 
59 | 
60 |     @staticmethod
61 |     def download_file(id, path):
62 |         try:
63 |             print("Downloading from " + str(id) + " into " + str(path))
64 |             call(['./godown.pl', id, path])
65 |         except Exception as e:
66 |             print("Error downloading file at url: " + str(id))
67 |             print(e)
68 | 
69 |     @staticmethod
70 |     def decompress_file(file_src, file_path):
71 |         try:
72 |             call(['tar', '-xvf', file_src, '-C', file_path])
73 |         except Exception as e:
74 |             print("Error decompressing file: " + str(file_src))
75 |             print('You need the tar command with bzip2 support')
76 |             print(e)
77 | 
78 |     def download(self):
79 |         print('Making the godown.pl script executable, thanks to: https://github.com/circulosmeos/gdown.pl')
80 |         call(['chmod', '+x', 'godown.pl'])
81 |         print("SAFE --- downloading models")
82 | 
83 |         if self.i2v:
84 |             print("Downloading the i2v model... in the folder data/i2v/")
85 |             if not os.path.exists(self.path_i2v):
86 |                 os.makedirs(self.path_i2v)
87 |             Downloader.download_file(self.url_i2v, os.path.join(self.path_i2v, self.i2v_compress_name))
88 |             print("Decompressing i2v model and placing it in " + str(self.path_i2v))
89 |             Downloader.decompress_file(os.path.join(self.path_i2v, self.i2v_compress_name), self.path_i2v)
90 | 
91 |         if self.openSSL:
92 |             print("Downloading the OpenSSL dataset... in the folder data")
93 |             if not os.path.exists(self.path_openSSL):
94 |                 os.makedirs(self.path_openSSL)
95 |             Downloader.download_file(self.url_openSSL, os.path.join(self.path_openSSL, self.openSSL_compress_name))
96 |             print("Decompressing OpenSSL dataset and placing it in " + str(self.path_openSSL))
97 |             Downloader.decompress_file(os.path.join(self.path_openSSL, self.openSSL_compress_name), self.path_openSSL)
98 | 
99 |         if self.restricted_compiler:
100 |             print("Downloading the restricted compiler dataset... in the folder data")
101 |             if not os.path.exists(self.path_restricted_compiler):
102 |                 os.makedirs(self.path_restricted_compiler)
103 |             Downloader.download_file(self.url_restricted_compiler, os.path.join(self.path_restricted_compiler, self.restricted_compiler_compress_name))
104 |             print("Decompressing restricted compiler dataset and placing it in " + str(self.path_restricted_compiler))
105 |             Downloader.decompress_file(os.path.join(self.path_restricted_compiler, self.restricted_compiler_compress_name), self.path_restricted_compiler)
106 | 
107 |         if self.compiler:
108 |             print("Downloading the compiler dataset... in the folder data")
109 |             if not os.path.exists(self.path_compiler):
110 |                 os.makedirs(self.path_compiler)
111 |             Downloader.download_file(self.url_compiler, os.path.join(self.path_compiler, self.compiler_compress_name))
112 |             print("Decompressing compiler dataset and placing it in " + str(self.path_compiler))
113 |             Downloader.decompress_file(os.path.join(self.path_compiler, self.compiler_compress_name), self.path_compiler)
114 | 
115 | 
116 | if __name__ == '__main__':
117 |     a = Downloader()
118 |     a.download()
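The downloader is meant to be driven from the command line (for example "python downloader.py --i2v"); since the flags are parsed inside the constructor, an equivalent programmatic call is just a matter of setting sys.argv first. A minimal sketch, assuming the working directory is the repository root so that ./godown.pl is found:

import sys

# Simulate "python downloader.py --i2v"; Downloader() reads the flags from sys.argv.
sys.argv = ["downloader.py", "--i2v"]
d = Downloader()
d.download()   # fetches i2v.tar.bz2 through godown.pl and unpacks it under data/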
--------------------------------------------------------------------------------
/godown.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # Google Drive direct download of big files
4 | # ./gdown.pl 'gdrive file url' ['desired file name']
5 | #
6 | # v1.0 by circulosmeos 04-2014.
7 | # v1.1 by circulosmeos 01-2017.
8 | # http://circulosmeos.wordpress.com/2014/04/12/google-drive-direct-download-of-big-files
9 | # Distributed under GPL 3 (http://www.gnu.org/licenses/gpl-3.0.html)
10 | #
11 | use strict;
12 | 
13 | my $TEMP='gdown.cookie.temp';
14 | my $COMMAND;
15 | my $confirm;
16 | my $check;
17 | sub execute_command();
18 | 
19 | my $URL=shift;
20 | die "\n./gdown.pl 'gdrive file url' [desired file name]\n\n" if $URL eq '';
21 | 
22 | my $FILENAME=shift;
23 | $FILENAME='gdown' if $FILENAME eq '';
24 | 
25 | if ($URL=~m#^https?://drive.google.com/file/d/([^/]+)#) {
26 |     $URL="https://docs.google.com/uc?id=$1&export=download";
27 | }
28 | 
29 | execute_command();
30 | 
31 | while (-s $FILENAME < 100000) { # loop until the real file (not an HTML page) has been downloaded
32 |     open fFILENAME, '<', $FILENAME;
33 |     $check=0;
34 |     foreach (<fFILENAME>) {
35 |         if (/href="(\/uc\?export=download[^"]+)/) {
36 |             $URL='https://docs.google.com'.$1;
37 |             $URL=~s/&amp;/&/g;
38 |             $confirm='';
39 |             $check=1;
40 |             last;
41 |         }
42 |         if (/confirm=([^;&]+)/) {
43 |             $confirm=$1;
44 |             $check=1;
45 |             last;
46 |         }
47 |         if (/"downloadUrl":"([^"]+)/) {
48 |             $URL=$1;
49 |             $URL=~s/\\u003d/=/g;
50 |             $URL=~s/\\u0026/&/g;
51 |             $confirm='';
52 |             $check=1;
53 |             last;
54 |         }
55 |     }
56 |     close fFILENAME;
57 |     die "Couldn't download the file :-(\n" if ($check==0);
58 |     $URL=~s/confirm=([^;&]+)/confirm=$confirm/ if $confirm ne '';
59 | 
60 |     execute_command();
61 | }
62 | 
63 | unlink $TEMP;
64 | 
65 | sub execute_command() {
66 |     $COMMAND="wget --no-check-certificate --load-cookie $TEMP --save-cookie $TEMP \"$URL\"";
67 |     $COMMAND.=" -O \"$FILENAME\"" if $FILENAME ne '';
68 |     `$COMMAND`;
69 |     return 1;
70 | }
71 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | tensorflow
3 | networkx
4 | tqdm
5 | matplotlib
6 | sklearn
7 | r2pipe
8 | 
--------------------------------------------------------------------------------