├── Gemfile
├── Gemfile.lock
├── LICENSE
├── README.md
├── asm_embedding
│   ├── FunctionNormalizer.py
│   ├── InstructionsConverter.py
│   └── __init__.py
├── binary_similarity
│   ├── PairFactory.py
│   ├── __init__.py
│   ├── parameters.py
│   ├── s2v_network.py
│   ├── s2v_network_arith_mean.py
│   ├── s2v_network_attention_mean.py
│   ├── s2v_network_rnn.py
│   ├── s2v_trainer.py
│   ├── train.py
│   ├── train.sh
│   └── utils.py
├── compiler_provenance
│   ├── FunctionFactory.py
│   ├── __init__.py
│   ├── parameters.py
│   ├── s2v_classification_network_annotations.py
│   ├── s2v_classification_network_arith_mean.py
│   ├── s2v_classification_network_attention_mean.py
│   ├── s2v_classification_network_rnn.py
│   ├── s2v_trainer.py
│   ├── train.py
│   ├── train.sh
│   └── utils.py
├── dataset_creation
│   ├── BlockFeaturesExtractor.py
│   ├── DataSplitter.py
│   ├── DatabaseFactory.py
│   ├── ExperimentUtil.py
│   ├── FunctionAnalyzerRadare.py
│   └── __init__.py
├── downloader.py
├── godown.pl
└── requirements.txt

/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | # Hello! This is where you manage which Jekyll version is used to run. 4 | # When you want to use a different version, change it below, save the 5 | # file and run `bundle install`. Run Jekyll with `bundle exec`, like so: 6 | # 7 | # bundle exec jekyll serve 8 | # 9 | # This will help ensure the proper Jekyll version is running. 10 | # Happy Jekylling! 11 | gem "jekyll", "~> 3.7.4" 12 | 13 | # This is the default theme for new Jekyll sites. You may change this to anything you like. 14 | gem "minima", "~> 2.0" 15 | 16 | # If you want to use GitHub Pages, remove the "gem "jekyll"" above and 17 | # uncomment the line below. To upgrade, run `bundle update github-pages`. 18 | # gem "github-pages", group: :jekyll_plugins 19 | #gem "github-pages", group: :jekyll_plugins 20 | 21 | # If you have any plugins, put them here! 22 | group :jekyll_plugins do 23 | gem "jekyll-feed", "~> 0.6" 24 | end 25 | 26 | # Windows does not include zoneinfo files, so bundle the tzinfo-data gem 27 | gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby] 28 | 29 | # Performance-booster for watching directories on Windows 30 | gem "wdm", "~> 0.1.0" if Gem.win_platform? 
31 | 32 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (4.2.10) 5 | i18n (~> 0.7) 6 | minitest (~> 5.1) 7 | thread_safe (~> 0.3, >= 0.3.4) 8 | tzinfo (~> 1.1) 9 | addressable (2.5.2) 10 | public_suffix (>= 2.0.2, < 4.0) 11 | coffee-script (2.4.1) 12 | coffee-script-source 13 | execjs 14 | coffee-script-source (1.11.1) 15 | colorator (1.1.0) 16 | commonmarker (0.17.13) 17 | ruby-enum (~> 0.5) 18 | concurrent-ruby (1.1.3) 19 | dnsruby (1.61.2) 20 | addressable (~> 2.5) 21 | em-websocket (0.5.1) 22 | eventmachine (>= 0.12.9) 23 | http_parser.rb (~> 0.6.0) 24 | ethon (0.11.0) 25 | ffi (>= 1.3.0) 26 | eventmachine (1.2.7) 27 | execjs (2.7.0) 28 | faraday (0.15.3) 29 | multipart-post (>= 1.2, < 3) 30 | ffi (1.9.25) 31 | forwardable-extended (2.6.0) 32 | gemoji (3.0.0) 33 | github-pages (193) 34 | activesupport (= 4.2.10) 35 | github-pages-health-check (= 1.8.1) 36 | jekyll (= 3.7.4) 37 | jekyll-avatar (= 0.6.0) 38 | jekyll-coffeescript (= 1.1.1) 39 | jekyll-commonmark-ghpages (= 0.1.5) 40 | jekyll-default-layout (= 0.1.4) 41 | jekyll-feed (= 0.11.0) 42 | jekyll-gist (= 1.5.0) 43 | jekyll-github-metadata (= 2.9.4) 44 | jekyll-mentions (= 1.4.1) 45 | jekyll-optional-front-matter (= 0.3.0) 46 | jekyll-paginate (= 1.1.0) 47 | jekyll-readme-index (= 0.2.0) 48 | jekyll-redirect-from (= 0.14.0) 49 | jekyll-relative-links (= 0.5.3) 50 | jekyll-remote-theme (= 0.3.1) 51 | jekyll-sass-converter (= 1.5.2) 52 | jekyll-seo-tag (= 2.5.0) 53 | jekyll-sitemap (= 1.2.0) 54 | jekyll-swiss (= 0.4.0) 55 | jekyll-theme-architect (= 0.1.1) 56 | jekyll-theme-cayman (= 0.1.1) 57 | jekyll-theme-dinky (= 0.1.1) 58 | jekyll-theme-hacker (= 0.1.1) 59 | jekyll-theme-leap-day (= 0.1.1) 60 | jekyll-theme-merlot (= 0.1.1) 61 | jekyll-theme-midnight (= 0.1.1) 62 | jekyll-theme-minimal (= 0.1.1) 63 | jekyll-theme-modernist (= 0.1.1) 64 | jekyll-theme-primer (= 0.5.3) 65 | jekyll-theme-slate (= 0.1.1) 66 | jekyll-theme-tactile (= 0.1.1) 67 | jekyll-theme-time-machine (= 0.1.1) 68 | jekyll-titles-from-headings (= 0.5.1) 69 | jemoji (= 0.10.1) 70 | kramdown (= 1.17.0) 71 | liquid (= 4.0.0) 72 | listen (= 3.1.5) 73 | mercenary (~> 0.3) 74 | minima (= 2.5.0) 75 | nokogiri (>= 1.8.2, < 2.0) 76 | rouge (= 2.2.1) 77 | terminal-table (~> 1.4) 78 | github-pages-health-check (1.8.1) 79 | addressable (~> 2.3) 80 | dnsruby (~> 1.60) 81 | octokit (~> 4.0) 82 | public_suffix (~> 2.0) 83 | typhoeus (~> 1.3) 84 | html-pipeline (2.9.1) 85 | activesupport (>= 2) 86 | nokogiri (>= 1.4) 87 | http_parser.rb (0.6.0) 88 | i18n (0.9.5) 89 | concurrent-ruby (~> 1.0) 90 | jekyll (3.7.4) 91 | addressable (~> 2.4) 92 | colorator (~> 1.0) 93 | em-websocket (~> 0.5) 94 | i18n (~> 0.7) 95 | jekyll-sass-converter (~> 1.0) 96 | jekyll-watch (~> 2.0) 97 | kramdown (~> 1.14) 98 | liquid (~> 4.0) 99 | mercenary (~> 0.3.3) 100 | pathutil (~> 0.9) 101 | rouge (>= 1.7, < 4) 102 | safe_yaml (~> 1.0) 103 | jekyll-avatar (0.6.0) 104 | jekyll (~> 3.0) 105 | jekyll-coffeescript (1.1.1) 106 | coffee-script (~> 2.2) 107 | coffee-script-source (~> 1.11.1) 108 | jekyll-commonmark (1.2.0) 109 | commonmarker (~> 0.14) 110 | jekyll (>= 3.0, < 4.0) 111 | jekyll-commonmark-ghpages (0.1.5) 112 | commonmarker (~> 0.17.6) 113 | jekyll-commonmark (~> 1) 114 | rouge (~> 2) 115 | jekyll-default-layout (0.1.4) 116 | jekyll (~> 3.0) 117 | jekyll-feed (0.11.0) 118 | jekyll (~> 3.3) 119 | 
jekyll-gist (1.5.0) 120 | octokit (~> 4.2) 121 | jekyll-github-metadata (2.9.4) 122 | jekyll (~> 3.1) 123 | octokit (~> 4.0, != 4.4.0) 124 | jekyll-mentions (1.4.1) 125 | html-pipeline (~> 2.3) 126 | jekyll (~> 3.0) 127 | jekyll-optional-front-matter (0.3.0) 128 | jekyll (~> 3.0) 129 | jekyll-paginate (1.1.0) 130 | jekyll-readme-index (0.2.0) 131 | jekyll (~> 3.0) 132 | jekyll-redirect-from (0.14.0) 133 | jekyll (~> 3.3) 134 | jekyll-relative-links (0.5.3) 135 | jekyll (~> 3.3) 136 | jekyll-remote-theme (0.3.1) 137 | jekyll (~> 3.5) 138 | rubyzip (>= 1.2.1, < 3.0) 139 | jekyll-sass-converter (1.5.2) 140 | sass (~> 3.4) 141 | jekyll-seo-tag (2.5.0) 142 | jekyll (~> 3.3) 143 | jekyll-sitemap (1.2.0) 144 | jekyll (~> 3.3) 145 | jekyll-swiss (0.4.0) 146 | jekyll-theme-architect (0.1.1) 147 | jekyll (~> 3.5) 148 | jekyll-seo-tag (~> 2.0) 149 | jekyll-theme-cayman (0.1.1) 150 | jekyll (~> 3.5) 151 | jekyll-seo-tag (~> 2.0) 152 | jekyll-theme-dinky (0.1.1) 153 | jekyll (~> 3.5) 154 | jekyll-seo-tag (~> 2.0) 155 | jekyll-theme-hacker (0.1.1) 156 | jekyll (~> 3.5) 157 | jekyll-seo-tag (~> 2.0) 158 | jekyll-theme-leap-day (0.1.1) 159 | jekyll (~> 3.5) 160 | jekyll-seo-tag (~> 2.0) 161 | jekyll-theme-merlot (0.1.1) 162 | jekyll (~> 3.5) 163 | jekyll-seo-tag (~> 2.0) 164 | jekyll-theme-midnight (0.1.1) 165 | jekyll (~> 3.5) 166 | jekyll-seo-tag (~> 2.0) 167 | jekyll-theme-minimal (0.1.1) 168 | jekyll (~> 3.5) 169 | jekyll-seo-tag (~> 2.0) 170 | jekyll-theme-modernist (0.1.1) 171 | jekyll (~> 3.5) 172 | jekyll-seo-tag (~> 2.0) 173 | jekyll-theme-primer (0.5.3) 174 | jekyll (~> 3.5) 175 | jekyll-github-metadata (~> 2.9) 176 | jekyll-seo-tag (~> 2.0) 177 | jekyll-theme-slate (0.1.1) 178 | jekyll (~> 3.5) 179 | jekyll-seo-tag (~> 2.0) 180 | jekyll-theme-tactile (0.1.1) 181 | jekyll (~> 3.5) 182 | jekyll-seo-tag (~> 2.0) 183 | jekyll-theme-time-machine (0.1.1) 184 | jekyll (~> 3.5) 185 | jekyll-seo-tag (~> 2.0) 186 | jekyll-titles-from-headings (0.5.1) 187 | jekyll (~> 3.3) 188 | jekyll-watch (2.1.2) 189 | listen (~> 3.0) 190 | jemoji (0.10.1) 191 | gemoji (~> 3.0) 192 | html-pipeline (~> 2.2) 193 | jekyll (~> 3.0) 194 | kramdown (1.17.0) 195 | liquid (4.0.0) 196 | listen (3.1.5) 197 | rb-fsevent (~> 0.9, >= 0.9.4) 198 | rb-inotify (~> 0.9, >= 0.9.7) 199 | ruby_dep (~> 1.2) 200 | mercenary (0.3.6) 201 | mini_portile2 (2.3.0) 202 | minima (2.5.0) 203 | jekyll (~> 3.5) 204 | jekyll-feed (~> 0.9) 205 | jekyll-seo-tag (~> 2.1) 206 | minitest (5.11.3) 207 | multipart-post (2.0.0) 208 | nokogiri (1.8.5) 209 | mini_portile2 (~> 2.3.0) 210 | octokit (4.13.0) 211 | sawyer (~> 0.8.0, >= 0.5.3) 212 | pathutil (0.16.2) 213 | forwardable-extended (~> 2.6) 214 | public_suffix (2.0.5) 215 | rb-fsevent (0.10.3) 216 | rb-inotify (0.9.10) 217 | ffi (>= 0.5.0, < 2) 218 | rouge (2.2.1) 219 | ruby-enum (0.7.2) 220 | i18n 221 | ruby_dep (1.5.0) 222 | rubyzip (1.2.2) 223 | safe_yaml (1.0.4) 224 | sass (3.7.2) 225 | sass-listen (~> 4.0.0) 226 | sass-listen (4.0.0) 227 | rb-fsevent (~> 0.9, >= 0.9.4) 228 | rb-inotify (~> 0.9, >= 0.9.7) 229 | sawyer (0.8.1) 230 | addressable (>= 2.3.5, < 2.6) 231 | faraday (~> 0.8, < 1.0) 232 | terminal-table (1.8.0) 233 | unicode-display_width (~> 1.1, >= 1.1.1) 234 | thread_safe (0.3.6) 235 | typhoeus (1.3.1) 236 | ethon (>= 0.9.0) 237 | tzinfo (1.2.5) 238 | thread_safe (~> 0.1) 239 | unicode-display_width (1.4.0) 240 | 241 | PLATFORMS 242 | ruby 243 | 244 | DEPENDENCIES 245 | github-pages 246 | jekyll (~> 3.7.4) 247 | jekyll-feed (~> 0.6) 248 | minima (~> 2.0) 249 | tzinfo-data 250 | 251 | 
BUNDLED WITH 252 | 1.17.1 253 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 2 | Public License 3 | 4 | By exercising the Licensed Rights (defined below), You accept and agree 5 | to be bound by the terms and conditions of this Creative Commons 6 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 7 | ("Public License"). To the extent this Public License may be 8 | interpreted as a contract, You are granted the Licensed Rights in 9 | consideration of Your acceptance of these terms and conditions, and the 10 | Licensor grants You such rights in consideration of benefits the 11 | Licensor receives from making the Licensed Material available under 12 | these terms and conditions. 13 | 14 | 15 | Section 1 -- Definitions. 16 | 17 | a. Adapted Material means material subject to Copyright and Similar 18 | Rights that is derived from or based upon the Licensed Material 19 | and in which the Licensed Material is translated, altered, 20 | arranged, transformed, or otherwise modified in a manner requiring 21 | permission under the Copyright and Similar Rights held by the 22 | Licensor. For purposes of this Public License, where the Licensed 23 | Material is a musical work, performance, or sound recording, 24 | Adapted Material is always produced where the Licensed Material is 25 | synched in timed relation with a moving image. 26 | 27 | b. Adapter's License means the license You apply to Your Copyright 28 | and Similar Rights in Your contributions to Adapted Material in 29 | accordance with the terms and conditions of this Public License. 30 | 31 | c. BY-NC-SA Compatible License means a license listed at 32 | creativecommons.org/compatiblelicenses, approved by Creative 33 | Commons as essentially the equivalent of this Public License. 34 | 35 | d. Copyright and Similar Rights means copyright and/or similar rights 36 | closely related to copyright including, without limitation, 37 | performance, broadcast, sound recording, and Sui Generis Database 38 | Rights, without regard to how the rights are labeled or 39 | categorized. For purposes of this Public License, the rights 40 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 41 | Rights. 42 | 43 | e. Effective Technological Measures means those measures that, in the 44 | absence of proper authority, may not be circumvented under laws 45 | fulfilling obligations under Article 11 of the WIPO Copyright 46 | Treaty adopted on December 20, 1996, and/or similar international 47 | agreements. 48 | 49 | f. Exceptions and Limitations means fair use, fair dealing, and/or 50 | any other exception or limitation to Copyright and Similar Rights 51 | that applies to Your use of the Licensed Material. 52 | 53 | g. License Elements means the license attributes listed in the name 54 | of a Creative Commons Public License. The License Elements of this 55 | Public License are Attribution, NonCommercial, and ShareAlike. 56 | 57 | h. Licensed Material means the artistic or literary work, database, 58 | or other material to which the Licensor applied this Public 59 | License. 60 | 61 | i. 
Licensed Rights means the rights granted to You subject to the 62 | terms and conditions of this Public License, which are limited to 63 | all Copyright and Similar Rights that apply to Your use of the 64 | Licensed Material and that the Licensor has authority to license. 65 | 66 | j. Licensor means the individual(s) or entity(ies) granting rights 67 | under this Public License. 68 | 69 | k. NonCommercial means not primarily intended for or directed towards 70 | commercial advantage or monetary compensation. For purposes of 71 | this Public License, the exchange of the Licensed Material for 72 | other material subject to Copyright and Similar Rights by digital 73 | file-sharing or similar means is NonCommercial provided there is 74 | no payment of monetary compensation in connection with the 75 | exchange. 76 | 77 | l. Share means to provide material to the public by any means or 78 | process that requires permission under the Licensed Rights, such 79 | as reproduction, public display, public performance, distribution, 80 | dissemination, communication, or importation, and to make material 81 | available to the public including in ways that members of the 82 | public may access the material from a place and at a time 83 | individually chosen by them. 84 | 85 | m. Sui Generis Database Rights means rights other than copyright 86 | resulting from Directive 96/9/EC of the European Parliament and of 87 | the Council of 11 March 1996 on the legal protection of databases, 88 | as amended and/or succeeded, as well as other essentially 89 | equivalent rights anywhere in the world. 90 | 91 | n. You means the individual or entity exercising the Licensed Rights 92 | under this Public License. Your has a corresponding meaning. 93 | 94 | 95 | Section 2 -- Scope. 96 | 97 | a. License grant. 98 | 99 | 1. Subject to the terms and conditions of this Public License, 100 | the Licensor hereby grants You a worldwide, royalty-free, 101 | non-sublicensable, non-exclusive, irrevocable license to 102 | exercise the Licensed Rights in the Licensed Material to: 103 | 104 | a. reproduce and Share the Licensed Material, in whole or 105 | in part, for NonCommercial purposes only; and 106 | 107 | b. produce, reproduce, and Share Adapted Material for 108 | NonCommercial purposes only. 109 | 110 | 2. Exceptions and Limitations. For the avoidance of doubt, where 111 | Exceptions and Limitations apply to Your use, this Public 112 | License does not apply, and You do not need to comply with 113 | its terms and conditions. 114 | 115 | 3. Term. The term of this Public License is specified in Section 116 | 6(a). 117 | 118 | 4. Media and formats; technical modifications allowed. The 119 | Licensor authorizes You to exercise the Licensed Rights in 120 | all media and formats whether now known or hereafter created, 121 | and to make technical modifications necessary to do so. The 122 | Licensor waives and/or agrees not to assert any right or 123 | authority to forbid You from making technical modifications 124 | necessary to exercise the Licensed Rights, including 125 | technical modifications necessary to circumvent Effective 126 | Technological Measures. For purposes of this Public License, 127 | simply making modifications authorized by this Section 2(a) 128 | (4) never produces Adapted Material. 129 | 130 | 5. Downstream recipients. 131 | 132 | a. Offer from the Licensor -- Licensed Material. 
Every 133 | recipient of the Licensed Material automatically 134 | receives an offer from the Licensor to exercise the 135 | Licensed Rights under the terms and conditions of this 136 | Public License. 137 | 138 | b. Additional offer from the Licensor -- Adapted Material. 139 | Every recipient of Adapted Material from You 140 | automatically receives an offer from the Licensor to 141 | exercise the Licensed Rights in the Adapted Material 142 | under the conditions of the Adapter's License You apply. 143 | 144 | c. No downstream restrictions. You may not offer or impose 145 | any additional or different terms or conditions on, or 146 | apply any Effective Technological Measures to, the 147 | Licensed Material if doing so restricts exercise of the 148 | Licensed Rights by any recipient of the Licensed 149 | Material. 150 | 151 | 6. No endorsement. Nothing in this Public License constitutes or 152 | may be construed as permission to assert or imply that You 153 | are, or that Your use of the Licensed Material is, connected 154 | with, or sponsored, endorsed, or granted official status by, 155 | the Licensor or others designated to receive attribution as 156 | provided in Section 3(a)(1)(A)(i). 157 | 158 | b. Other rights. 159 | 160 | 1. Moral rights, such as the right of integrity, are not 161 | licensed under this Public License, nor are publicity, 162 | privacy, and/or other similar personality rights; however, to 163 | the extent possible, the Licensor waives and/or agrees not to 164 | assert any such rights held by the Licensor to the limited 165 | extent necessary to allow You to exercise the Licensed 166 | Rights, but not otherwise. 167 | 168 | 2. Patent and trademark rights are not licensed under this 169 | Public License. 170 | 171 | 3. To the extent possible, the Licensor waives any right to 172 | collect royalties from You for the exercise of the Licensed 173 | Rights, whether directly or through a collecting society 174 | under any voluntary or waivable statutory or compulsory 175 | licensing scheme. In all other cases the Licensor expressly 176 | reserves any right to collect such royalties, including when 177 | the Licensed Material is used other than for NonCommercial 178 | purposes. 179 | 180 | 181 | Section 3 -- License Conditions. 182 | 183 | Your exercise of the Licensed Rights is expressly made subject to the 184 | following conditions. 185 | 186 | a. Attribution. 187 | 188 | 1. If You Share the Licensed Material (including in modified 189 | form), You must: 190 | 191 | a. retain the following if it is supplied by the Licensor 192 | with the Licensed Material: 193 | 194 | i. identification of the creator(s) of the Licensed 195 | Material and any others designated to receive 196 | attribution, in any reasonable manner requested by 197 | the Licensor (including by pseudonym if 198 | designated); 199 | 200 | ii. a copyright notice; 201 | 202 | iii. a notice that refers to this Public License; 203 | 204 | iv. a notice that refers to the disclaimer of 205 | warranties; 206 | 207 | v. a URI or hyperlink to the Licensed Material to the 208 | extent reasonably practicable; 209 | 210 | b. indicate if You modified the Licensed Material and 211 | retain an indication of any previous modifications; and 212 | 213 | c. indicate the Licensed Material is licensed under this 214 | Public License, and include the text of, or the URI or 215 | hyperlink to, this Public License. 216 | 217 | 2. 
You may satisfy the conditions in Section 3(a)(1) in any 218 | reasonable manner based on the medium, means, and context in 219 | which You Share the Licensed Material. For example, it may be 220 | reasonable to satisfy the conditions by providing a URI or 221 | hyperlink to a resource that includes the required 222 | information. 223 | 3. If requested by the Licensor, You must remove any of the 224 | information required by Section 3(a)(1)(A) to the extent 225 | reasonably practicable. 226 | 227 | b. ShareAlike. 228 | 229 | In addition to the conditions in Section 3(a), if You Share 230 | Adapted Material You produce, the following conditions also apply. 231 | 232 | 1. The Adapter's License You apply must be a Creative Commons 233 | license with the same License Elements, this version or 234 | later, or a BY-NC-SA Compatible License. 235 | 236 | 2. You must include the text of, or the URI or hyperlink to, the 237 | Adapter's License You apply. You may satisfy this condition 238 | in any reasonable manner based on the medium, means, and 239 | context in which You Share Adapted Material. 240 | 241 | 3. You may not offer or impose any additional or different terms 242 | or conditions on, or apply any Effective Technological 243 | Measures to, Adapted Material that restrict exercise of the 244 | rights granted under the Adapter's License You apply. 245 | 246 | 247 | Section 4 -- Sui Generis Database Rights. 248 | 249 | Where the Licensed Rights include Sui Generis Database Rights that 250 | apply to Your use of the Licensed Material: 251 | 252 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 253 | to extract, reuse, reproduce, and Share all or a substantial 254 | portion of the contents of the database for NonCommercial purposes 255 | only; 256 | 257 | b. if You include all or a substantial portion of the database 258 | contents in a database in which You have Sui Generis Database 259 | Rights, then the database in which You have Sui Generis Database 260 | Rights (but not its individual contents) is Adapted Material, 261 | including for purposes of Section 3(b); and 262 | 263 | c. You must comply with the conditions in Section 3(a) if You Share 264 | all or a substantial portion of the contents of the database. 265 | 266 | For the avoidance of doubt, this Section 4 supplements and does not 267 | replace Your obligations under this Public License where the Licensed 268 | Rights include other Copyright and Similar Rights. 269 | 270 | 271 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 272 | 273 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 274 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 275 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 276 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 277 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 278 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 279 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 280 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 281 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 282 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 283 | 284 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 285 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 286 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 287 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 288 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 289 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 290 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 291 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 292 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 293 | 294 | c. The disclaimer of warranties and limitation of liability provided 295 | above shall be interpreted in a manner that, to the extent 296 | possible, most closely approximates an absolute disclaimer and 297 | waiver of all liability. 298 | 299 | 300 | Section 6 -- Term and Termination. 301 | 302 | a. This Public License applies for the term of the Copyright and 303 | Similar Rights licensed here. However, if You fail to comply with 304 | this Public License, then Your rights under this Public License 305 | terminate automatically. 306 | 307 | b. Where Your right to use the Licensed Material has terminated under 308 | Section 6(a), it reinstates: 309 | 310 | 1. automatically as of the date the violation is cured, provided 311 | it is cured within 30 days of Your discovery of the 312 | violation; or 313 | 314 | 2. upon express reinstatement by the Licensor. 315 | 316 | For the avoidance of doubt, this Section 6(b) does not affect any 317 | right the Licensor may have to seek remedies for Your violations 318 | of this Public License. 319 | 320 | c. For the avoidance of doubt, the Licensor may also offer the 321 | Licensed Material under separate terms or conditions or stop 322 | distributing the Licensed Material at any time; however, doing so 323 | will not terminate this Public License. 324 | 325 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 326 | License. 327 | 328 | 329 | Section 7 -- Other Terms and Conditions. 330 | 331 | a. The Licensor shall not be bound by any additional or different 332 | terms or conditions communicated by You unless expressly agreed. 333 | 334 | b. Any arrangements, understandings, or agreements regarding the 335 | Licensed Material not stated herein are separate from and 336 | independent of the terms and conditions of this Public License. 337 | 338 | 339 | Section 8 -- Interpretation. 340 | 341 | a. For the avoidance of doubt, this Public License does not, and 342 | shall not be interpreted to, reduce, limit, restrict, or impose 343 | conditions on any use of the Licensed Material that could lawfully 344 | be made without permission under this Public License. 345 | 346 | b. To the extent possible, if any provision of this Public License is 347 | deemed unenforceable, it shall be automatically reformed to the 348 | minimum extent necessary to make it enforceable. If the provision 349 | cannot be reformed, it shall be severed from this Public License 350 | without affecting the enforceability of the remaining terms and 351 | conditions. 352 | 353 | c. No term or condition of this Public License will be waived and no 354 | failure to comply consented to unless expressly agreed to by the 355 | Licensor. 356 | 357 | d. 
Nothing in this Public License constitutes or may be interpreted 358 | as a limitation upon, or waiver of, any privileges and immunities 359 | that apply to the Licensor or You, including from the legal 360 | processes of any jurisdiction or authority. 361 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Investigating Graph Embedding Neural Networks with Unsupervised Features Extraction for Binary Analysis 2 | This repository contains the code to reproduce the experiments of the paper accepted at the Workshop on Binary 3 | Analysis Research (BAR) 2019. https://ruoyuwang.me/bar2019/pdfs/bar2019-paper20.pdf 4 | 5 | ## Tasks 6 | 7 | You can use the code to solve two different tasks: 8 | 9 | - Binary Similarity with function embeddings. 10 | - Compiler Provenance 11 | 12 | ## Reproducing the experiments 13 | 14 | Following these steps you will be able to reproduce the experiments of the paper! 15 | 16 | ### Install the requirements 17 | 18 | ``` 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | ### Download datasets 23 | First you need to download at least one of the datasets. 24 | We release the three datasets used in the paper: 25 | 26 | - **OpenSSL_dataset**: It includes two versions of the OpenSSL library compiled for X86 and ARM with 27 | gcc, with optimization levels from O0 to O3. It has been used for the binary similarity task. 28 | To download it: 29 | 30 | ``` 31 | python downloader.py -op 32 | ``` 33 | 34 | 35 | - **Restricted_Compiler_Dataset**: It includes different projects compiled for X86 with gcc-3, gcc-5 and 36 | clang-3.9, with optimization levels from O0 to O3. It has been used for the compiler provenance task. To download it: 37 | 38 | ``` 39 | python downloader.py -rc 40 | ``` 41 | 42 | - **Compiler_Dataset**: It includes different projects compiled for X86 with different compilers (see the paper), with 43 | optimization levels from O0 to O3. It has been used for the compiler provenance task. This dataset is very large: 44 | you need 30 GB of free space to download it. To download it: 45 | 46 | ``` 47 | python downloader.py -c 48 | ``` 49 | 50 | 51 | ### Download word2vec model for asm 52 | 53 | Before running the experiments you need to download the word2vec model for asm. 54 | It consists of two files: the embedding matrix and the word2id file. The latter assigns an id to 55 | each instruction; the id corresponds to the row of the instruction inside the 56 | embedding matrix. 57 | 58 | ``` 59 | python downloader.py -i2v 60 | ``` 61 | 62 | 63 | ### Binary Similarity 64 | 65 | To train the network for the binary similarity task go into the binary_similarity folder and look at the file 66 | train.sh. 67 | 68 | Here you can change different parameters, such as the network architecture, the path for saving the trained model, 69 | the database you want to use for training, and the embedding matrix for asm instructions. 70 | By default the script uses the data downloaded in the previous steps. 71 | 72 | If you want to change the hyperparameters of the network take a look at the parameters.py file! 
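For reference, the main defaults set in that file are summarized below. This is only an orientation sketch: the values are copied from the Flags class in binary_similarity/parameters.py, and the dictionary itself is not read by any script in the repository.

```
# Reference only: these mirror the defaults hard-coded in binary_similarity/parameters.py (class Flags).
DEFAULT_HYPERPARAMETERS = {
    "batch_size": 250,        # minibatch size
    "num_epochs": 50,         # number of training epochs
    "embedding_size": 64,     # dimension of the graph embedding
    "learning_rate": 0.001,   # initial learning rate for the Adam optimizer
    "max_lv": 2,              # embedding depth of the mean-field network
    "T_iterations": 2,        # rounds of message passing
    "max_instructions": 150,  # instructions kept per CFG node
    "MAX_NUM_VERTICES": 150,  # maximum number of CFG nodes per function
    "rnn_depth": 2,           # depth of the RNN (RNN network type only)
    "rnn_kind": 0,            # 0: LSTM cell, 1: GRU cell
}
```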
73 | 74 | To start the training just run: 75 | 76 | ``` 77 | export PYTHONPATH=path-to-repository 78 | cd binary_similarity 79 | chmod +x train.sh 80 | ./train.sh 81 | ``` 82 | 83 | ### Compiler provenance 84 | 85 | Like in the previous case just run: 86 | 87 | ``` 88 | export PYTHONPATH=path-to-repository 89 | cd compiler_provenance 90 | chmod +x train.sh 91 | ./train.sh 92 | ``` 93 | 94 | ## Creating your own dataset 95 | 96 | Following these steps you will be able to create your own dataset! 97 | 98 | - Install radare2 on your system. 99 | 100 | - Put the executables you want to add to your dataset inside a directory tree as follows: 101 | 102 | ``` 103 | dataset_root/ 104 | \ 105 | \--project/ 106 | \--compiler 107 | \--optimization 108 | \executables 109 | ``` 110 | 111 | For example you will end up with a tree like: 112 | 113 | ``` 114 | my_dataset/ 115 | \ 116 | \--openSSL/ 117 | \--gcc-3 118 | \--O1 119 | \executables 120 | \--O0 121 | \executables 122 | \--binutil/ 123 | \--gcc-3 124 | \--O1 125 | \executables 126 | \--gcc-5 127 | \--O1 128 | \executables 129 | ``` 130 | 131 | - Once you have your executables in the correct paths just launch: 132 | 133 | ``` 134 | python dataset_creation/ExperimentUtil.py -db name_of_the_db -b --dir dataset_root [-s (if you want to use debug symbols)] 135 | ``` 136 | 137 | - To split your dataset into train, validation and test sets you can use the following command: 138 | 139 | ``` 140 | python dataset_creation/ExperimentUtil.py -db name_of_the_db -s 141 | ``` 142 | 143 | 144 | 145 | 146 | ## Citation 147 | If you use this repository or the datasets for your project please cite: 148 | 149 | Massarelli L., Di Luna G. A., Petroni F., Querzoni L., Baldoni R. Investigating Graph Embedding Neural Networks with Unsupervised Features Extraction for Binary Analysis. To Appear in: Workshop on Binary Analysis Research (BAR) colocated with Symposium on Network and Distributed System Security (NDSS). 2019. 150 | 151 | ## Acknowledgement 152 | 153 | In our code we use godown to download data from Google Drive. We thank circulosmeos, the creator of godown. 
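As a bridge to the source files that follow, here is a minimal sketch of how the i2v files downloaded above are consumed by the helpers in asm_embedding/. The word2id path and the instruction strings are illustrative assumptions; the classes and methods are the ones defined in asm_embedding/InstructionsConverter.py and asm_embedding/FunctionNormalizer.py below.

```
from asm_embedding.InstructionsConverter import InstructionsConverter
from asm_embedding.FunctionNormalizer import FunctionNormalizer

# Assumed path: point it to the word2id file obtained with `python downloader.py -i2v`.
converter = InstructionsConverter("data/i2v/word2id.json")
normalizer = FunctionNormalizer(max_instruction=150)  # 150 matches max_instructions in parameters.py

# Hypothetical mnemonics: the 'X_' prefix marks x86 instructions, 'A_' marks ARM ones;
# anything not in the vocabulary falls back to the X_UNK / A_UNK entries.
ids = converter.convert_to_ids(["X_mov", "X_xor", "X_ret"])
padded, length = normalizer.normalize(ids)  # zero-padded (or truncated) to 150 ids, plus the original length
```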
154 | 155 | -------------------------------------------------------------------------------- /asm_embedding/FunctionNormalizer.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import numpy as np 7 | 8 | 9 | class FunctionNormalizer: 10 | 11 | def __init__(self, max_instruction): 12 | self.max_instructions = max_instruction 13 | 14 | def normalize(self, f): 15 | f = np.asarray(f[0:self.max_instructions]) 16 | length = f.shape[0] 17 | if f.shape[0] < self.max_instructions: 18 | f = np.pad(f, (0, self.max_instructions - f.shape[0]), mode='constant') 19 | return f, length 20 | 21 | def normalize_function_pairs(self, pairs): 22 | lengths = [] 23 | new_pairs = [] 24 | for x in pairs: 25 | f0, len0 = self.normalize(x[0]) 26 | f1, len1 = self.normalize(x[1]) 27 | lengths.append((len0, len1)) 28 | new_pairs.append((f0, f1)) 29 | return new_pairs, lengths 30 | 31 | def normalize_functions(self, functions): 32 | lengths = [] 33 | new_functions = [] 34 | for f in functions: 35 | f, length = self.normalize(f) 36 | lengths.append(length) 37 | new_functions.append(f) 38 | return new_functions, lengths 39 | -------------------------------------------------------------------------------- /asm_embedding/InstructionsConverter.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import json 7 | 8 | 9 | class InstructionsConverter: 10 | 11 | def __init__(self, json_i2id): 12 | f = open(json_i2id, 'r') 13 | self.i2id = json.load(f) 14 | f.close() 15 | 16 | def convert_to_ids(self, instructions_list): 17 | ret_array = [] 18 | # For each instruction we add +1 to its ID because the first 19 | # element of the embedding matrix is zero 20 | for x in instructions_list: 21 | if x in self.i2id: 22 | ret_array.append(self.i2id[x] + 1) 23 | elif 'X_' in x: 24 | # print(str(x) + " is not a known x86 instruction") 25 | ret_array.append(self.i2id['X_UNK'] + 1) 26 | elif 'A_' in x: 27 | # print(str(x) + " is not a known arm instruction") 28 | ret_array.append(self.i2id['A_UNK'] + 1) 29 | else: 30 | # print("There is a problem " + str(x) + " does not appear to be an asm or arm instruction") 31 | ret_array.append(self.i2id['X_UNK'] + 1) 32 | return ret_array 33 | 34 | 35 | -------------------------------------------------------------------------------- /asm_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # -------------------------------------------------------------------------------- /binary_similarity/PairFactory.py: -------------------------------------------------------------------------------- 1 | import json 2 | from multiprocessing import Queue 3 | import networkx as nx 4 | from networkx import json_graph 5 | import numpy as np 6 | from scipy import sparse 7 | import sqlite3 8 | from threading import Thread 9 | from binary_similarity.utils import __padAndFilterLSTM as padAndFilterLSTM 10 | from binary_similarity.utils import __padAndFilter as padAndFilter 11 | from asm_embedding.InstructionsConverter import InstructionsConverter 12 | from 
asm_embedding.FunctionNormalizer import FunctionNormalizer 13 | 14 | class DatasetGenerator: 15 | 16 | def get_dataset(self, epoch_number): 17 | pass 18 | 19 | 20 | class PairFactory(DatasetGenerator): 21 | 22 | def __init__(self, db_name, feature_type, dataset_type, json_asm2id, max_instructions, max_num_vertices): 23 | self.db_name = db_name 24 | self.feature_type = feature_type 25 | self.dataset_type = dataset_type 26 | self.max_instructions = max_instructions 27 | self.max_num_vertices = max_num_vertices 28 | self.batch_dim = 0 29 | self.num_pairs = 0 30 | self.num_batches = 0 31 | self.converter = InstructionsConverter(json_asm2id) 32 | self.normalizer = FunctionNormalizer(self.max_instructions) 33 | 34 | def get_data_from_cfg(self, cfg): 35 | adj = sparse.csr_matrix([1,1]) 36 | lenghts = [] 37 | node_matrix = [] 38 | 39 | try: 40 | adj = nx.adjacency_matrix(cfg) 41 | nodes = cfg.nodes(data=True) 42 | for i, n in enumerate(nodes): 43 | filtered = self.converter.convert_to_ids(n[1]['features']) 44 | lenghts.append(len(filtered)) 45 | node_matrix.append(self.normalizer.normalize(filtered)[0]) 46 | except: 47 | pass 48 | return adj, node_matrix, lenghts 49 | 50 | def remove_bad_acfg_node(self, g): 51 | nodeToRemove = [] 52 | for n in g.nodes(data=True): 53 | f = n[1]['features'] 54 | if len(f.keys()) == 0: 55 | nodeToRemove.append(n[0]) 56 | for n in nodeToRemove: 57 | g.remove_node(n) 58 | return g 59 | 60 | def get_node_matrix(self, nodes): 61 | num_node = len(nodes) 62 | node_matrix = np.zeros([num_node, 8]) 63 | for i, n in enumerate(nodes): 64 | f = n[1]['features'] 65 | if isinstance(f['constant'], list): 66 | node_matrix[i, 0] = len(f['constant']) 67 | else: 68 | node_matrix[i, 0] = f['constant'] 69 | if isinstance(f['string'], list): 70 | node_matrix[i, 1] = len(f['string']) 71 | else: 72 | node_matrix[i, 1] = f['string'] 73 | node_matrix[i, 2] = f['transfer'] 74 | node_matrix[i, 3] = f['call'] 75 | node_matrix[i, 4] = f['instruction'] 76 | node_matrix[i, 5] = f['arith'] 77 | node_matrix[i, 6] = f['offspring'] 78 | node_matrix[i, 7] = f['betweenness'] 79 | return node_matrix 80 | 81 | def get_data_from_acfg(self, g): 82 | g = self.remove_bad_acfg_node(g) 83 | if len(g.nodes) > 0: 84 | adj = nx.adjacency_matrix(g) 85 | node_matrix = self.get_node_matrix(g.nodes(data=True)) 86 | else: 87 | adj = sparse.bsr_matrix(np.zeros([1, 1])) 88 | node_matrix = np.zeros([1, 8]) 89 | lenght = 8 90 | return adj, node_matrix, lenght 91 | 92 | def async_chunker(self, epoch, number_of_pairs, shuffle=True): 93 | self.num_pairs = 0 94 | 95 | conn = sqlite3.connect(self.db_name) 96 | cur = conn.cursor() 97 | q = cur.execute("SELECT true_pair, false_pair from " + self.dataset_type + " WHERE id=?", (int(epoch),)) 98 | true_pairs_id, false_pairs_id = q.fetchone() 99 | true_pairs_id = json.loads(true_pairs_id) 100 | false_pairs_id = json.loads(false_pairs_id) 101 | 102 | assert len(true_pairs_id) == len(false_pairs_id) 103 | data_len = len(true_pairs_id) 104 | 105 | print("Data Len: " + str(data_len)) 106 | conn.close() 107 | 108 | n_chunk = int(data_len / (number_of_pairs/2)) - 1 109 | self.num_batches = n_chunk 110 | 111 | q = Queue(maxsize=50) 112 | 113 | t = Thread(target=self.async_create_pairs, args=(epoch, n_chunk, number_of_pairs, q)) 114 | t.start() 115 | 116 | for i in range(0, n_chunk): 117 | yield self.async_get_dataset(i, n_chunk, number_of_pairs, q, shuffle) 118 | 119 | def get_pair_from_db(self, epoch_number, chunk, number_of_pairs): 120 | 121 | conn = sqlite3.connect(self.db_name) 122 | 
cur = conn.cursor() 123 | 124 | pairs = [] 125 | labels = [] 126 | lenghts = [] 127 | 128 | q = cur.execute("SELECT true_pair, false_pair from " + self.dataset_type + " WHERE id=?", (int(epoch_number),)) 129 | true_pairs_id, false_pairs_id = q.fetchone() 130 | 131 | true_pairs_id = json.loads(true_pairs_id) 132 | false_pairs_id = json.loads(false_pairs_id) 133 | 134 | data_len = len(true_pairs_id) 135 | 136 | i = 0 137 | 138 | while i < number_of_pairs: 139 | if chunk * int(number_of_pairs/2) + i > data_len: 140 | break 141 | 142 | p = true_pairs_id[chunk * int(number_of_pairs/2) + i] 143 | q0 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[0],)) 144 | if self.feature_type == 'acfg': 145 | adj0, node0, lenghts0 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 146 | elif self.feature_type == 'lstm_cfg': 147 | adj0, node0, lenghts0 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 148 | 149 | q1 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[1],)) 150 | if self.feature_type == 'acfg': 151 | adj1, node1, lenghts1 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 152 | elif self.feature_type == 'lstm_cfg': 153 | adj1, node1, lenghts1 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q1.fetchone()[0]))) 154 | 155 | pairs.append(((adj0, node0), (adj1, node1))) 156 | lenghts.append([lenghts0, lenghts1]) 157 | labels.append(+1) 158 | 159 | p = false_pairs_id[chunk * int(number_of_pairs/2) + i] 160 | q0 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[0],)) 161 | if self.feature_type == 'acfg': 162 | adj0, node0,lenghts0 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 163 | elif self.feature_type == 'lstm_cfg': 164 | adj0, node0, lenghts0 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 165 | 166 | q1 = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", (p[1],)) 167 | if self.feature_type == 'acfg': 168 | adj1, node1, lenghts1 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q0.fetchone()[0]))) 169 | elif self.feature_type == 'lstm_cfg': 170 | adj1, node1, lenghts1 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q1.fetchone()[0]))) 171 | 172 | pairs.append(((adj0, node0), (adj1, node1))) 173 | lenghts.append([lenghts0, lenghts1]) 174 | labels.append(-1) 175 | 176 | i += 2 177 | if self.feature_type == 'acfg': 178 | pairs, labels, output_len = padAndFilter(pairs, labels, self.max_num_vertices) 179 | elif self.feature_type == 'lstm_cfg': 180 | pairs, labels, output_len = padAndFilterLSTM(pairs, labels, lenghts, self.max_num_vertices) 181 | return pairs, labels, output_len 182 | 183 | def async_create_pairs(self, epoch, n_chunk, number_of_pairs, q): 184 | for i in range(0, n_chunk): 185 | pairs, y_, lenghts = self.get_pair_from_db(epoch, i, number_of_pairs) 186 | q.put((pairs, y_, lenghts), block=True) 187 | 188 | def async_get_dataset(self, chunk, n_chunk, number_of_pairs, q, shuffle): 189 | 190 | item = q.get() 191 | pairs = item[0] 192 | y_ = item[1] 193 | lenghts = item[2] 194 | 195 | assert (len(pairs) == len(y_)) 196 | n_samples = len(pairs) 197 | self.batch_dim = n_samples 198 | self.num_pairs += n_samples 199 | 200 | # Unpack the list 201 | graph1, graph2 = zip(*pairs) 202 | len1, len2 = zip(*lenghts) 203 
| adj1, nodes1 = zip(*graph1) 204 | adj2, nodes2 = zip(*graph2) 205 | 206 | if shuffle: 207 | shuffle_indices = np.random.permutation(np.arange(n_samples)) 208 | adj1 = np.array(adj1)[shuffle_indices] 209 | nodes1 = np.array(nodes1)[shuffle_indices] 210 | adj2 = np.array(adj2)[shuffle_indices] 211 | nodes2 = np.array(nodes2)[shuffle_indices] 212 | y_ = np.array(y_)[shuffle_indices] 213 | 214 | for i in range(0, n_samples, number_of_pairs): 215 | upper_bound = min(i + number_of_pairs, n_samples) 216 | 217 | ret_adj1 = adj1[i:upper_bound] 218 | ret_nodes1 = nodes1[i:upper_bound] 219 | ret_len1=len1[i:upper_bound] 220 | ret_adj2 = adj2[i:upper_bound] 221 | ret_nodes2 = nodes2[i:upper_bound] 222 | ret_len2 = len2[i:upper_bound] 223 | ret_y = y_[i:upper_bound] 224 | 225 | return ret_adj1, ret_nodes1, ret_adj2, ret_nodes2, ret_y, ret_len1, ret_len2 -------------------------------------------------------------------------------- /binary_similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # -------------------------------------------------------------------------------- /binary_similarity/parameters.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import argparse 8 | import time 9 | import os 10 | import logging 11 | 12 | def getLogger(logfile): 13 | logger = logging.getLogger(__name__) 14 | hdlr = logging.FileHandler(logfile) 15 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 16 | hdlr.setFormatter(formatter) 17 | logger.addHandler(hdlr) 18 | logger.setLevel(logging.INFO) 19 | return logger, hdlr 20 | 21 | class Flags: 22 | 23 | def __init__(self): 24 | parser = argparse.ArgumentParser(description=' cryptoarb.') 25 | 26 | parser.add_argument("-o", "--output", dest="output_file", help="output directory for logging and models", required=False) 27 | parser.add_argument("-e", "--embedding_matrix", dest="embedding_matrix", help="file with the embedding matrix for the instructions",required=False) 28 | parser.add_argument("-j", "--json_asm2id", dest="json_asm2id",help="file with the dictionary of instructions ids", required=False) 29 | parser.add_argument("-n", "--dbName", dest="db_name", help="Name of the database", required=False) 30 | parser.add_argument("-ld","--load_dir", dest="load_dir", help="Load the model from directory load_dir", required=False) 31 | parser.add_argument("-nn","--network_type", help="network type: Arith_Mean, Weighted_Mean, RNN, CCS", required=True, dest="network_type") 32 | parser.add_argument("-r", "--random", help="if present the network use random embedder", default=False, action="store_true", dest="random_embedding", required=False) 33 | parser.add_argument("-te","--trainable_embedding", help="if present the network consider the embedding as trainable", action="store_true", dest="trainable_embeddings", default=False) 34 | parser.add_argument("-cv","--cross_val", help="if present the training is done with cross validiation", default=False, action="store_true", dest="cross_val") 35 | 36 | args = parser.parse_args() 37 | self.network_type = args.network_type 38 | 39 | if self.network_type == "Annotations": 40 | self.feature_type = 'acfg' 41 | elif 
self.network_type in ["Arith_Mean", "Attention_Mean", "RNN"]: 42 | self.feature_type = 'lstm_cfg' 43 | else: 44 | print("ERROR NETWORK NOT FOUND") 45 | exit(0) 46 | 47 | self.batch_size = 250 # minibatch size (-1 = whole dataset) 48 | self.num_epochs = 50 # number of epochs 49 | self.embedding_size = 64 # dimension of latent layers 50 | self.learning_rate = 0.001 # init learning_rate 51 | self.max_lv = 2 # embedd depth 52 | self.T_iterations= 2 # max rounds of message passing 53 | self.l2_reg_lambda = 0 # 0.002 #0.002 # regularization coefficient 54 | self.num_checkpoints = 1 # max number of checkpoints 55 | self.out_dir = args.output_file # directory for logging 56 | self.db_name = args.db_name 57 | self.load_dir=str(args.load_dir) 58 | self.random_embedding = args.random_embedding 59 | self.trainable_embeddings = args.trainable_embeddings 60 | self.cross_val = args.cross_val 61 | self.cross_val_fold = 5 62 | 63 | self.rnn_depth = 2 # depth of the rnn 64 | self.max_instructions = 150 # number of instructions 65 | self.rnn_kind = 0 #kind of rnn cell 0: lstm cell 1: GRU cell 66 | 67 | self.seed = 2 # random seed 68 | 69 | self.reset_logdir() 70 | 71 | self.file_embedding_matrix = args.embedding_matrix 72 | self.json_asm2id = args.json_asm2id 73 | 74 | self.MAX_NUM_VERTICES = 150 75 | self.MIN_NUM_VERTICES = 1 76 | 77 | def reset_logdir(self): 78 | # create logdir 79 | timestamp = str(int(time.time())) 80 | self.logdir = os.path.abspath(os.path.join(self.out_dir, "runs", timestamp)) 81 | os.makedirs(self.logdir, exist_ok=True) 82 | 83 | # create logger 84 | self.log_file = str(self.logdir)+'/console.log' 85 | self.logger, self.hdlr = getLogger(self.log_file) 86 | 87 | # create symlink for last_run 88 | sym_path_logdir = str(self.out_dir)+"/last_run" 89 | try: 90 | os.unlink(sym_path_logdir) 91 | except: 92 | pass 93 | try: 94 | os.symlink(self.logdir, sym_path_logdir) 95 | except: 96 | print("\nfailed to create symlink!\n") 97 | 98 | def close_log(self): 99 | self.hdlr.close() 100 | self.logger.removeHandler(self.hdlr) 101 | handlers = self.logger.handlers[:] 102 | for handler in handlers: 103 | handler.close() 104 | self.logger.removeHandler(handler) 105 | 106 | def __str__(self): 107 | msg = "" 108 | msg +="\n Parameters:\n" 109 | msg +="\tNetwork_Type: {}\n".format(self.network_type) 110 | msg +="\tRandom embedding: {}\n".format(self.random_embedding) 111 | msg +="\tTrainable embedding: {}\n".format(self.trainable_embeddings) 112 | msg +="\tFeature Type: {}\n".format(self.feature_type) 113 | msg +="\tlogdir: {}\n".format(self.logdir) 114 | msg +="\tbatch_size: {}\n".format(self.batch_size) 115 | msg +="\tnum_epochs: {}\n".format(self.num_epochs) 116 | msg +="\tembedding_size: {}\n".format(self.embedding_size) 117 | msg +="\tlearning_rate: {}\n".format(self.learning_rate) 118 | msg +="\tmax_lv: {}\n".format(self.max_lv) 119 | msg +="\tT_iterations: {}\n".format(self.T_iterations) 120 | msg +="\tl2_reg_lambda: {}\n".format(self.l2_reg_lambda) 121 | msg +="\tnum_checkpoints: {}\n".format(self.num_checkpoints) 122 | msg +="\tseed: {}\n".format(self.seed) 123 | msg +="\tMAX_NUM_VERTICES: {}\n".format(self.MAX_NUM_VERTICES) 124 | msg += "\tMax Instructions per cfg node: {}\n".format(self.max_instructions) 125 | if self.network_type == "RNN" or self.network_type=="Attention": 126 | msg += "\tRNN type (0, lstm; 1, GRU): {}\n".format(self.rnn_kind) 127 | msg += "\tRNN Depth: {}\n".format(self.rnn_depth) 128 | if self.network_type== "Attention": 129 | msg += "\tAttention 
hops:{}\n".format(self.attention_hops) 130 | msg += "\tAttention depth:{}\n".format(self.attention_detph) 131 | if self.network_type=="RNN_SINGLE": 132 | msg += "\tAttention hops:{}\n".format(self.attention_hops) 133 | msg += "\tAttention depth:{}\n".format(self.attention_detph) 134 | msg += "\tDense Layer Size:{}\n".format(self.dense_layer_size) 135 | return msg 136 | -------------------------------------------------------------------------------- /binary_similarity/s2v_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | # structure2vec 5 | # DE-MF : discriminative embedding using Mean Field 6 | # SAFE TEAM 7 | # 8 | # 9 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 10 | # 11 | 12 | class Network: 13 | 14 | def __init__(self, 15 | features_size, 16 | embedding_size, 17 | max_lv, 18 | T_iterations, 19 | learning_rate, 20 | l2_reg_lambda 21 | ): 22 | self.features_size = features_size 23 | self.embedding_size = embedding_size 24 | self.max_lv = max_lv 25 | self.T_iterations = T_iterations 26 | self.learning_rate=learning_rate 27 | self.l2_reg_lambda = l2_reg_lambda 28 | self.generateGraphClassificationNetwork() 29 | 30 | def meanField(self, input_x, input_adj, name): 31 | 32 | W1_tiled = tf.tile(tf.expand_dims(self.W1,0), [tf.shape(input_x)[0],1,1], name=name + "_W1_tiled") 33 | W2_tiled = tf.tile(tf.expand_dims(self.W2,0), [tf.shape(input_x)[0],1,1], name=name + "_W2_tiled") 34 | 35 | CONV_PARAMS_tiled = [] 36 | for lv in range(self.max_lv): 37 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv],0), [tf.shape(input_x)[0],1,1], name=name + "_CONV_PARAMS_tiled_" + str(lv))) 38 | 39 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 40 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 41 | out=w1xv 42 | for i in range(self.T_iterations-1): 43 | ol = l 44 | lv = self.max_lv -1 45 | while lv >= 0 : 46 | with tf.name_scope('cell_' + str(lv)) as scope: 47 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 48 | if lv > 0: 49 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 50 | else: 51 | ol = node_linear 52 | lv -= 1 53 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 54 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 55 | 56 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, name=name + "_y_potential_expand_dims") 57 | 58 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 59 | return graph_embedding 60 | 61 | 62 | def generateGraphClassificationNetwork(self): 63 | 64 | # Placeholders for input, output 65 | self.x_1 = tf.placeholder(tf.float32,[None, None,self.features_size], name = "x_1") # Vettore del nodo in input 1 66 | self.adj_1 = tf.placeholder(tf.float32,[None, None, None],name="adj_1") # Matrice di adiacenza 1 67 | self.x_2 = tf.placeholder(tf.float32,[None, None,self.features_size], name = "x_2") # Vettore del nodo in input 2 68 | self.adj_2 = tf.placeholder(tf.float32,[None, None, None],name="adj_2") # Matrice di adiacenza 2 69 | self.y = tf.placeholder(tf.float32, [None], name='y_') 70 | 71 | self.lenghts_1 = tf.placeholder(tf.float32, [None], name="len1") 72 | self.lenghts_2 = tf.placeholder(tf.float32, [None], name="len2") 73 | 74 | self.norms = [] 75 | 76 | l2_loss = tf.constant(0.0) 77 | 78 | # 
------------------------------- 79 | # 1. MEAN FIELD COMPONENT 80 | # ------------------------------- 81 | 82 | #1. parameters for MeanField 83 | with tf.name_scope('parameters_MeanField'): 84 | 85 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 86 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size,self.embedding_size], stddev=0.1), name="W1") 87 | self.norms.append(tf.norm(self.W1)) 88 | 89 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 90 | self.CONV_PARAMS = [] 91 | for lv in range(self.max_lv): 92 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="CONV_PARAMS_"+str(lv)) 93 | self.CONV_PARAMS.append(v) 94 | self.norms.append(tf.norm(v)) 95 | 96 | # W2 is another [p,p] matrix to transform the embedding vector 97 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="W2") 98 | self.norms.append(tf.norm(self.W2)) 99 | 100 | # Mean Field 101 | with tf.name_scope('MeanField1'): 102 | self.graph_embedding_1 = tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_1,self.adj_1,"MeanField1"), axis=1), axis=1,name="embedding1") # tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_1,self.adj_1), axis=1), 1) 103 | 104 | with tf.name_scope('MeanField2'): 105 | self.graph_embedding_2 = tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_2,self.adj_2,"MeanField2"), axis=1), axis=1,name="embedding2") # tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_2,self.adj_2), axis=1), 1) 106 | 107 | with tf.name_scope('Siamese'): 108 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1,name="cosSimilarity") 109 | 110 | # Regularization 111 | with tf.name_scope("Regularization"): 112 | l2_loss += tf.nn.l2_loss(self.W1) 113 | for lv in range(self.max_lv): 114 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 115 | l2_loss += tf.nn.l2_loss(self.W2) 116 | 117 | # CalculateMean cross-entropy loss 118 | with tf.name_scope("Loss"): 119 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 120 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss 121 | 122 | # Train step 123 | with tf.name_scope("Train_Step"): 124 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 125 | -------------------------------------------------------------------------------- /binary_similarity/s2v_network_arith_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import tensorflow as tf 7 | 8 | 9 | # structure2vec 10 | # DE-MF : discriminative embedding using Mean Field 11 | 12 | 13 | class NetworkLSTM: 14 | 15 | def __init__(self, 16 | features_size, 17 | embedding_size, 18 | max_lv, 19 | T_iterations, 20 | learning_rate, 21 | l2_reg_lambda, 22 | batch_size, 23 | max_instructions, 24 | max_nodes, 25 | rnn_depth, 26 | rnn_kind, 27 | embedding_matrix, 28 | trainable_embeddings 29 | ): 30 | print("Features size"+str(features_size)) 31 | self.features_size = features_size 32 | self.embedding_size = embedding_size 33 | self.max_lv = max_lv 34 | self.T_iterations = T_iterations 35 | self.learning_rate = learning_rate 36 | self.l2_reg_lambda = l2_reg_lambda 37 | self.RRN_HIDDEN = features_size 38 | self.batch_size = batch_size 
39 | self.max_instructions = max_instructions 40 | self.max_nodes = max_nodes 41 | self.rnn_depth = rnn_depth 42 | self.rnn_kind=rnn_kind 43 | self.embedding_matrix = embedding_matrix 44 | self.trainable_embeddings = trainable_embeddings 45 | self.generateGraphClassificationNetwork() 46 | 47 | def extract_axis_1(self, data, ind): 48 | """ 49 | Get specified elements along the first axis of tensor. 50 | :param data: Tensorflow tensor that will be subsetted. 51 | :param ind: Indices to take (one for each element along axis 0 of data). 52 | :return: Subsetted tensor. 53 | """ 54 | ind=tf.nn.relu(ind-1) 55 | batch_range = tf.range(tf.shape(data)[0]) 56 | indices = tf.stack([batch_range, ind], axis=1) 57 | res = tf.gather_nd(data, indices) 58 | 59 | return res 60 | 61 | def create_flattening_array(self, max_nodes, batch_size): 62 | shape_array = [] 63 | for p in range(0, batch_size): 64 | for i in range(0, max_nodes): 65 | shape_array.append([p, i]) 66 | return shape_array 67 | 68 | def create_gather_array(self, max_nodes, batch_size): 69 | shape_array = [] 70 | for p in range(0, batch_size): 71 | x = [] 72 | for i in range(0, max_nodes): 73 | x.append([0, i + p * max_nodes]) 74 | shape_array.append(x) 75 | return shape_array 76 | 77 | def lstmFeatures(self, input_x, lengths): 78 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 79 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 80 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.reduce_mean(flattened_embedded, name='arith_mean', axis=1), axis=1)) 81 | print("shape: " + str(tf.shape(last_outputs))) 82 | gather_output2 = tf.reshape(last_outputs, [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 83 | output = tf.identity(gather_output2, name="LSTMOutput") 84 | output = tf.nn.l2_normalize(output) 85 | return output 86 | 87 | def meanField(self, input_x, input_adj, name): 88 | 89 | # for batch processing 90 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 91 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 92 | 93 | CONV_PARAMS_tiled = [] 94 | for lv in range(self.max_lv): 95 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 96 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 97 | 98 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 99 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 100 | out = w1xv 101 | for i in range(self.T_iterations - 1): 102 | ol = l 103 | lv = self.max_lv - 1 104 | while lv >= 0: 105 | with tf.name_scope('cell_' + str(lv)) as scope: 106 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 107 | if lv > 0: 108 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 109 | else: 110 | ol = node_linear 111 | lv -= 1 112 | 113 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 114 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 115 | 116 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 117 | name=name + "_y_potential_expand_dims") 118 | 119 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 120 | return graph_embedding 121 | 122 | def generateGraphClassificationNetwork(self): 123 | print("Features size:"+str(self.features_size)) 124 | 125 | self.instruction_embeddings_t = 
tf.Variable(initial_value=tf.constant(self.embedding_matrix), 126 | trainable=self.trainable_embeddings, 127 | name="instruction_embedding", dtype=tf.float32) 128 | 129 | self.x_1 = tf.placeholder(tf.int32, [None, None, self.max_instructions],name="x_1") 130 | self.adj_1 = tf.placeholder(tf.float32, [None, None, None], name="adj_1") # 131 | self.lenghts_1 = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 132 | self.x_2 = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_2") 133 | self.adj_2 = tf.placeholder(tf.float32, [None, None, None], name="adj_2") 134 | self.lenghts_2 = tf.placeholder(tf.int32, [None,None], name='lenghts_2') 135 | self.y = tf.placeholder(tf.float32, [None], name='y_') 136 | 137 | # Euclidean norms; p = 2 138 | self.norms = [] 139 | 140 | l2_loss = tf.constant(0.0) 141 | 142 | # ------------------------------- 143 | # 1. MEAN FIELD COMPONENT 144 | # ------------------------------- 145 | 146 | # 1. parameters for MeanField 147 | with tf.name_scope('parameters_MeanField'): 148 | 149 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 150 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 151 | self.norms.append(tf.norm(self.W1)) 152 | 153 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 154 | self.CONV_PARAMS = [] 155 | for lv in range(self.max_lv): 156 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 157 | name="CONV_PARAMS_" + str(lv)) 158 | self.CONV_PARAMS.append(v) 159 | self.norms.append(tf.norm(v)) 160 | 161 | # W2 is another [p,p] matrix to transform the embedding vector 162 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 163 | name="W2") 164 | self.norms.append(tf.norm(self.W2)) 165 | 166 | # LSTMExtraction 167 | with tf.name_scope('LSTMExtraction1'): 168 | with tf.variable_scope('lstm1'): 169 | self.x_1_after_lstm = self.lstmFeatures(self.x_1, self.lenghts_1) 170 | with tf.name_scope('LSTMExtraction2'): 171 | with tf.variable_scope('lstm2'): 172 | self.x2_after_lstm = self.lstmFeatures(self.x_2, self.lenghts_2) 173 | 174 | # Mean Field 175 | with tf.name_scope('MeanField1'): 176 | self.graph_embedding_1 = tf.nn.l2_normalize( 177 | tf.squeeze(self.meanField(self.x_1_after_lstm, self.adj_1, "MeanField1"), axis=1), axis=1, 178 | name="embedding1") 179 | 180 | with tf.name_scope('MeanField2'): 181 | self.graph_embedding_2 = tf.nn.l2_normalize( 182 | tf.squeeze(self.meanField(self.x2_after_lstm, self.adj_2, "MeanField2"), axis=1), axis=1, 183 | name="embedding2") 184 | 185 | with tf.name_scope('Siamese'): 186 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1, 187 | name="cosSimilarity") 188 | 189 | # Regularization 190 | with tf.name_scope("Regularization"): 191 | l2_loss += tf.nn.l2_loss(self.W1) 192 | for lv in range(self.max_lv): 193 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 194 | l2_loss += tf.nn.l2_loss(self.W2) 195 | 196 | # CalculateMean cross-entropy loss 197 | with tf.name_scope("Loss"): 198 | 199 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 200 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 201 | 202 | # Train step 203 | with tf.name_scope("Train_Step"): 204 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 205 | 
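Note on the file above: lstmFeatures/meanField implement the arithmetic-mean variant of the structure2vec embedder — each basic block is the l2-normalised mean of its instruction embeddings, and meanField then runs T_iterations rounds of message passing over the CFG adjacency matrix before sum-pooling the node vectors into a graph embedding. The NumPy sketch below is illustration only: the helper name mean_field_numpy and the toy sizes are assumptions made for the example, but the update it performs mirrors meanField for a single, un-batched graph.

import numpy as np

def mean_field_numpy(nodes, adj, W1, conv_params, W2, T_iterations):
    # nodes: [num_nodes, d] block features, adj: [num_nodes, num_nodes] adjacency
    w1xv = nodes @ W1                     # per-node linear map -> [num_nodes, p]
    out = w1xv
    l = adj @ w1xv                        # aggregate messages from neighbours
    for _ in range(T_iterations - 1):
        ol = l
        for lv in reversed(range(len(conv_params))):
            ol = ol @ conv_params[lv]     # stacked CONV_PARAMS layers
            if lv > 0:
                ol = np.maximum(ol, 0.0)  # ReLU between levels, linear at level 0
        out = np.tanh(w1xv + ol)          # updated node embeddings mu_v
        l = adj @ out                     # propagate for the next round
    graph_vec = out.sum(axis=0) @ W2      # sum-pool the nodes, transform with W2
    return graph_vec / np.linalg.norm(graph_vec)  # l2-normalised graph embedding

# toy usage: d=8 node features, p=64 embedding size, 5 nodes, max_lv=2, T_iterations=2
rng = np.random.default_rng(0)
emb = mean_field_numpy(rng.normal(size=(5, 8)),
                       (rng.random((5, 5)) < 0.3).astype(float),
                       rng.normal(scale=0.1, size=(8, 64)),
                       [rng.normal(scale=0.1, size=(64, 64)) for _ in range(2)],
                       rng.normal(scale=0.1, size=(64, 64)),
                       T_iterations=2)
print(emb.shape)  # (64,)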
-------------------------------------------------------------------------------- /binary_similarity/s2v_network_attention_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | 10 | # structure2vec 11 | # DE-MF : discriminative embedding using Mean Field 12 | 13 | 14 | class NetworkLSTM: 15 | 16 | def __init__(self, 17 | features_size, 18 | embedding_size, 19 | max_lv, 20 | T_iterations, 21 | learning_rate, 22 | l2_reg_lambda, 23 | batch_size, 24 | max_instructions, 25 | max_nodes, 26 | rnn_depth, 27 | rnn_kind, 28 | embedding_matrix, 29 | trainable_embeddings 30 | ): 31 | print("Features size"+str(features_size)) 32 | self.features_size = features_size 33 | self.embedding_size = embedding_size 34 | self.max_lv = max_lv 35 | self.T_iterations = T_iterations 36 | self.learning_rate = learning_rate 37 | self.l2_reg_lambda = l2_reg_lambda 38 | self.RRN_HIDDEN = features_size 39 | self.batch_size = batch_size 40 | self.max_instructions = max_instructions 41 | self.max_nodes = max_nodes 42 | self.rnn_depth = rnn_depth 43 | self.rnn_kind=rnn_kind 44 | self.embedding_matrix = embedding_matrix 45 | self.trainable_embeddings = trainable_embeddings 46 | self.generateGraphClassificationNetwork() 47 | 48 | def extract_axis_1(self, data, ind): 49 | """ 50 | Get specified elements along the first axis of tensor. 51 | :param data: Tensorflow tensor that will be subsetted. 52 | :param ind: Indices to take (one for each element along axis 0 of data). 53 | :return: Subsetted tensor. 54 | """ 55 | ind=tf.nn.relu(ind-1) 56 | batch_range = tf.range(tf.shape(data)[0]) 57 | indices = tf.stack([batch_range, ind], axis=1) 58 | res = tf.gather_nd(data, indices) 59 | 60 | return res 61 | 62 | def create_flattening_array(self, max_nodes, batch_size): 63 | shape_array = [] 64 | for p in range(0, batch_size): 65 | for i in range(0, max_nodes): 66 | shape_array.append([p, i]) 67 | return shape_array 68 | 69 | def create_gather_array(self, max_nodes, batch_size): 70 | shape_array = [] 71 | for p in range(0, batch_size): 72 | x = [] 73 | for i in range(0, max_nodes): 74 | x.append([0, i + p * max_nodes]) 75 | shape_array.append(x) 76 | return shape_array 77 | 78 | def lstmFeatures(self, input_x, lengths): 79 | 80 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 81 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 82 | self.W0 = tf.Variable(tf.constant(1.0 / self.max_instructions, shape=[1, self.max_instructions]), name="W0") 83 | w0_tiled = tf.tile(tf.expand_dims(self.W0, 0), [tf.shape(flattened_inputs)[0], 1, 1], name="W0_tiled") 84 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.matmul(w0_tiled, flattened_embedded, 85 | name='features_weighted_mean')), axis=1) 86 | gather_output2 = tf.reshape(last_outputs, [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 87 | output = tf.identity(gather_output2, name="LSTMOutput") 88 | output=tf.nn.l2_normalize(output) 89 | return output 90 | 91 | def meanField(self, input_x, input_adj, name): 92 | 93 | # for batch processing 94 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 95 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 96 | 97 | 
CONV_PARAMS_tiled = [] 98 | for lv in range(self.max_lv): 99 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 100 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 101 | 102 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 103 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 104 | out = w1xv 105 | for i in range(self.T_iterations - 1): 106 | ol = l 107 | lv = self.max_lv - 1 108 | while lv >= 0: 109 | with tf.name_scope('cell_' + str(lv)) as scope: 110 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 111 | if lv > 0: 112 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 113 | else: 114 | ol = node_linear 115 | lv -= 1 116 | 117 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 118 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 119 | 120 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 121 | name=name + "_y_potential_expand_dims") 122 | 123 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 124 | return graph_embedding 125 | 126 | def generateGraphClassificationNetwork(self): 127 | print("Features size:"+str(self.features_size)) 128 | 129 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 130 | trainable=self.trainable_embeddings, 131 | name="instruction_embedding", dtype=tf.float32) 132 | 133 | self.x_1 = tf.placeholder(tf.int32, [None, None, self.max_instructions],name="x_1") 134 | self.adj_1 = tf.placeholder(tf.float32, [None, None, None], name="adj_1") # 135 | self.lenghts_1 = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 136 | self.x_2 = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_2") 137 | self.adj_2 = tf.placeholder(tf.float32, [None, None, None], name="adj_2") 138 | self.lenghts_2 = tf.placeholder(tf.int32, [None,None], name='lenghts_2') 139 | self.y = tf.placeholder(tf.float32, [None], name='y_') 140 | 141 | # Euclidean norms; p = 2 142 | self.norms = [] 143 | 144 | l2_loss = tf.constant(0.0) 145 | 146 | # ------------------------------- 147 | # 1. MEAN FIELD COMPONENT 148 | # ------------------------------- 149 | 150 | # 1. parameters for MeanField 151 | with tf.name_scope('parameters_MeanField'): 152 | 153 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 154 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 155 | self.norms.append(tf.norm(self.W1)) 156 | 157 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. 
We refer to n as the embedding depth (self.max_lv) 158 | self.CONV_PARAMS = [] 159 | for lv in range(self.max_lv): 160 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 161 | name="CONV_PARAMS_" + str(lv)) 162 | self.CONV_PARAMS.append(v) 163 | self.norms.append(tf.norm(v)) 164 | 165 | # W2 is another [p,p] matrix to transform the embedding vector 166 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 167 | name="W2") 168 | self.norms.append(tf.norm(self.W2)) 169 | 170 | # LSTMExtraction 171 | with tf.name_scope('LSTMExtraction1'): 172 | with tf.variable_scope('lstm1'): 173 | self.x_1_after_lstm = self.lstmFeatures(self.x_1, self.lenghts_1) 174 | with tf.name_scope('LSTMExtraction2'): 175 | with tf.variable_scope('lstm2'): 176 | self.x2_after_lstm = self.lstmFeatures(self.x_2, self.lenghts_2) 177 | 178 | # Mean Field 179 | with tf.name_scope('MeanField1'): 180 | self.graph_embedding_1 = tf.nn.l2_normalize( 181 | tf.squeeze(self.meanField(self.x_1_after_lstm, self.adj_1, "MeanField1"), axis=1), axis=1, 182 | name="embedding1") 183 | 184 | with tf.name_scope('MeanField2'): 185 | self.graph_embedding_2 = tf.nn.l2_normalize( 186 | tf.squeeze(self.meanField(self.x2_after_lstm, self.adj_2, "MeanField2"), axis=1), axis=1, 187 | name="embedding2") 188 | 189 | with tf.name_scope('Siamese'): 190 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1, 191 | name="cosSimilarity") 192 | 193 | # Regularization 194 | with tf.name_scope("Regularization"): 195 | l2_loss += tf.nn.l2_loss(self.W1) 196 | for lv in range(self.max_lv): 197 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 198 | l2_loss += tf.nn.l2_loss(self.W2) 199 | 200 | # CalculateMean cross-entropy loss 201 | with tf.name_scope("Loss"): 202 | 203 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 204 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 205 | 206 | # Train step 207 | with tf.name_scope("Train_Step"): 208 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) -------------------------------------------------------------------------------- /binary_similarity/s2v_network_rnn.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | class NetworkLSTM: 10 | 11 | def __init__(self, 12 | features_size, 13 | embedding_size, 14 | max_lv, 15 | T_iterations, 16 | learning_rate, 17 | l2_reg_lambda, 18 | batch_size, 19 | max_instructions, 20 | max_nodes, 21 | rnn_depth, 22 | rnn_kind, 23 | embedding_matrix, 24 | trainable_embeddings 25 | ): 26 | print("Features size"+str(features_size)) 27 | self.features_size = features_size 28 | self.embedding_size = embedding_size 29 | self.max_lv = max_lv 30 | self.T_iterations = T_iterations 31 | self.learning_rate = learning_rate 32 | self.l2_reg_lambda = l2_reg_lambda 33 | self.RRN_HIDDEN = features_size 34 | self.batch_size = batch_size 35 | self.max_instructions = max_instructions 36 | self.max_nodes = max_nodes 37 | self.rnn_depth = rnn_depth 38 | self.rnn_kind=rnn_kind 39 | self.embedding_matrix = embedding_matrix 40 | self.trainable_embeddings = trainable_embeddings 41 | self.generateGraphClassificationNetwork() 42 | 43 | def 
extract_axis_1(self, data, ind): 44 | """ 45 | Get specified elements along the first axis of tensor. 46 | :param data: Tensorflow tensor that will be subsetted. 47 | :param ind: Indices to take (one for each element along axis 0 of data). 48 | :return: Subsetted tensor. 49 | """ 50 | ind=tf.nn.relu(ind-1) 51 | batch_range = tf.range(tf.shape(data)[0]) 52 | indices = tf.stack([batch_range, ind], axis=1) 53 | res = tf.gather_nd(data, indices) 54 | 55 | return res 56 | 57 | def lstmFeatures(self, input_x, lengths): 58 | 59 | flattened_inputs=tf.reshape(input_x,[-1,tf.shape(input_x)[2]],name="Flattening") 60 | 61 | flattened_lenghts = tf.reshape(lengths, [-1]) 62 | max = tf.reduce_max(flattened_lenghts) 63 | flattened_inputs=flattened_inputs[:,:max] 64 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 65 | 66 | zeros = tf.zeros(tf.shape(flattened_lenghts)[0], dtype=tf.int32) 67 | mask = tf.not_equal(flattened_lenghts, zeros) 68 | int_mask = tf.cast(mask, tf.int32) 69 | fake_output = tf.zeros([self.features_size], dtype=tf.float32) 70 | partitions = tf.dynamic_partition(flattened_embedded, int_mask, 2) 71 | real_nodes=partitions[1] 72 | real_lenghts=tf.boolean_mask(flattened_lenghts,mask) 73 | fake_zero = tf.tile([fake_output], [tf.shape(flattened_embedded)[0] - tf.shape(partitions[1])[0], 1]) 74 | 75 | if self.rnn_kind==0: 76 | rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in ([self.features_size] * self.rnn_depth)] 77 | else: 78 | rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in ([self.features_size] * self.rnn_depth)] 79 | cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) 80 | 81 | rnn_outputs, _ = tf.nn.dynamic_rnn(cell, real_nodes, sequence_length=real_lenghts, 82 | dtype=tf.float32, 83 | time_major=False, 84 | parallel_iterations=88) 85 | 86 | last_outputs = self.extract_axis_1(rnn_outputs, real_lenghts) 87 | 88 | condition_indices = tf.dynamic_partition( 89 | tf.range(tf.shape(flattened_embedded)[0]), int_mask, 2) 90 | last_outputs = tf.dynamic_stitch(condition_indices, [fake_zero, last_outputs]) 91 | 92 | gather_output2 = tf.reshape(last_outputs, 93 | [-1,tf.shape(input_x)[1],self.features_size], name="Deflattening") 94 | 95 | output = tf.identity(gather_output2, name="LSTMOutput") 96 | output=tf.nn.l2_normalize(output) 97 | return output 98 | 99 | def meanField(self, input_x, input_adj, name): 100 | 101 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 102 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 103 | 104 | CONV_PARAMS_tiled = [] 105 | for lv in range(self.max_lv): 106 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 107 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 108 | 109 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 110 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 111 | out = w1xv 112 | for i in range(self.T_iterations - 1): 113 | ol = l 114 | lv = self.max_lv - 1 115 | while lv >= 0: 116 | with tf.name_scope('cell_' + str(lv)) as scope: 117 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 118 | if lv > 0: 119 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 120 | else: 121 | ol = node_linear 122 | lv -= 1 123 | 124 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 125 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 
2)) 126 | 127 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 128 | name=name + "_y_potential_expand_dims") 129 | 130 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 131 | return graph_embedding 132 | 133 | def generateGraphClassificationNetwork(self): 134 | print("Features size:"+str(self.features_size)) 135 | 136 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 137 | trainable=self.trainable_embeddings, 138 | name="instruction_embedding", dtype=tf.float32) 139 | 140 | self.x_1 = tf.placeholder(tf.int32, [None, None, None], name="x_1") 141 | self.adj_1 = tf.placeholder(tf.float32, [None, None, None], name="adj_1") 142 | self.lenghts_1 = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 143 | self.x_2 = tf.placeholder(tf.int32, [None, None, None], name="x_2") 144 | self.adj_2 = tf.placeholder(tf.float32, [None, None, None], name="adj_2") 145 | self.lenghts_2 = tf.placeholder(tf.int32, [None,None], name='lenghts_2') 146 | self.y = tf.placeholder(tf.float32, [None], name='y_') 147 | 148 | self.norms = [] 149 | 150 | l2_loss = tf.constant(0.0) 151 | 152 | # 1. parameters for MeanField 153 | with tf.name_scope('parameters_MeanField'): 154 | 155 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 156 | self.norms.append(tf.norm(self.W1)) 157 | 158 | self.CONV_PARAMS = [] 159 | for lv in range(self.max_lv): 160 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 161 | name="CONV_PARAMS_" + str(lv)) 162 | self.CONV_PARAMS.append(v) 163 | self.norms.append(tf.norm(v)) 164 | 165 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 166 | name="W2") 167 | self.norms.append(tf.norm(self.W2)) 168 | 169 | with tf.name_scope('LSTMExtraction1'): 170 | with tf.variable_scope('lstm1'): 171 | self.x_1_after_lstm = self.lstmFeatures(self.x_1, self.lenghts_1) 172 | with tf.name_scope('LSTMExtraction2'): 173 | with tf.variable_scope('lstm2'): 174 | self.x2_after_lstm = self.lstmFeatures(self.x_2, self.lenghts_2) 175 | 176 | with tf.name_scope('MeanField1'): 177 | self.graph_embedding_1 = tf.nn.l2_normalize( 178 | tf.squeeze(self.meanField(self.x_1_after_lstm, self.adj_1, "MeanField1"), axis=1), axis=1, 179 | name="embedding1") 180 | 181 | with tf.name_scope('MeanField2'): 182 | self.graph_embedding_2 = tf.nn.l2_normalize( 183 | tf.squeeze(self.meanField(self.x2_after_lstm, self.adj_2, "MeanField2"), axis=1), axis=1, 184 | name="embedding2") 185 | 186 | with tf.name_scope('Siamese'): 187 | self.cos_similarity = tf.reduce_sum(tf.multiply(self.graph_embedding_1, self.graph_embedding_2), axis=1, 188 | name="cosSimilarity") 189 | 190 | # Regularization 191 | with tf.name_scope("Regularization"): 192 | l2_loss += tf.nn.l2_loss(self.W1) 193 | for lv in range(self.max_lv): 194 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 195 | l2_loss += tf.nn.l2_loss(self.W2) 196 | 197 | # CalculateMean cross-entropy loss 198 | with tf.name_scope("Loss"): 199 | 200 | self.loss = tf.reduce_sum(tf.squared_difference(self.cos_similarity, self.y), name="loss") 201 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss 202 | 203 | # Train step 204 | with tf.name_scope("Train_Step"): 205 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 206 | 
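For reference, a minimal usage sketch of the siamese network defined in this file, assuming TensorFlow 1.x (graph/session API, as used throughout the repository) and that the NetworkLSTM class above is in scope. Only the constructor arguments and the placeholder/attribute names come from the code; the toy sizes and the random batch stand in for a real disassembled-function batch and are assumptions made for illustration.

import numpy as np
import tensorflow as tf

# toy sizes: 1000-instruction vocabulary, 100-dim instruction embeddings,
# 10 CFG nodes per function, up to 50 instructions per node, 4 pairs per batch
vocab, feat, nodes, instr, batch = 1000, 100, 10, 50, 4
embedding_matrix = np.random.rand(vocab, feat).astype(np.float32)

net = NetworkLSTM(features_size=feat, embedding_size=64, max_lv=2, T_iterations=2,
                  learning_rate=0.001, l2_reg_lambda=0.0, batch_size=batch,
                  max_instructions=instr, max_nodes=nodes, rnn_depth=1, rnn_kind=1,
                  embedding_matrix=embedding_matrix, trainable_embeddings=False)

feed = {
    # x_*: padded instruction ids, adj_*: CFG adjacency, lenghts_*: instructions per node
    net.x_1: np.random.randint(0, vocab, (batch, nodes, instr)),
    net.adj_1: np.random.randint(0, 2, (batch, nodes, nodes)).astype(np.float32),
    net.lenghts_1: np.full((batch, nodes), instr, dtype=np.int32),
    net.x_2: np.random.randint(0, vocab, (batch, nodes, instr)),
    net.adj_2: np.random.randint(0, 2, (batch, nodes, nodes)).astype(np.float32),
    net.lenghts_2: np.full((batch, nodes), instr, dtype=np.int32),
    # +1 for pairs of similar functions, -1 for dissimilar pairs
    net.y: np.random.choice([-1.0, 1.0], batch).astype(np.float32),
}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss, sim = sess.run([net.train_step, net.loss, net.cos_similarity], feed_dict=feed)
    print(loss, sim.shape)  # scalar squared-error loss, (batch,) cosine similarities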
-------------------------------------------------------------------------------- /binary_similarity/train.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import numpy as np 8 | from binary_similarity.s2v_trainer import S2VTrainer 9 | from binary_similarity.parameters import Flags 10 | import pickle 11 | 12 | def run_test(): 13 | flags = Flags() 14 | flags.logger.info("\n{}\n".format(flags)) 15 | 16 | print(str(flags)) 17 | 18 | file_embedding_matrix = flags.file_embedding_matrix 19 | 20 | embedding_matrix = np.float32(np.load(file_embedding_matrix)) 21 | if flags.random_embedding: 22 | embedding_matrix = np.random.rand(*np.shape(embedding_matrix)).astype(np.float32) 23 | embedding_matrix[0, :] = np.zeros(np.shape(embedding_matrix)[1]).astype(np.float32) 24 | 25 | if flags.cross_val: 26 | print("STARTING CROSS VALIDATION") 27 | res = [] 28 | mean = 0 29 | for i in range(0, flags.cross_val_fold): 30 | print("CROSS VALIDATION STARTING FOLD: " + str(i)) 31 | if i > 0: 32 | flags.close_log() 33 | flags.reset_logdir() 34 | del flags 35 | flags = Flags() 36 | flags.logger.info("\n{}\n".format(flags)) 37 | 38 | flags.logger.info("Starting cross validation fold: {}".format(i)) 39 | 40 | flags.db_name = flags.db_name + "_val_" + str(i+1) + ".db" 41 | flags.logger.info("Cross validation db name: {}".format(flags.db_name)) 42 | 43 | trainer = S2VTrainer(flags, embedding_matrix) 44 | best_val_auc = trainer.train() 45 | 46 | mean += best_val_auc 47 | res.append(best_val_auc) 48 | 49 | flags.logger.info("Cross validation fold {} finished best auc: {}".format(i, best_val_auc)) 50 | print("FINISH FOLD: " + str(i) + " BEST VAL AUC: " + str(best_val_auc)) 51 | 52 | print("CROSS VALIDATION ENDED") 53 | print("Result: " + str(res)) 54 | print("") 55 | 56 | flags.logger.info("Cross validation finished results: {}".format(res)) 57 | flags.logger.info(" mean: {}".format(mean / flags.cross_val_fold)) 58 | flags.close_log() 59 | 60 | flags.close_log() 61 | 62 | else: 63 | trainer = S2VTrainer(flags, embedding_matrix) 64 | trainer.train() 65 | flags.close_log() 66 | 67 | 68 | if __name__ == '__main__': 69 | run_test() 70 | -------------------------------------------------------------------------------- /binary_similarity/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Type of the network to use 4 | 5 | NETWORK_TYPE="Attention_Mean" 6 | #NETWORK_TYPE="Arith_Mean" 7 | #NETWORK_TYPE="RNN" 8 | #NETWORK_TYPE="Annotations" 9 | 10 | # Root path for the experiment 11 | MODEL_PATH=experiments/ 12 | 13 | # Path to the sqlite db with diassembled functions 14 | DB_PATH=../data/OpenSSL_dataset.db 15 | 16 | # Path to embedding matrix 17 | EMBEDDING_MATRIX=../data/i2v/embedding_matrix.npy 18 | 19 | # Path to instruction2id dictionary 20 | INS2ID=../data/i2v/word2id.json 21 | 22 | # Add this argument to train.py to use random instructions embeddings 23 | RANDOM_EMBEDDINGS="-r" 24 | 25 | # Add this argument to train.py to use trainable instructions embeddings 26 | TRAINABLE_EMBEDDINGS="-te" 27 | 28 | python3 train.py --o $MODEL_PATH -n $DB_PATH -nn $NETWORK_TYPE -e $EMBEDDING_MATRIX -j $INS2ID 29 | -------------------------------------------------------------------------------- /binary_similarity/utils.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | # SAFE TEAM 4 | # 5 | # 6 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 7 | # 8 | 9 | 10 | def __padAndFilter(input_pairs, input_labels, max_num_vertices): 11 | 12 | output_pairs = [] 13 | output_labels = [] 14 | output_len = [] 15 | 16 | for pair, label in zip(input_pairs, input_labels): 17 | g1 = pair[0] 18 | g2 = pair[1] 19 | 20 | # graph 1 21 | adj1 = g1[0] 22 | nodes1 = g1[1] 23 | 24 | # graph 2 25 | adj2 = g2[0] 26 | nodes2 = g2[1] 27 | 28 | if (len(nodes1) <= max_num_vertices) and (len(nodes2) <= max_num_vertices): 29 | pad_lenght1 = max_num_vertices - len(nodes1) 30 | new_node1 = np.pad(nodes1, [(0, pad_lenght1), (0, 0)], mode='constant') 31 | pad_lenght1 = max_num_vertices - adj1.shape[0] 32 | adj1_dense = np.pad(adj1.todense(), [(0, pad_lenght1), (0, pad_lenght1)], mode='constant') 33 | g1 = (adj1_dense, new_node1) 34 | pad_lenght2 = max_num_vertices - len(nodes2) 35 | new_node2 = np.pad(nodes2, [(0, pad_lenght2), (0, 0)], mode='constant') 36 | pad_lenght2 = max_num_vertices - adj2.shape[0] 37 | adj2_dense = np.pad(adj2.todense(), [(0, pad_lenght2), (0, pad_lenght2)], mode='constant') 38 | g2 = (adj2_dense, new_node2) 39 | output_pairs.append([g1, g2]) 40 | output_labels.append(label) 41 | output_len.append([8,8]) 42 | 43 | return output_pairs, output_labels, output_len 44 | 45 | def __padAndFilterLSTM(input_pairs, input_labels, input_len, max_num_vertices): 46 | 47 | 48 | output_pairs = [] 49 | output_labels = [] 50 | output_len=[] 51 | 52 | for pair, label, lens in zip(input_pairs, input_labels, input_len): 53 | 54 | try: 55 | 56 | g1 = pair[0] 57 | g2 = pair[1] 58 | 59 | # graph 1 60 | adj1 = g1[0] 61 | nodes1 = g1[1] 62 | 63 | # graph 2 64 | adj2 = g2[0] 65 | nodes2 = g2[1] 66 | if (len(nodes1) <= max_num_vertices) and (len(nodes2) <= max_num_vertices): 67 | 68 | pad_lenght1 = max_num_vertices - len(nodes1) 69 | new_node1 = np.pad(nodes1, [(0, pad_lenght1), (0, 0)], mode='constant') 70 | 71 | pad_lenght1 = max_num_vertices - adj1.shape[0] 72 | adj1_dense = np.pad(adj1.todense(), [(0, pad_lenght1), (0, pad_lenght1)], mode='constant') 73 | g1 = (adj1_dense, new_node1) 74 | 75 | pad_lenght2 = max_num_vertices - len(nodes2) 76 | new_node2 = np.pad(nodes2, [(0, pad_lenght2), (0, 0)], mode='constant') 77 | pad_lenght2 = max_num_vertices - adj2.shape[0] 78 | adj2_dense = np.pad(adj2.todense(), [(0, pad_lenght2), (0, pad_lenght2)], mode='constant') 79 | g2 = (adj2_dense, new_node2) 80 | 81 | output_pairs.append([g1, g2]) 82 | output_labels.append(label) 83 | new_lens_0 = lens[0]+[0]*(max_num_vertices-len(lens[0])) 84 | new_lens_1 = lens[1]+[0]*(max_num_vertices-len(lens[1])) 85 | output_len.append([new_lens_0, new_lens_1]) 86 | 87 | except: 88 | pass 89 | 90 | return output_pairs, output_labels, output_len 91 | -------------------------------------------------------------------------------- /compiler_provenance/FunctionFactory.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | from compiler_provenance.utils import __padAndFilter as padAndFilter 8 | from asm_embedding.InstructionsConverter import InstructionsConverter 9 | from asm_embedding.FunctionNormalizer import 
FunctionNormalizer 10 | import json 11 | from multiprocessing import Queue 12 | from multiprocessing import Process 13 | import networkx as nx 14 | from networkx.readwrite import json_graph 15 | import numpy as np 16 | import random 17 | from scipy import sparse 18 | import sqlite3 19 | 20 | 21 | 22 | class DatasetGenerator: 23 | 24 | def get_dataset(self, epoch_number): 25 | pass 26 | 27 | 28 | class PairFactory(DatasetGenerator): 29 | 30 | def __init__(self, db_name, feature_type, dataset_type, json_asm2id, max_instructions, max_num_vertices, encoder, batch_size, flags=None): 31 | self.db_name = db_name 32 | self.feature_type = feature_type 33 | self.dataset_type = dataset_type 34 | self.encoder = encoder 35 | self.max_instructions = max_instructions 36 | self.max_num_vertices = max_num_vertices 37 | self.batch_dim = 0 38 | self.num_pairs = 0 39 | self.num_batches = 0 40 | self.flags=flags 41 | 42 | self.converter = InstructionsConverter(json_asm2id) 43 | self.normalizer = FunctionNormalizer(self.max_instructions) 44 | 45 | conn = sqlite3.connect(self.db_name) 46 | cur = conn.cursor() 47 | q = cur.execute("SELECT count(*) from " + self.dataset_type) 48 | count=int(q.fetchone()[0]) 49 | n_chunk = int(count / batch_size) - 1 50 | 51 | self.num_batches = n_chunk 52 | conn.close() 53 | 54 | def remove_bad_acfg_node(self, g): 55 | nodeToRemove = [] 56 | for n in g.nodes(data=True): 57 | f = n[1]['features'] 58 | if len(f.keys()) == 0: 59 | nodeToRemove.append(n[0]) 60 | for n in nodeToRemove: 61 | g.remove_node(n) 62 | return g 63 | 64 | def split(self,a, n): 65 | return [a[i::n] for i in range(n)] 66 | 67 | def get_node_matrix(self, nodes): 68 | 69 | if isinstance(nodes, int): 70 | print(nodes) 71 | 72 | num_node = len(nodes) 73 | node_matrix = np.zeros([num_node, 8]) 74 | for i, n in enumerate(nodes): 75 | f = n[1]['features'] 76 | 77 | if isinstance(f['constant'], int): 78 | node_matrix[i, 0] = f['constant'] 79 | else: 80 | node_matrix[i, 0] = len(f['constant']) 81 | 82 | if isinstance(f['string'], int): 83 | node_matrix[i, 1] = f['string'] 84 | else: 85 | node_matrix[i, 1] = len(f['string']) 86 | 87 | node_matrix[i, 2] = f['transfer'] 88 | node_matrix[i, 3] = f['call'] 89 | node_matrix[i, 4] = f['instruction'] 90 | node_matrix[i, 5] = f['arith'] 91 | node_matrix[i, 6] = f['offspring'] 92 | node_matrix[i, 7] = f['betweenness'] 93 | return node_matrix 94 | 95 | def get_data_from_acfg(self, g): 96 | g = self.remove_bad_acfg_node(g) 97 | if len(g.nodes) > 0: 98 | adj = nx.adjacency_matrix(g) 99 | node_matrix = self.get_node_matrix(g.nodes(data=True)) 100 | else: 101 | adj = sparse.bsr_matrix(np.zeros([1, 1])) 102 | node_matrix = np.zeros([1, 8]) 103 | lenghts = [8] * len(node_matrix) 104 | return adj, node_matrix, lenghts 105 | 106 | def get_data_from_cfg(self, cfg): 107 | adj = sparse.csr_matrix([1, 1]) 108 | lenghts = [] 109 | node_matrix = [] 110 | 111 | try: 112 | adj = nx.adjacency_matrix(cfg) 113 | nodes = cfg.nodes(data=True) 114 | for i, n in enumerate(nodes): 115 | filtered = self.converter.convert_to_ids(n[1]['features']) 116 | lenghts.append(len(filtered)) 117 | node_matrix.append(self.normalizer.normalize(filtered)[0]) 118 | except: 119 | pass 120 | return adj, node_matrix, lenghts 121 | 122 | def async_chunker(self, epoch, batch_size, shuffle=True): 123 | self.num_pairs = 0 124 | 125 | conn = sqlite3.connect(self.db_name) 126 | cur = conn.cursor() 127 | q = cur.execute("SELECT id from " + self.dataset_type) 128 | ids = q.fetchall() 129 | ids = [ii[0] for ii in ids] 130 | 131 | 
data_len = len(ids) 132 | 133 | n_chunk = int(data_len / batch_size) - 1 134 | random.seed(17) 135 | self.num_batches = n_chunk 136 | lista_chunk=range(0,n_chunk) 137 | coda = Queue(maxsize=50) 138 | n_proc = 10 139 | listone = self.split(lista_chunk, n_proc) 140 | for i in range(0,n_proc): 141 | l = list(listone[i]) 142 | p = Process(target=self.async_create_pair,args=((epoch, l, batch_size, coda, shuffle, self.encoder))) 143 | p.start() 144 | 145 | while coda.empty(): 146 | pass 147 | for i in range(0, n_chunk): 148 | yield self.async_get_dataset(i, n_chunk, batch_size, coda, shuffle) 149 | 150 | def get_pair_from_db(self, epoch_number, chunk, number_of_functions, label_encoder): 151 | 152 | conn = sqlite3.connect(self.db_name) 153 | cur = conn.cursor() 154 | 155 | functions = [] 156 | labels = [] 157 | lenghts = [] 158 | 159 | q = cur.execute("SELECT id FROM " + self.dataset_type) 160 | ids = q.fetchall() 161 | rng = random.Random(epoch_number) 162 | rng.shuffle(ids) 163 | data_len = len(ids) 164 | i = 0 165 | 166 | while i < number_of_functions: 167 | if chunk * int(number_of_functions) + i > data_len: 168 | break 169 | 170 | ii = ids[chunk * int(number_of_functions) + i] 171 | q = cur.execute("SELECT " + self.feature_type + " FROM " + self.feature_type + " WHERE id=?", ii) 172 | 173 | if self.feature_type == 'acfg': 174 | adj, node, lenghts0 = self.get_data_from_acfg(json_graph.adjacency_graph(json.loads(q.fetchone()[0]))) 175 | elif self.feature_type == 'lstm_cfg': 176 | adj, node, lenghts0 = self.get_data_from_cfg(json_graph.adjacency_graph(json.loads(q.fetchone()[0]))) 177 | 178 | functions.append([(adj, node)]) 179 | lenghts.append(lenghts0) 180 | 181 | if self.flags is None or self.flags.class_kind == "CMP" or self.flags.class_kind == "FML": 182 | query_str = "SELECT compiler FROM functions WHERE id=?" 183 | elif self.flags.class_kind == "CMPOPT": 184 | query_str = "SELECT compiler,optimization FROM functions WHERE id=?" 185 | elif self.flags.class_kind == "OPT": 186 | query_str = "SELECT optimization FROM functions WHERE id=?" 
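            # Depending on flags.class_kind, the label for this function is the full
            # compiler string (CMP), the compiler family only, i.e. the part before the
            # first '-' (FML), the concatenation "compiler-optimization" (CMPOPT), or the
            # optimization level alone (OPT); label_encoder then maps it to a class id.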
187 | 188 | q = cur.execute(query_str, ii) 189 | q_compiler = q.fetchone() 190 | 191 | if self.flags.class_kind == "CMPOPT": 192 | compiler = q_compiler[0] + '-' + q_compiler[1] 193 | elif self.flags.class_kind == "FML": 194 | compiler = str(q_compiler[0]).split('-')[0] 195 | else: 196 | compiler = q_compiler[0] 197 | 198 | encoded = label_encoder.transform([compiler]) 199 | labels.append(encoded) 200 | i += 1 201 | 202 | if self.feature_type == 'acfg': 203 | pairs, labels, output_len = padAndFilter(functions, labels, [[[1]]]*len(functions), self.max_num_vertices) 204 | output_len = [[1]] 205 | 206 | elif self.feature_type == 'lstm_cfg': 207 | pairs, labels, output_len = padAndFilter(functions, labels, lenghts, self.max_num_vertices) 208 | 209 | return pairs, labels, output_len 210 | 211 | def async_create_pair(self, epoch, n_chunk, number_of_functions, q, shuffle, encoder): 212 | 213 | for i in n_chunk: 214 | pairs, y, lenghts = self.get_pair_from_db(epoch, i, number_of_functions, encoder) 215 | assert (len(pairs) == len(y)) 216 | n_samples=len(pairs) 217 | len1 = [] 218 | for l in lenghts: 219 | len1.append(l[0]) 220 | adj1 = [] 221 | nodes1 = [] 222 | for p in pairs: 223 | adj1.append(p[0][0]) 224 | nodes1.append(p[0][1]) 225 | y_ = [] 226 | for yy in y: 227 | y_.append(yy[0]) 228 | 229 | for i in range(0, n_samples, number_of_functions): 230 | upper_bound = min(i + number_of_functions, n_samples) 231 | 232 | ret_adj = adj1[i:upper_bound] 233 | ret_nodes = nodes1[i:upper_bound] 234 | ret_len = len1[i:upper_bound] 235 | ret_y = y_[i:upper_bound] 236 | 237 | q.put((ret_adj,ret_nodes,ret_y,ret_len), block=True) 238 | 239 | def async_get_dataset(self, chunk, n_chunk, number_of_pairs, q, shuffle): 240 | item = q.get() 241 | n_samples = len(item[0]) 242 | self.batch_dim = n_samples 243 | self.num_pairs += n_samples 244 | return item[0], item[1], item[2], item[3] 245 | 246 | -------------------------------------------------------------------------------- /compiler_provenance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucamassarelli/Unsupervised-Features-Learning-For-Binary-Similarity/fe37c4aa0d1ac8d14488e096a5f6deb7aea929fe/compiler_provenance/__init__.py -------------------------------------------------------------------------------- /compiler_provenance/parameters.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import argparse 8 | import time 9 | import sys, os 10 | import logging 11 | 12 | def getLogger(logfile): 13 | logger = logging.getLogger(__name__) 14 | hdlr = logging.FileHandler(logfile) 15 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 16 | hdlr.setFormatter(formatter) 17 | logger.addHandler(hdlr) 18 | logger.setLevel(logging.INFO) 19 | return logger, hdlr 20 | 21 | class Flags: 22 | 23 | def __init__(self): 24 | parser = argparse.ArgumentParser(description=' cryptoarb.') 25 | 26 | parser.add_argument("-o", "--output", dest="output_file", help="output directory for logging and models", required=False) 27 | parser.add_argument("-e", "--embedding_matrix", dest="embedding_matrix", help="file with the embedding matrix for the instructions",required=False) 28 | parser.add_argument("-j", "--json_asm2id", dest="json_asm2id",help="file with the dictionary of instructions ids", 
required=False) 29 | parser.add_argument("-n", "--dbName", dest="db_name", help="Name of the database", required=False) 30 | parser.add_argument("-ld","--load_dir", dest="load_dir", help="Load the model from directory load_dir", required=False) 31 | parser.add_argument("-nn","--network_type", help="network type: Arith_Mean, Weighted_Mean, RNN, CCS", required=True, dest="network_type") 32 | parser.add_argument("-r", "--random", help="if present the network use random embedder", default=False, action="store_true", dest="random_embedding", required=False) 33 | parser.add_argument("-te","--trainable_embedding", help="if present the network consider the embedding as trainable", action="store_true", dest="trainable_embeddings", default=False) 34 | parser.add_argument("-cv","--cross_val", help="if present the training is done with cross validiation", default=False, action="store_true", dest="cross_val") 35 | parser.add_argument("-cl", "--classification_kind", help="classification kind: Compiler, Compiler+Opt, Opt",default="Compiler", required=False, dest="classification_kind") 36 | 37 | args = parser.parse_args() 38 | self.network_type = args.network_type 39 | 40 | if self.network_type == "Annotations": 41 | self.feature_type = 'acfg' 42 | elif self.network_type in ["Arith_Mean", "Attention_Mean", "RNN"]: 43 | self.feature_type = 'lstm_cfg' 44 | else: 45 | print("ERROR NETWORK NOT FOUND") 46 | exit(0) 47 | 48 | if args.classification_kind == "Family": 49 | self.class_kind="FML" 50 | elif args.classification_kind == "Compiler": 51 | self.class_kind="CMP" 52 | elif args.classification_kind == "Compiler+Opt": 53 | self.class_kind="CMPOPT" 54 | elif args.classification_kind == "Opt": 55 | self.class_kind = "OPT" 56 | else: 57 | print("Classification option unkown") 58 | exit(0) 59 | 60 | # mode = mean_field 61 | self.batch_size = 250 # minibatch size (-1 = whole dataset) 62 | self.num_epochs = 50 # number of epochs 63 | self.embedding_size = 64 # dimension of latent layers 64 | self.learning_rate = 0.001 # init learning_rate 65 | self.max_lv = 2 # embedd depth 66 | self.T_iterations= 2 # max rounds of message passing 67 | self.l2_reg_lambda = 0 # 0.002 #0.002 # regularization coefficient 68 | self.num_checkpoints = 1 # max number of checkpoints 69 | self.out_dir = args.output_file # directory for logging 70 | self.db_name = args.db_name 71 | self.load_dir=str(args.load_dir) 72 | self.random_embedding = args.random_embedding 73 | self.trainable_embeddings = args.trainable_embeddings 74 | self.cross_val = args.cross_val 75 | self.cross_val_fold = 5 76 | self.dense_layer_size = 3000 77 | self.rnn_depth = 1 # depth of the rnn 78 | self.max_instructions = 50 # number of instructions 79 | self.rnn_kind = 1 #kind of rnn cell 0: lstm cell 1: GRU cell 80 | 81 | 82 | self.seed = 2 # random seed 83 | 84 | # create logdir and logger 85 | self.reset_logdir() 86 | 87 | self.file_embedding_matrix = args.embedding_matrix 88 | self.json_asm2id = args.json_asm2id 89 | 90 | self.MAX_NUM_VERTICES = 150 91 | self.MIN_NUM_VERTICES = 1 92 | 93 | def reset_logdir(self): 94 | # create logdir 95 | timestamp = str(int(time.time())) 96 | self.logdir = os.path.abspath(os.path.join(self.out_dir, "runs", timestamp)) 97 | os.makedirs(self.logdir, exist_ok=True) 98 | 99 | # create logger 100 | self.log_file = str(self.logdir)+'/console.log' 101 | self.logger, self.hdlr = getLogger(self.log_file) 102 | 103 | # create symlink for last_run 104 | sym_path_logdir = str(self.out_dir)+"/last_run" 105 | try: 106 | os.unlink(sym_path_logdir) 
107 | except: 108 | pass 109 | try: 110 | os.symlink(self.logdir, sym_path_logdir) 111 | except: 112 | print("\nfailed to create symlink!\n") 113 | 114 | def close_log(self): 115 | self.hdlr.close() 116 | self.logger.removeHandler(self.hdlr) 117 | handlers = self.logger.handlers[:] 118 | for handler in handlers: 119 | handler.close() 120 | self.logger.removeHandler(handler) 121 | 122 | def __str__(self): 123 | msg = "" 124 | msg +="\n Parameters:\n" 125 | msg +="\tNetwork_Type: {}\n".format(self.network_type) 126 | msg +="\tRandom embedding: {}\n".format(self.random_embedding) 127 | msg +="\tTrainable embedding: {}\n".format(self.trainable_embeddings) 128 | msg +="\tFeature Type: {}\n".format(self.feature_type) 129 | msg +="\tlogdir: {}\n".format(self.logdir) 130 | msg +="\tbatch_size: {}\n".format(self.batch_size) 131 | msg +="\tnum_epochs: {}\n".format(self.num_epochs) 132 | msg +="\tembedding_size: {}\n".format(self.embedding_size) 133 | msg +="\tlearning_rate: {}\n".format(self.learning_rate) 134 | msg +="\tmax_lv: {}\n".format(self.max_lv) 135 | msg +="\tT_iterations: {}\n".format(self.T_iterations) 136 | msg +="\tl2_reg_lambda: {}\n".format(self.l2_reg_lambda) 137 | msg +="\tnum_checkpoints: {}\n".format(self.num_checkpoints) 138 | msg +="\tseed: {}\n".format(self.seed) 139 | msg +="\tMAX_NUM_VERTICES: {}\n".format(self.MAX_NUM_VERTICES) 140 | msg +="\tMax Instructions per cfg node: {}\n".format(self.max_instructions) 141 | msg +="\tDense Layer Size: {}\n".format(self.dense_layer_size) 142 | msg += "\tClasses kind: {}\n".format(self.class_kind) 143 | if self.network_type == "RNN": 144 | msg += "\tRNN type (0, lstm; 1, GRU): {}\n".format(self.rnn_kind) 145 | msg += "\tRNN Depth: {}\n".format(self.rnn_depth) 146 | if self.network_type == "Attention": 147 | msg += "\tRNN type (0, lstm; 1, GRU): {}\n".format(self.rnn_kind) 148 | msg += "\tRNN Depth: {}\n".format(self.rnn_depth) 149 | msg += "\tAttention hops: {}\n".format(self.attention_hops) 150 | msg += "\tAttention depth: {}\n".format(self.attention_detph) 151 | return msg 152 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_annotations.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | class Network: 10 | 11 | def __init__(self, 12 | features_size, 13 | embedding_size, 14 | max_lv, 15 | T_iterations, 16 | learning_rate, 17 | l2_reg_lambda, 18 | dense_layer_size, 19 | num_classes 20 | ): 21 | self.features_size = features_size 22 | self.embedding_size = embedding_size 23 | self.max_lv = max_lv 24 | self.T_iterations = T_iterations 25 | self.learning_rate=learning_rate 26 | self.l2_reg_lambda = l2_reg_lambda 27 | self.dense_layer_size = dense_layer_size 28 | self.number_of_classes = num_classes 29 | self.generateGraphClassificationNetwork() 30 | 31 | 32 | def meanField(self, input_x, input_adj, name): 33 | 34 | W1_tiled = tf.tile(tf.expand_dims(self.W1,0), [tf.shape(input_x)[0],1,1], name=name + "_W1_tiled") 35 | W2_tiled = tf.tile(tf.expand_dims(self.W2,0), [tf.shape(input_x)[0],1,1], name=name + "_W2_tiled") 36 | 37 | CONV_PARAMS_tiled = [] 38 | for lv in range(self.max_lv): 39 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv],0), [tf.shape(input_x)[0],1,1], name=name + "_CONV_PARAMS_tiled_" + str(lv))) 40 | 41 
| w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 42 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 43 | out=w1xv 44 | for i in range(self.T_iterations-1): 45 | ol = l 46 | lv = self.max_lv - 1 47 | while lv >= 0 : 48 | with tf.name_scope('cell_' + str(lv)) as scope: 49 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 50 | if lv > 0: 51 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 52 | else: 53 | ol = node_linear 54 | lv -= 1 55 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 56 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 57 | 58 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, name=name + "_y_potential_expand_dims") 59 | 60 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 61 | return graph_embedding 62 | 63 | 64 | def generateGraphClassificationNetwork(self): 65 | 66 | self.x = tf.placeholder(tf.float32,[None, None,self.features_size], name = "x_1") # Vettore del nodo in input 1 67 | self.adj = tf.placeholder(tf.float32,[None, None, None],name="adj_1") # Matrice di adiacenza 1 68 | self.y = tf.placeholder(tf.int32, [None], name='y_') 69 | 70 | self.lenghts = tf.placeholder(tf.float32, [None], name="len1") 71 | 72 | self.norms = [] 73 | l2_loss = tf.constant(0.0) 74 | 75 | # ------------------------------- 76 | # 1. MEAN FIELD COMPONENT 77 | # ------------------------------- 78 | 79 | #1. parameters for MeanField 80 | with tf.name_scope('parameters_MeanField'): 81 | 82 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 83 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size,self.embedding_size], stddev=0.1), name="W1") 84 | self.norms.append(tf.norm(self.W1)) 85 | 86 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. 
We refer to n as the embedding depth (self.max_lv) 87 | self.CONV_PARAMS = [] 88 | for lv in range(self.max_lv): 89 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="CONV_PARAMS_"+str(lv)) 90 | self.CONV_PARAMS.append(v) 91 | self.norms.append(tf.norm(v)) 92 | 93 | # W2 is another [p,p] matrix to transform the embedding vector 94 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="W2") 95 | self.norms.append(tf.norm(self.W2)) 96 | 97 | # Mean Field 98 | with tf.name_scope('MeanField1'): 99 | self.graph_embedding = tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x,self.adj,"MeanField1"), axis=1), axis=1,name="embedding1") 100 | 101 | with tf.name_scope('Hidden_Layer'): 102 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 103 | 104 | with tf.name_scope('Output_Layer'): 105 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 106 | 107 | with tf.name_scope('Prediction'): 108 | self.pred_classes = tf.argmax(self.logits, axis=1) 109 | self.pred_probab = tf.nn.softmax(self.logits) 110 | 111 | # Regularization 112 | with tf.name_scope("Regularization"): 113 | l2_loss += tf.nn.l2_loss(self.W1) 114 | for lv in range(self.max_lv): 115 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 116 | l2_loss += tf.nn.l2_loss(self.W2) 117 | 118 | # CalculateMean cross-entropy loss 119 | with tf.name_scope("Loss"): 120 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 121 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 122 | 123 | # Train step 124 | with tf.name_scope("Train_Step"): 125 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 126 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_arith_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | class NetworkLSTM: 10 | 11 | def __init__(self, 12 | features_size, 13 | embedding_size, 14 | max_lv, 15 | T_iterations, 16 | learning_rate, 17 | l2_reg_lambda, 18 | batch_size, 19 | max_instructions, 20 | max_nodes, 21 | rnn_depth, 22 | rnn_kind, 23 | embedding_matrix, 24 | trainable_embeddings, 25 | dense_layer_size, 26 | num_classes 27 | ): 28 | print("Features size"+str(features_size)) 29 | self.features_size = features_size 30 | self.embedding_size = embedding_size 31 | self.max_lv = max_lv 32 | self.T_iterations = T_iterations 33 | self.learning_rate = learning_rate 34 | self.l2_reg_lambda = l2_reg_lambda 35 | self.RRN_HIDDEN = features_size 36 | self.batch_size = batch_size 37 | self.max_instructions = max_instructions 38 | self.max_nodes = max_nodes 39 | self.rnn_depth = rnn_depth 40 | self.rnn_kind=rnn_kind 41 | self.embedding_matrix = embedding_matrix 42 | self.trainable_embeddings = trainable_embeddings 43 | self.dense_layer_size = dense_layer_size 44 | self.number_of_classes = num_classes 45 | self.generateGraphClassificationNetwork() 46 | 47 | def extract_axis_1(self, data, ind): 48 | """ 49 | Get specified elements along the first axis of tensor. 50 | :param data: Tensorflow tensor that will be subsetted. 
51 | :param ind: Indices to take (one for each element along axis 0 of data). 52 | :return: Subsetted tensor. 53 | """ 54 | ind=tf.nn.relu(ind-1) 55 | batch_range = tf.range(tf.shape(data)[0]) 56 | indices = tf.stack([batch_range, ind], axis=1) 57 | res = tf.gather_nd(data, indices) 58 | 59 | return res 60 | 61 | def create_flattening_array(self, max_nodes, batch_size): 62 | shape_array = [] 63 | for p in range(0, batch_size): 64 | for i in range(0, max_nodes): 65 | shape_array.append([p, i]) 66 | return shape_array 67 | 68 | def create_gather_array(self, max_nodes, batch_size): 69 | shape_array = [] 70 | for p in range(0, batch_size): 71 | x = [] 72 | for i in range(0, max_nodes): 73 | x.append([0, i + p * max_nodes]) 74 | shape_array.append(x) 75 | return shape_array 76 | 77 | def lstmFeatures(self, input_x, lengths): 78 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 79 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 80 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.reduce_mean(flattened_embedded, name='features_arith_mean', axis=1), axis=1)) 81 | gather_output2 = tf.reshape(last_outputs, 82 | [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 83 | output = tf.identity(gather_output2, name="LSTMOutput") 84 | output=tf.nn.l2_normalize(output) 85 | return output 86 | 87 | def meanField(self, input_x, input_adj, name): 88 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 89 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 90 | CONV_PARAMS_tiled = [] 91 | for lv in range(self.max_lv): 92 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 93 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 94 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 95 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 96 | out = w1xv 97 | for i in range(self.T_iterations - 1): 98 | ol = l 99 | lv = self.max_lv - 1 100 | while lv >= 0: 101 | with tf.name_scope('cell_' + str(lv)) as scope: 102 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 103 | if lv > 0: 104 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 105 | else: 106 | ol = node_linear 107 | lv -= 1 108 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 109 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 110 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 111 | name=name + "_y_potential_expand_dims") 112 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 113 | return graph_embedding 114 | 115 | def generateGraphClassificationNetwork(self): 116 | print("Features size:"+str(self.features_size)) 117 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 118 | trainable=self.trainable_embeddings, 119 | name="instruction_embedding", dtype=tf.float32) 120 | self.x = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_1") # Vettore del nodo in input 121 | self.adj = tf.placeholder(tf.float32, [None, None, None], name="adj_1") # Matrice di adiacenza 122 | self.lenghts = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 123 | self.y = tf.placeholder(tf.int32, [None], name='y_') 124 | # Euclidean norms; p = 2 125 | self.norms = [] 126 | 
# Keeping track of l2 regularization loss (optional) 127 | l2_loss = tf.constant(0.0) 128 | 129 | # ------------------------------- 130 | # 1. MEAN FIELD COMPONENT 131 | # ------------------------------- 132 | 133 | # 1. parameters for MeanField 134 | with tf.name_scope('parameters_MeanField'): 135 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 136 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 137 | self.norms.append(tf.norm(self.W1)) 138 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 139 | self.CONV_PARAMS = [] 140 | for lv in range(self.max_lv): 141 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 142 | name="CONV_PARAMS_" + str(lv)) 143 | self.CONV_PARAMS.append(v) 144 | self.norms.append(tf.norm(v)) 145 | # W2 is another [p,p] matrix to transform the embedding vector 146 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 147 | name="W2") 148 | self.norms.append(tf.norm(self.W2)) 149 | 150 | # LSTMExtraction 151 | with tf.name_scope('BlockFeatureExtraction'): 152 | with tf.variable_scope('arithmetic_mean'): 153 | self.x_after_lstm = self.lstmFeatures(self.x, self.lenghts) 154 | 155 | # Mean Field 156 | with tf.name_scope('MeanField'): 157 | self.graph_embedding = tf.nn.l2_normalize( 158 | tf.squeeze(self.meanField(self.x_after_lstm, self.adj, "MeanField"), axis=1), axis=1, 159 | name="embedding1") # tf.nn.l2_normalize(tf.squeeze(self.meanField(self.x_1,self.adj_1), axis=1), 1) 160 | 161 | with tf.name_scope('Hidden_Layer'): 162 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 163 | 164 | with tf.name_scope('Output_Layer'): 165 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 166 | 167 | with tf.name_scope('Prediction'): 168 | self.pred_classes = tf.argmax(self.logits, axis=1) 169 | self.pred_probab = tf.nn.softmax(self.logits) 170 | 171 | # Regularization 172 | with tf.name_scope("Regularization"): 173 | l2_loss += tf.nn.l2_loss(self.W1) 174 | for lv in range(self.max_lv): 175 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 176 | l2_loss += tf.nn.l2_loss(self.W2) 177 | 178 | # CalculateMean cross-entropy loss 179 | with tf.name_scope("Loss"): 180 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 181 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 182 | 183 | # Train step 184 | with tf.name_scope("Train_Step"): 185 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 186 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_attention_mean.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | 10 | # structure2vec 11 | # DE-MF : discriminative embedding using Mean Field 12 | 13 | 14 | class Network: 15 | 16 | def __init__(self, 17 | features_size, 18 | embedding_size, 19 | max_lv, 20 | T_iterations, 21 | learning_rate, 22 | l2_reg_lambda, 23 | batch_size, 24 | max_instructions, 25 | max_nodes, 26 | rnn_depth, 27 | rnn_kind, 28 | embedding_matrix, 29 | 
trainable_embeddings, 30 | dense_layer_size, 31 | num_classes 32 | ): 33 | print("Features size"+str(features_size)) 34 | self.features_size = features_size 35 | self.embedding_size = embedding_size 36 | self.max_lv = max_lv 37 | self.T_iterations = T_iterations 38 | self.learning_rate = learning_rate 39 | self.l2_reg_lambda = l2_reg_lambda 40 | self.RRN_HIDDEN = features_size 41 | self.batch_size = batch_size 42 | self.max_instructions = max_instructions 43 | self.max_nodes = max_nodes 44 | self.rnn_depth = rnn_depth 45 | self.rnn_kind=rnn_kind 46 | self.embedding_matrix = embedding_matrix 47 | self.trainable_embeddings = trainable_embeddings 48 | self.dense_layer_size = dense_layer_size 49 | self.number_of_classes = num_classes 50 | self.generateGraphClassificationNetwork() 51 | 52 | def extract_axis_1(self, data, ind): 53 | """ 54 | Get specified elements along the first axis of tensor. 55 | :param data: Tensorflow tensor that will be subsetted. 56 | :param ind: Indices to take (one for each element along axis 0 of data). 57 | :return: Subsetted tensor. 58 | """ 59 | ind=tf.nn.relu(ind-1) 60 | batch_range = tf.range(tf.shape(data)[0]) 61 | indices = tf.stack([batch_range, ind], axis=1) 62 | res = tf.gather_nd(data, indices) 63 | 64 | return res 65 | 66 | def create_flattening_array(self, max_nodes, batch_size): 67 | shape_array = [] 68 | for p in range(0, batch_size): 69 | for i in range(0, max_nodes): 70 | shape_array.append([p, i]) 71 | return shape_array 72 | 73 | def create_gather_array(self, max_nodes, batch_size): 74 | shape_array = [] 75 | for p in range(0, batch_size): 76 | x = [] 77 | for i in range(0, max_nodes): 78 | x.append([0, i + p * max_nodes]) 79 | shape_array.append(x) 80 | return shape_array 81 | 82 | def lstmFeatures(self, input_x, lengths): 83 | 84 | flattened_inputs = tf.reshape(input_x, [-1, tf.shape(input_x)[2]], name="Flattening") 85 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 86 | self.W0 = tf.Variable(tf.constant(1.0 / self.max_instructions, shape=[1, self.max_instructions]), name="W0") 87 | w0_tiled = tf.tile(tf.expand_dims(self.W0, 0), [tf.shape(flattened_inputs)[0], 1, 1], name="W0_tiled") 88 | last_outputs = tf.squeeze(tf.nn.l2_normalize(tf.matmul(w0_tiled, flattened_embedded, name='features_weighted_mean')), axis=1) 89 | gather_output2 = tf.reshape(last_outputs, 90 | [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 91 | output = tf.identity(gather_output2, name="LSTMOutput") 92 | output=tf.nn.l2_normalize(output) 93 | return output 94 | 95 | def meanField(self, input_x, input_adj, name): 96 | 97 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 98 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 99 | 100 | CONV_PARAMS_tiled = [] 101 | for lv in range(self.max_lv): 102 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 103 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 104 | 105 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 106 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 107 | out = w1xv 108 | for i in range(self.T_iterations - 1): 109 | ol = l 110 | lv = self.max_lv - 1 111 | while lv >= 0: 112 | with tf.name_scope('cell_' + str(lv)) as scope: 113 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 114 | if lv > 0: 115 | ol = 
tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 116 | else: 117 | ol = node_linear 118 | lv -= 1 119 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 120 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 121 | 122 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 123 | name=name + "_y_potential_expand_dims") 124 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 125 | return graph_embedding 126 | 127 | def generateGraphClassificationNetwork(self): 128 | print("Features size:"+str(self.features_size)) 129 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 130 | trainable=self.trainable_embeddings, 131 | name="instruction_embedding", dtype=tf.float32) 132 | self.x = tf.placeholder(tf.int32, [None, None, self.max_instructions],name="x_1") 133 | self.adj = tf.placeholder(tf.float32, [None, None, None], name="adj_1") 134 | self.lenghts = tf.placeholder(tf.int32, [None,None], name='lenghts_1') 135 | self.y = tf.placeholder(tf.int32, [None], name='y_') 136 | self.norms = [] 137 | 138 | l2_loss = tf.constant(0.0) 139 | 140 | # ------------------------------- 141 | # 1. MEAN FIELD COMPONENT 142 | # ------------------------------- 143 | 144 | # 1. parameters for MeanField 145 | with tf.name_scope('parameters_MeanField'): 146 | 147 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 148 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 149 | self.norms.append(tf.norm(self.W1)) 150 | 151 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 152 | self.CONV_PARAMS = [] 153 | for lv in range(self.max_lv): 154 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 155 | name="CONV_PARAMS_" + str(lv)) 156 | self.CONV_PARAMS.append(v) 157 | self.norms.append(tf.norm(v)) 158 | 159 | # W2 is another [p,p] matrix to transform the embedding vector 160 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1),name="W2") 161 | self.norms.append(tf.norm(self.W2)) 162 | 163 | # 164 | # LSTMExtraction 165 | with tf.name_scope('BlockFeatureExtraction'): 166 | self.block_features = self.lstmFeatures(self.x, self.lenghts) 167 | 168 | # Mean Field 169 | with tf.name_scope('MeanField'): 170 | self.graph_embedding = tf.nn.l2_normalize( 171 | tf.squeeze(self.meanField(self.block_features, self.adj, "MeanField1"), axis=1), axis=1, name="embedding") 172 | 173 | with tf.name_scope('Hidden_Layer'): 174 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 175 | 176 | with tf.name_scope('Output_Layer'): 177 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 178 | 179 | with tf.name_scope('Prediction'): 180 | self.pred_classes = tf.argmax(self.logits, axis=1) 181 | self.pred_probab = tf.nn.softmax(self.logits) 182 | 183 | # Regularization 184 | with tf.name_scope("Regularization"): 185 | l2_loss += tf.nn.l2_loss(self.W1) 186 | for lv in range(self.max_lv): 187 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 188 | l2_loss += tf.nn.l2_loss(self.W2) 189 | 190 | # CalculateMean cross-entropy loss 191 | with tf.name_scope("Loss"): 192 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 193 | self.regularized_loss = self.loss + 
self.l2_reg_lambda * l2_loss # regularization 194 | 195 | # Train step 196 | with tf.name_scope("Train_Step"): 197 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 198 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_classification_network_rnn.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import tensorflow as tf 8 | 9 | 10 | # structure2vec 11 | # DE-MF : discriminative embedding using Mean Field 12 | 13 | 14 | class NetworkLSTM: 15 | 16 | def __init__(self, 17 | features_size, # Dimensione delle features del nodo 18 | embedding_size, # Dimensione dell'embedding del vettore 19 | max_lv, 20 | T_iterations, 21 | learning_rate, # Learning rate 22 | l2_reg_lambda, 23 | batch_size, 24 | max_instructions, 25 | max_nodes, 26 | rnn_depth, 27 | rnn_kind, 28 | embedding_matrix, 29 | trainable_embeddings, 30 | dense_layer_size, 31 | num_classes 32 | ): 33 | print("Features size"+str(features_size)) 34 | self.features_size = features_size 35 | self.embedding_size = embedding_size 36 | self.max_lv = max_lv 37 | self.T_iterations = T_iterations 38 | self.learning_rate = learning_rate 39 | self.l2_reg_lambda = l2_reg_lambda 40 | self.RRN_HIDDEN = features_size 41 | self.batch_size = batch_size 42 | self.max_instructions = max_instructions 43 | self.max_nodes = max_nodes 44 | self.rnn_depth = rnn_depth 45 | self.rnn_kind=rnn_kind 46 | self.embedding_matrix = embedding_matrix 47 | self.trainable_embeddings = trainable_embeddings 48 | self.dense_layer_size = dense_layer_size 49 | self.number_of_classes = num_classes 50 | self.generateGraphClassificationNetwork() 51 | 52 | def extract_axis_1(self, data, ind): 53 | """ 54 | Get specified elements along the first axis of tensor. 55 | :param data: Tensorflow tensor that will be subsetted. 56 | :param ind: Indices to take (one for each element along axis 0 of data). 57 | :return: Subsetted tensor. 
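        Note: callers pass per-block sequence lengths as `ind`; the body applies
        tf.nn.relu(ind - 1) so that a length of n selects the RNN output at index n-1,
        and lengths of zero (if any) are clamped to index 0 rather than -1.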
58 | """ 59 | ind=tf.nn.relu(ind-1) 60 | batch_range = tf.range(tf.shape(data)[0]) 61 | indices = tf.stack([batch_range, ind], axis=1) 62 | res = tf.gather_nd(data, indices) 63 | 64 | return res 65 | 66 | def lstmFeatures(self, input_x, lengths): 67 | flattened_inputs=tf.reshape(input_x,[-1,tf.shape(input_x)[2]],name="Flattening") 68 | flattened_lenghts = tf.reshape(lengths, [-1]) 69 | max = tf.reduce_max(flattened_lenghts) 70 | flattened_inputs=flattened_inputs[:,:max] 71 | flattened_embedded = tf.nn.embedding_lookup(self.instruction_embeddings_t, flattened_inputs) 72 | zeros = tf.zeros(tf.shape(flattened_lenghts)[0], dtype=tf.int32) 73 | mask = tf.not_equal(flattened_lenghts, zeros) 74 | int_mask = tf.cast(mask, tf.int32) 75 | fake_output = tf.zeros([self.features_size], dtype=tf.float32) 76 | partitions = tf.dynamic_partition(flattened_embedded, int_mask, 2) 77 | real_nodes=partitions[1] 78 | real_lenghts=tf.boolean_mask(flattened_lenghts,mask) 79 | fake_zero = tf.tile([fake_output], [tf.shape(flattened_embedded)[0] - tf.shape(partitions[1])[0], 1]) 80 | 81 | if self.rnn_kind==0: 82 | rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in ([self.features_size] * self.rnn_depth)] 83 | else: 84 | rnn_layers = [tf.nn.rnn_cell.GRUCell(size) for size in ([self.features_size] * self.rnn_depth)] 85 | cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers) 86 | rnn_outputs, _ = tf.nn.dynamic_rnn(cell, real_nodes, sequence_length=real_lenghts, dtype=tf.float32, 87 | time_major=False, parallel_iterations=88) 88 | last_outputs = self.extract_axis_1(rnn_outputs, real_lenghts) 89 | condition_indices = tf.dynamic_partition( 90 | tf.range(tf.shape(flattened_embedded)[0]), int_mask, 2) 91 | last_outputs = tf.dynamic_stitch(condition_indices, [fake_zero, last_outputs]) 92 | gather_output2 = tf.reshape(last_outputs, [-1, tf.shape(input_x)[1], self.features_size], name="Deflattening") 93 | 94 | output = tf.identity(gather_output2, name="LSTMOutput") 95 | output=tf.nn.l2_normalize(output) 96 | return output 97 | 98 | def meanField(self, input_x, input_adj, name): 99 | 100 | W1_tiled = tf.tile(tf.expand_dims(self.W1, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W1_tiled") 101 | W2_tiled = tf.tile(tf.expand_dims(self.W2, 0), [tf.shape(input_x)[0], 1, 1], name=name + "_W2_tiled") 102 | 103 | CONV_PARAMS_tiled = [] 104 | for lv in range(self.max_lv): 105 | CONV_PARAMS_tiled.append(tf.tile(tf.expand_dims(self.CONV_PARAMS[lv], 0), [tf.shape(input_x)[0], 1, 1], 106 | name=name + "_CONV_PARAMS_tiled_" + str(lv))) 107 | 108 | w1xv = tf.matmul(input_x, W1_tiled, name=name + "_w1xv") 109 | l = tf.matmul(input_adj, w1xv, name=name + '_l_iteration' + str(1)) 110 | out = w1xv 111 | for i in range(self.T_iterations - 1): 112 | ol = l 113 | lv = self.max_lv - 1 114 | while lv >= 0: 115 | with tf.name_scope('cell_' + str(lv)) as scope: 116 | node_linear = tf.matmul(ol, CONV_PARAMS_tiled[lv], name=name + '_conv_params_' + str(lv)) 117 | if lv > 0: 118 | ol = tf.nn.relu(node_linear, name=name + '_relu_' + str(lv)) 119 | else: 120 | ol = node_linear 121 | lv -= 1 122 | out = tf.nn.tanh(w1xv + ol, name=name + "_mu_iteration" + str(i + 2)) 123 | l = tf.matmul(input_adj, out, name=name + '_l_iteration' + str(i + 2)) 124 | fi = tf.expand_dims(tf.reduce_sum(out, axis=1, name=name + "_y_potential_reduce_sum"), axis=1, 125 | name=name + "_y_potential_expand_dims") 126 | graph_embedding = tf.matmul(fi, W2_tiled, name=name + '_graph_embedding') 127 | return graph_embedding 128 | 129 | def generateGraphClassificationNetwork(self): 130 | 131 
| print("Features size:" + str(self.features_size)) 132 | self.instruction_embeddings_t = tf.Variable(initial_value=tf.constant(self.embedding_matrix), 133 | trainable=self.trainable_embeddings, 134 | name="instruction_embedding", dtype=tf.float32) 135 | 136 | self.x = tf.placeholder(tf.int32, [None, None, self.max_instructions], name="x_1") 137 | self.adj = tf.placeholder(tf.float32, [None, None, None], name="adj_1") 138 | self.lenghts = tf.placeholder(tf.int32, [None, None], name='lenghts_1') 139 | self.y = tf.placeholder(tf.int32, [None], name='y_') 140 | 141 | self.norms = [] 142 | 143 | l2_loss = tf.constant(0.0) 144 | 145 | # ------------------------------- 146 | # 1. MEAN FIELD COMPONENT 147 | # ------------------------------- 148 | 149 | # 1. parameters for MeanField 150 | with tf.name_scope('parameters_MeanField'): 151 | 152 | # W1 is a [d,p] matrix, and p is the embedding size as explained above 153 | self.W1 = tf.Variable(tf.truncated_normal([self.features_size, self.embedding_size], stddev=0.1), name="W1") 154 | self.norms.append(tf.norm(self.W1)) 155 | 156 | # CONV_PARAMSi (i=1,...,n) is a [p,p] matrix. We refer to n as the embedding depth (self.max_lv) 157 | self.CONV_PARAMS = [] 158 | for lv in range(self.max_lv): 159 | v = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), 160 | name="CONV_PARAMS_" + str(lv)) 161 | self.CONV_PARAMS.append(v) 162 | self.norms.append(tf.norm(v)) 163 | 164 | # W2 is another [p,p] matrix to transform the embedding vector 165 | self.W2 = tf.Variable(tf.truncated_normal([self.embedding_size, self.embedding_size], stddev=0.1), name="W2") 166 | self.norms.append(tf.norm(self.W2)) 167 | 168 | # 169 | # LSTMExtraction 170 | with tf.name_scope('BlockFeatureExtraction'): 171 | self.block_features = self.lstmFeatures(self.x, self.lenghts) 172 | 173 | # Mean Field 174 | with tf.name_scope('MeanField'): 175 | self.graph_embedding = tf.nn.l2_normalize( 176 | tf.squeeze(self.meanField(self.block_features, self.adj, "MeanField1"), axis=1), axis=1, name="embedding") 177 | 178 | with tf.name_scope('Hidden_Layer'): 179 | self.dense_ouput = tf.nn.relu(tf.layers.dense(self.graph_embedding, self.dense_layer_size)) 180 | 181 | with tf.name_scope('Output_Layer'): 182 | self.logits = tf.layers.dense(self.dense_ouput, self.number_of_classes) 183 | 184 | with tf.name_scope('Prediction'): 185 | self.pred_classes = tf.argmax(self.logits, axis=1) 186 | self.pred_probab = tf.nn.softmax(self.logits) 187 | 188 | # Regularization 189 | with tf.name_scope("Regularization"): 190 | l2_loss += tf.nn.l2_loss(self.W1) 191 | for lv in range(self.max_lv): 192 | l2_loss += tf.nn.l2_loss(self.CONV_PARAMS[lv]) 193 | l2_loss += tf.nn.l2_loss(self.W2) 194 | 195 | # CalculateMean cross-entropy loss 196 | with tf.name_scope("Loss"): 197 | self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)) 198 | self.regularized_loss = self.loss + self.l2_reg_lambda * l2_loss # regularization 199 | 200 | # Train step 201 | with tf.name_scope("Train_Step"): 202 | self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.regularized_loss) 203 | -------------------------------------------------------------------------------- /compiler_provenance/s2v_trainer.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | 8 
| from compiler_provenance.s2v_classification_network_arith_mean import NetworkLSTM as arithMeanNetwork 9 | from compiler_provenance.s2v_classification_network_rnn import NetworkLSTM as rrnFastMeanNetwork 10 | from compiler_provenance.s2v_classification_network_annotations import Network as annotationNetwork 11 | from compiler_provenance.s2v_classification_network_attention_mean import Network as weightedMeanNetwork 12 | 13 | from compiler_provenance.FunctionFactory import PairFactory as FunctionFactory 14 | 15 | import tensorflow as tf 16 | import random 17 | import sys, os 18 | import numpy as np 19 | from sklearn import metrics 20 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 21 | import matplotlib 22 | import sqlite3 23 | import pickle 24 | matplotlib.use('Agg') 25 | import matplotlib.pyplot as plt 26 | import itertools 27 | import tqdm 28 | 29 | class S2VTrainerLSTM: 30 | 31 | def __init__(self, flags, embedding_matrix): 32 | self.embedding_size = flags.embedding_size 33 | self.max_lv = flags.max_lv 34 | self.num_epochs = flags.num_epochs 35 | self.learning_rate = flags.learning_rate 36 | self.l2_reg_lambda = flags.l2_reg_lambda 37 | self.num_checkpoints = flags.num_checkpoints 38 | self.logdir = flags.logdir 39 | self.logger = flags.logger 40 | self.T_iterations = flags.T_iterations 41 | self.seed = flags.seed 42 | self.batch_size = flags.batch_size 43 | self.max_instructions = flags.max_instructions 44 | self.rnn_depth = flags.rnn_depth 45 | self.rnn_kind = flags.rnn_kind 46 | self.max_nodes = flags.MAX_NUM_VERTICES 47 | self.embeddings_matrix = embedding_matrix 48 | self.session = None 49 | self.db_name = flags.db_name 50 | self.feature_type = flags.feature_type 51 | self.json_asm2id = flags.json_asm2id 52 | self.trainable_embeddings = flags.trainable_embeddings 53 | self.network_type = flags.network_type 54 | self.cross_val = flags.cross_val 55 | self.dense_layer_size = flags.dense_layer_size 56 | self.flags = flags 57 | self.functions = False 58 | 59 | if flags.class_kind == "CMP" or flags.class_kind=="FML": 60 | query_str="SELECT DISTINCT compiler FROM functions" 61 | elif flags.class_kind == "CMPOPT": 62 | query_str = "SELECT DISTINCT compiler,optimization FROM functions" 63 | elif flags.class_kind == "OPT": 64 | query_str = "SELECT DISTINCT optimization FROM functions" 65 | 66 | conn = sqlite3.connect(self.db_name) 67 | cur = conn.cursor() 68 | print("Looking in db for classes") 69 | q = cur.execute(query_str) 70 | q_compilers = q.fetchall() 71 | #q_compilers = [c[0] for c in compilers] 72 | compilers = [] 73 | 74 | for c in q_compilers: 75 | if flags.class_kind == "CMPOPT": 76 | compiler = c[0] + '-' + c[1] 77 | elif flags.class_kind == "FML": 78 | compiler = str(c[0]).split('-')[0] 79 | 80 | else: 81 | compiler = c[0] 82 | compilers.append(compiler) 83 | 84 | print(compilers) 85 | 86 | 87 | compilers = list(set(compilers)) 88 | conn.close() 89 | 90 | self.encoder = LabelEncoder() 91 | self.encoder.fit(compilers) 92 | self.num_classes = len(self.encoder.classes_) 93 | 94 | print("Num classes = " + str(self.num_classes)) 95 | 96 | random.seed(self.seed) 97 | np.random.seed(self.seed) 98 | 99 | print(self.db_name) 100 | 101 | def plot_confusion_matrix(self, cm, classes, normalize=False, 102 | title='Confusion matrix', 103 | cmap=plt.cm.Blues): 104 | """ 105 | This function prints and plots the confusion matrix. 106 | Normalization can be applied by setting `normalize=True`. 
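        The matrix is drawn onto the current matplotlib figure; the caller (train)
        wraps this call in plt.figure()/plt.savefig() to produce
        best_test_confusion_matrix.png.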
107 | """ 108 | if normalize: 109 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 110 | 111 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 112 | plt.title(title) 113 | plt.colorbar() 114 | tick_marks = np.arange(len(classes)) 115 | plt.xticks(tick_marks, classes, rotation=45) 116 | plt.yticks(tick_marks, classes) 117 | 118 | fmt = '.2f' if normalize else 'd' 119 | thresh = cm.max() / 2. 120 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 121 | plt.text(j, i, format(cm[i, j], fmt), 122 | horizontalalignment="center", 123 | color="white" if cm[i, j] > thresh else "black") 124 | 125 | plt.tight_layout() 126 | plt.ylabel('True label') 127 | plt.xlabel('Predicted label') 128 | 129 | def loadmodel(self): 130 | tf.reset_default_graph() 131 | with tf.Graph().as_default() as g: 132 | session_conf = tf.ConfigProto( 133 | allow_soft_placement=True, 134 | log_device_placement=False 135 | ) 136 | sess = tf.Session(config=session_conf) 137 | 138 | # Sets the graph-level random seed. 139 | tf.set_random_seed(self.seed) 140 | 141 | self.createNetwork() 142 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints) 143 | checkpoint_dir = os.path.abspath(os.path.join(self.logdir, "checkpoints")) 144 | saver.restore(sess, os.path.join(checkpoint_dir, "model")) 145 | self.session = sess 146 | return 147 | 148 | 149 | def createNetwork(self): 150 | self.features_size = np.shape(self.embeddings_matrix)[1] 151 | if self.network_type == "Arith_Mean": 152 | 153 | self.network = arithMeanNetwork( 154 | features_size=self.features_size, 155 | embedding_size=self.embedding_size, 156 | max_lv=self.max_lv, 157 | T_iterations=self.T_iterations, 158 | learning_rate=self.learning_rate, 159 | l2_reg_lambda=self.l2_reg_lambda, 160 | batch_size=self.batch_size, 161 | max_instructions=self.max_instructions, 162 | max_nodes=self.max_nodes, 163 | rnn_depth=self.rnn_depth, 164 | rnn_kind=self.rnn_kind, 165 | embedding_matrix=self.embeddings_matrix, 166 | trainable_embeddings=self.trainable_embeddings, 167 | num_classes=self.num_classes, 168 | dense_layer_size=self.dense_layer_size 169 | ) 170 | 171 | if self.network_type == "RNN": 172 | 173 | self.network = rrnFastMeanNetwork( 174 | features_size=self.features_size, 175 | embedding_size=self.embedding_size, 176 | max_lv=self.max_lv, 177 | T_iterations=self.T_iterations, 178 | learning_rate=self.learning_rate, 179 | l2_reg_lambda=self.l2_reg_lambda, 180 | batch_size=self.batch_size, 181 | max_instructions = self.max_instructions, 182 | max_nodes = self.max_nodes, 183 | rnn_depth = self.rnn_depth, 184 | rnn_kind=self.rnn_kind, 185 | embedding_matrix=self.embeddings_matrix, 186 | trainable_embeddings=self.trainable_embeddings, 187 | dense_layer_size=self.dense_layer_size, 188 | num_classes=self.num_classes 189 | ) 190 | 191 | if self.network_type == "Attention_Mean": 192 | 193 | self.network = weightedMeanNetwork( 194 | features_size=self.features_size, 195 | embedding_size=self.embedding_size, 196 | max_lv=self.max_lv, 197 | T_iterations=self.T_iterations, 198 | learning_rate=self.learning_rate, 199 | l2_reg_lambda=self.l2_reg_lambda, 200 | batch_size=self.batch_size, 201 | max_instructions = self.max_instructions, 202 | max_nodes = self.max_nodes, 203 | rnn_depth = self.rnn_depth, 204 | rnn_kind=self.rnn_kind, 205 | embedding_matrix=self.embeddings_matrix, 206 | trainable_embeddings=self.trainable_embeddings, 207 | dense_layer_size=self.dense_layer_size, 208 | num_classes = self.num_classes 209 | ) 210 | 211 | if 
self.network_type == "Annotations": 212 | self.features_size = 8 213 | self.network = annotationNetwork( 214 | features_size=self.features_size, 215 | embedding_size=self.embedding_size, 216 | max_lv=self.max_lv, 217 | T_iterations=self.T_iterations, 218 | learning_rate=self.learning_rate, 219 | l2_reg_lambda=self.l2_reg_lambda, 220 | dense_layer_size=self.dense_layer_size, 221 | num_classes=self.num_classes 222 | ) 223 | 224 | def read_weight(self): 225 | a = self.session.run(self.session.graph.get_tensor_by_name('LSTMExtraction1/lstm1/W0:0')) 226 | plt.bar(range(0, 150), a[0]) 227 | plt.show() 228 | plt.savefig('/home/massarelli/weight.pdf') 229 | 230 | 231 | def train(self): 232 | tf.reset_default_graph() 233 | with tf.Graph().as_default() as g: 234 | session_conf = tf.ConfigProto( 235 | allow_soft_placement=True, 236 | log_device_placement=False 237 | ) 238 | sess = tf.Session(config=session_conf) 239 | 240 | # Sets the graph-level random seed. 241 | tf.set_random_seed(self.seed) 242 | 243 | self.createNetwork() 244 | 245 | print("Network created") 246 | 247 | # Initialize all variables 248 | sess.run(tf.global_variables_initializer()) 249 | 250 | # TensorBoard 251 | # Summaries for loss and accuracy 252 | loss_summary = tf.summary.scalar("loss", self.network.loss) 253 | 254 | # Train Summaries 255 | train_summary_op = tf.summary.merge([loss_summary]) 256 | train_summary_dir = os.path.join(self.logdir, "summaries", "train") 257 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 258 | 259 | # Validation summaries 260 | val_summary_op = tf.summary.merge([loss_summary]) 261 | val_summary_dir = os.path.join(self.logdir, "summaries", "validation") 262 | val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph) 263 | 264 | # Test summaries 265 | test_summary_op = tf.summary.merge([loss_summary]) 266 | test_summary_dir = os.path.join(self.logdir, "summaries", "test") 267 | test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph) 268 | 269 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 270 | checkpoint_dir = os.path.abspath(os.path.join(self.logdir, "checkpoints")) 271 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 272 | if not os.path.exists(checkpoint_dir): 273 | os.makedirs(checkpoint_dir) 274 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints) 275 | 276 | BEST_ACCURACY = 0 277 | stat_file = open(str(self.logdir) + "/epoch_stats.tsv", "w") 278 | stat_file.write("#epoch\ttrain_loss\tval_loss\tval_auc\ttest_loss\ttest_auc\n") 279 | 280 | print("Creating functions factories...") 281 | sys.stdout.flush() 282 | 283 | p_train = FunctionFactory(self.db_name, self.feature_type, 'train', self.json_asm2id, self.max_instructions, self.max_nodes, self.encoder,self.batch_size,self.flags) 284 | p_validation = FunctionFactory(self.db_name, self.feature_type, 'validation', self.json_asm2id, self.max_instructions, self.max_nodes, self.encoder,self.batch_size,self.flags) 285 | p_test = FunctionFactory(self.db_name, self.feature_type, 'test', self.json_asm2id,self.max_instructions, self.max_nodes, self.encoder,self.batch_size,self.flags) 286 | 287 | print("Starting train!") 288 | sys.stdout.flush() 289 | 290 | step = 0 291 | for epoch in range(0, self.num_epochs): 292 | epoch_msg = "" 293 | epoch_msg += " epoch: {}\n".format(epoch) 294 | 295 | epoch_loss = 0 296 | 297 | # ----------------------# 298 | # TRAIN # 299 | # ----------------------# 300 | n_batch=0 301 | for adj_batch, nodes_batch, y_batch, len_batch in tqdm.tqdm(p_train.async_chunker(epoch%25, self.batch_size, shuffle=True), total=p_train.num_batches): 302 | 303 | assert len(adj_batch) 304 | 305 | feed_dict = { 306 | self.network.x: nodes_batch, 307 | self.network.adj: adj_batch, 308 | self.network.lenghts: len_batch, 309 | self.network.y: y_batch, 310 | } 311 | 312 | summaries, _, loss, norms = sess.run( 313 | [train_summary_op, self.network.train_step, self.network.loss, self.network.norms], 314 | feed_dict=feed_dict) 315 | 316 | n_batch=n_batch+1 317 | 318 | # tensorboard 319 | train_summary_writer.add_summary(summaries, step) 320 | epoch_loss += loss * p_train.batch_dim # ??? 
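                # self.network.loss is a mean over the batch, so scaling it by batch_dim
                # (presumably the number of samples the factory put in this batch) turns it
                # back into a sum; dividing by p_train.num_pairs after the loop
                # (epoch_loss /= p_train.num_pairs) then gives a per-sample mean for the
                # epoch, which stays correct even when the last batch is smaller.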
321 | step += 1 322 | 323 | # recap epoch 324 | epoch_loss /= p_train.num_pairs 325 | 326 | # ----------------------# 327 | # VALIDATION # 328 | # ----------------------# 329 | val_loss = 0 330 | epoch_msg += "\n" 331 | val_y = [] 332 | val_pred = [] 333 | print("Validating") 334 | for adj_batch, nodes_batch, y_batch, len_batch in tqdm.tqdm(p_validation.async_chunker(0, self.batch_size),total=p_validation.num_batches): 335 | feed_dict = { 336 | self.network.x: nodes_batch, 337 | self.network.adj: adj_batch, 338 | self.network.lenghts: len_batch, 339 | self.network.y: y_batch, 340 | } 341 | 342 | summaries, loss, pred_probab, pred_classes = sess.run( 343 | [val_summary_op, self.network.loss, self.network.pred_probab, self.network.pred_classes], feed_dict=feed_dict) 344 | val_loss += loss * p_validation.batch_dim 345 | val_summary_writer.add_summary(summaries, step) 346 | val_y.extend(y_batch) 347 | val_pred.extend(pred_classes) 348 | val_loss /= p_validation.num_pairs 349 | 350 | val_accuracy = metrics.accuracy_score(val_y, val_pred) 351 | 352 | val_report = metrics.classification_report(val_y, val_pred, target_names=self.encoder.classes_) 353 | 354 | tmp = val_report.split("\n") 355 | val_report = "" 356 | for l in tmp: 357 | val_report += "\t\t" + l + "\n" 358 | 359 | stri = "\tepoch {} / {}, train loss {:g}, val loss {:g}, val accuracy {:g}\n".format(epoch, self.num_epochs, epoch_loss, val_loss, val_accuracy) 360 | 361 | epoch_msg += stri 362 | 363 | sys.stdout.write(stri) 364 | 365 | sys.stdout.flush() 366 | 367 | # execute test only if validation auc increased 368 | test_loss = "-" 369 | test_auc = "-" 370 | 371 | if val_accuracy > BEST_ACCURACY and self.cross_val: 372 | BEST_ACCURACY = val_accuracy 373 | saver.save(sess, checkpoint_prefix) 374 | print("\nNEW BEST_VAL_ACCURACY: {} !\n".format(BEST_ACCURACY)) 375 | 376 | if val_accuracy > BEST_ACCURACY and not self.cross_val: 377 | BEST_ACCURACY = val_accuracy 378 | 379 | sys.stdout.write("\t" + "-*"*40 + "\n") 380 | 381 | stri = "\tNEW BEST_ACCURACY: {} !\n\tVal Classification Report: \n {} \n".format(BEST_ACCURACY, val_report) 382 | 383 | epoch_msg += stri 384 | sys.stdout.write(stri) 385 | 386 | # save best model 387 | saver.save(sess, checkpoint_prefix) 388 | 389 | # ----------------------# 390 | # TEST # 391 | # ----------------------# 392 | 393 | # TEST 394 | test_loss = 0 395 | epoch_msg += "\n" 396 | test_y = [] 397 | test_pred = [] 398 | print("Testing") 399 | for adj_batch, nodes_batch, y_batch, len_batch in tqdm.tqdm(p_test.async_chunker(0, self.batch_size),total=p_test.num_batches): 400 | 401 | feed_dict = { 402 | self.network.x: nodes_batch, 403 | self.network.adj: adj_batch, 404 | self.network.lenghts: len_batch, 405 | self.network.y: y_batch, 406 | } 407 | 408 | summaries, loss, pred_probab, pred_classes = sess.run( 409 | [val_summary_op, self.network.loss, self.network.pred_probab, self.network.pred_classes], 410 | feed_dict=feed_dict) 411 | test_loss += loss * p_test.batch_dim 412 | test_summary_writer.add_summary(summaries, step) 413 | test_y.extend(y_batch) 414 | test_pred.extend(pred_classes) 415 | test_loss /= p_test.num_pairs 416 | 417 | test_accuracy = metrics.accuracy_score(test_y, test_pred) 418 | 419 | test_report = metrics.classification_report(test_y, test_pred, target_names=self.encoder.classes_) 420 | 421 | tmp = test_report.split("\n") 422 | test_report = "" 423 | for l in tmp: 424 | test_report += "\t\t" + l + "\n" 425 | 426 | # Compute confusion matrix 427 | cnf_matrix = metrics.confusion_matrix(test_y, 
test_pred) 428 | np.set_printoptions(precision=2) 429 | np.savetxt(str(self.logdir) + "/best_test_confusion_matrix.csv", cnf_matrix, delimiter=',') 430 | 431 | fig=plt.figure() 432 | self.plot_confusion_matrix(cnf_matrix, self.encoder.classes_) 433 | plt.savefig(str(self.logdir) + "/best_test_confusion_matrix.png") 434 | plt.close(fig) 435 | 436 | tmp = str(cnf_matrix).split('\n') 437 | scnf = "" 438 | for l in tmp: 439 | scnf += "\t\t" + l + "\n" 440 | 441 | stri = "\tTest_loss : {}\n\tTest Accuracy: {}\n\tTest Classification Report:\n {} \tTest Confusion Matrix : \n {} \n".format(test_loss, test_accuracy, test_report, scnf) 442 | epoch_msg += stri 443 | 444 | sys.stdout.write(stri) 445 | 446 | sys.stdout.write("\t" + "-*"*40 + "\n") 447 | 448 | stat_file.write( 449 | "{}\t{}\t{}\t{}\t{}\t{}\n".format(epoch, epoch_loss, val_loss, val_accuracy, test_loss, test_accuracy)) 450 | self.logger.info("\n *-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-\n {} \n".format(epoch_msg)) 451 | stat_file.close() 452 | sess.close() 453 | return BEST_ACCURACY 454 | -------------------------------------------------------------------------------- /compiler_provenance/train.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | from compiler_provenance.s2v_trainer import S2VTrainerLSTM 8 | from compiler_provenance.parameters import Flags 9 | import numpy as np 10 | 11 | 12 | def run_test(): 13 | flags = Flags() 14 | flags.logger.info("\n{}\n".format(flags)) 15 | 16 | print(str(flags)) 17 | 18 | file_embedding_matrix = flags.file_embedding_matrix 19 | 20 | embedding_matrix = np.float32(np.load(file_embedding_matrix)) 21 | if flags.random_embedding: 22 | embedding_matrix = np.random.rand(*np.shape(embedding_matrix)).astype(np.float32) 23 | embedding_matrix[0, :] = np.zeros(np.shape(embedding_matrix)[1]).astype(np.float32) 24 | 25 | if flags.cross_val: 26 | print("STARTING CROSS VALIDATION") 27 | res = [] 28 | mean = 0 29 | for i in range(0, flags.cross_val_fold): 30 | print("CROSS VALIDATION STARTING FOLD: " + str(i)) 31 | if i > 0: 32 | flags.close_log() 33 | flags.reset_logdir() 34 | del flags 35 | flags = Flags() 36 | flags.logger.info("\n{}\n".format(flags)) 37 | 38 | flags.logger.info("Starting cross validation fold: {}".format(i)) 39 | 40 | flags.db_name = flags.db_name + "_val_" + str(i+1) + ".db" 41 | flags.logger.info("Cross validation db name: {}".format(flags.db_name)) 42 | 43 | trainer = S2VTrainerLSTM(flags, embedding_matrix) 44 | best_val_auc = trainer.train() 45 | 46 | mean += best_val_auc 47 | res.append(best_val_auc) 48 | 49 | flags.logger.info("Cross validation fold {} finished best auc: {}".format(i, best_val_auc)) 50 | print("FINISH FOLD: " + str(i) + " BEST VAL AUC: " + str(best_val_auc)) 51 | 52 | print("CROSS VALIDATION ENDED") 53 | print("Result: " + str(res)) 54 | print("") 55 | 56 | flags.logger.info("Cross validation finished results: {}".format(res)) 57 | flags.logger.info(" mean: {}".format(mean / flags.cross_val_fold)) 58 | flags.close_log() 59 | 60 | flags.close_log() 61 | 62 | else: 63 | trainer = S2VTrainerLSTM(flags, embedding_matrix) 64 | trainer.train() 65 | flags.close_log() 66 | 67 | 68 | if __name__ == '__main__': 69 | run_test() 70 | -------------------------------------------------------------------------------- /compiler_provenance/train.sh: 
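Note on the script below: it selects one network type and one classification target, points train.py at the dataset, the instruction-embedding matrix and the instruction-to-id dictionary, then launches training. The -r (random embeddings) and -te (trainable embeddings) switches are declared but not passed in the default command; as a sketch, assuming the same paths, trainable embeddings would be enabled by appending the corresponding variable:

python3 train.py --o $MODEL_PATH -n $DB_PATH -nn $NETWORK_TYPE -e $EMBEDDING_MATRIX -j $INS2ID -cl $CLASSIFICATION_KIND $TRAINABLE_EMBEDDINGS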
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Type of the network to use 4 | 5 | NETWORK_TYPE="Attention_Mean" 6 | # NETWORK_TYPE="Arith_Mean" 7 | # NETWORK_TYPE="RNN" 8 | # NETWORK_TYPE="Annotations" 9 | 10 | # What to classify: 11 | CLASSIFICATION_KIND="Family" # Compiler Family 12 | # CLASSIFICATION_KIND="Compiler" # Compiler Family + Version 13 | # CLASSIFICATION_KIND="Compiler+Opt" # Compiler Familt + Version + Optimization 14 | # CLASSIFICATION_KIND="Opt" # Optimization 15 | 16 | 17 | # Root path for the experiment 18 | MODEL_PATH=experiments/ 19 | 20 | # Path to the sqlite db with diassembled functions 21 | DB_PATH=../data/restricted_compilers_dataset.db 22 | 23 | # Path to embedding matrix 24 | EMBEDDING_MATRIX=../data/i2v/embedding_matrix.npy 25 | 26 | # Path to instruction2id dictionary 27 | INS2ID=../data/i2v/word2id.json 28 | 29 | # Add this argument to train.py to use random instructions embeddings 30 | RANDOM_EMBEDDINGS="-r" 31 | 32 | # Add this argument to train.py to use trainable instructions embeddings 33 | TRAINABLE_EMBEDDINGS="-te" 34 | 35 | python3 train.py --o $MODEL_PATH -n $DB_PATH -nn $NETWORK_TYPE -e $EMBEDDING_MATRIX -j $INS2ID -cl $CLASSIFICATION_KIND 36 | 37 | -------------------------------------------------------------------------------- /compiler_provenance/utils.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | import numpy as np 8 | 9 | 10 | def __padAndFilter(input_pairs, input_labels, input_len, max_num_vertices): 11 | 12 | output_pairs = [] 13 | output_labels = [] 14 | output_len = [] 15 | 16 | for pair, label, lens in zip(input_pairs, input_labels, input_len): 17 | try: 18 | g1 = pair[0] 19 | 20 | # graph 1 21 | adj1 = g1[0] 22 | nodes1 = g1[1] 23 | 24 | if len(nodes1) <= max_num_vertices: 25 | # graph 1 26 | pad_lenght1 = max_num_vertices - len(nodes1) 27 | new_node1 = np.pad(nodes1, [(0, pad_lenght1), (0, 0)], mode='constant') 28 | pad_lenght1 = max_num_vertices - adj1.shape[0] 29 | 30 | # pass to dense for padding 31 | adj1_dense = np.pad(adj1.todense(), [(0, pad_lenght1), (0, pad_lenght1)], mode='constant') 32 | 33 | g1 = (adj1_dense, new_node1) 34 | output_pairs.append([g1]) 35 | output_labels.append(label) 36 | 37 | new_lens_0 = lens + [0]*(max_num_vertices-len(lens)) 38 | output_len.append([new_lens_0]) 39 | except: 40 | pass 41 | 42 | return output_pairs, output_labels, output_len 43 | 44 | -------------------------------------------------------------------------------- /dataset_creation/BlockFeaturesExtractor.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | 7 | 8 | # Questa classe estra da un blocco di codice assembler le features utilizzate nell'articolo ccs17 9 | 10 | class BlockFeaturesExtractor: 11 | x86_ARIT = 0 12 | x86_MOV = 0 13 | string = [] 14 | dyn_string = [] 15 | constants = [] 16 | num_transfer = 0 17 | num_instructions = 0 18 | num_calls = 0 19 | num_arith = 0 20 | 21 | def __init__(self, architecture, instructions, r2_disasm, string_addr): 22 | self.architecture = architecture 23 | self.instructions = instructions 24 | self.r2_disasm = r2_disasm 25 | self.string_addr = string_addr 26 | 27 | 
self.string = [] 28 | self.constant = [] 29 | self.num_transfer = 0 30 | self.num_instructions = 0 31 | self.num_calls = 0 32 | self.num_arith = 0 33 | 34 | def getFeatures(self): 35 | if len(self.instructions) != 0: 36 | self.num_instructions = len(self.instructions) 37 | self.constant, self.string = self.extractConstansStrings() 38 | self.num_transfer = self.countTransfer() 39 | self.num_calls = self.countCalls() 40 | self.num_arith = self.countArith() 41 | 42 | return ({'string': self.string, 'constant': self.constant, 43 | 'transfer': self.num_transfer, 'instruction': self.num_instructions, 44 | 'call': self.num_calls, 'arith': self.num_arith}) 45 | 46 | def countCalls(self): 47 | x86_mnemonics = ['call', 'int'] 48 | arm_mnemonics = ['bl', 'blx'] 49 | mips_mnemonics = ['jal', 'jalr', 'syscall'] 50 | 51 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 52 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 53 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 54 | 55 | count = 0 56 | for i in list(self.instructions): 57 | if self.architecture == 'x86': 58 | if str(i['mnemonic']) in x86_mnemonics: 59 | count = count + 1 60 | elif self.architecture == 'mips': 61 | if str(i['mnemonic']) in mips_mnemonics: 62 | count = count + 1 63 | elif self.architecture == 'arm': 64 | if str(i['mnemonic']) in arm_mnemonics: 65 | count = count + 1 66 | return count 67 | 68 | # Questa funzione conta le istruzione aritmetiche all'interno del blocco 69 | def countArith(self): 70 | x86_mnemonics = ['add', 'sub', 'div', 'imul', 'idiv', 'mul', 'shl', 'dec', 'adc', 'adcx', 'addpd', 'addps', 71 | 'addsd', 'addss', 'addsubpd', 'ADDSUBPS', 'adox', 'divpd', 'divps' 72 | , 'divsd', 'divss', 'dppd', 'dpps', 'f2xm1', 'fabs', 'fadd', 'faddp', 'fcos', 'fdiv', 'fdivp', 'fiadd', 73 | 'fidiv', 'fimul', 'fisub', 'fisubr', 'fmul', 'fmulp', 'FPATAN', 'FPREM', 'FPREM1', 'FPTAN', 74 | 'FRNDINT', 'FSCALE' 75 | , 'FSIN', 'FSINCOS', 'FSQRT', 'FSUB', 'FSUBP', 'FSUBR', 'FSUBRP', 'FYL2X', 'FYL2XP1', 'HADDPD', 'HADDPS', 76 | 'HSUBPD', 'HSUBPS', 'KADDB', 'KADDD', 'KADDD', 'KADDW', 'KSHIFTLB', 'KSHIFTLD', 'KSHIFTLQ', 77 | 'KSHIFTLW', 'KSHIFTRB', 'KSHIFTRD', 'KSHIFTRQ', 'KSHIFTRW' 78 | , 'MAXPD', 'MAXPS', 'MAXSD', 'MAXSS', 'MINPD', 'MINPS', 'MINSD', 'MINSS', 'MULPD' 79 | , 'MULPS', 'MULSS', 'MULSD', 'MULX', 'PADDB', 'PADDD', 'PADDQ', 'PADDSB', 'PADDSW', 'PADDUSB', 'PADDUSW' 80 | , 'PADDW', 'PAVGB', 'PAVGW', 'PHADDD', 'PHADDSW', 'PHADDW', 'PHMINPOSUW', 'PHSUBD', 'PHSUBSW', 'PHSUBW' 81 | , 'PMADDUBSW', 'PMADDWD', 'PMAXSB', 'PMAXSD', 'PMAXSQ', 'PMAXSW', 'PMAXUB', 'PMAXUD', 'PMAXUQ', 'PMAXUW', 82 | 'PMINSB' 83 | , 'PMINSD', 'PMINSQ', 'PMINSW', 'PMINUB', 'PMINUD', 'PMINUQ', 'PMINUW', 'PMULDQ', 'PMULHRSW', 'PMULHUW', 84 | 'PMULHW', 'PMULLD', 'PMULLQ' 85 | , 'PMULLW', 'PMULUDQ', 'PSADBW', 'PSLLD', 'PSLLW', 'PSRAD', 'PSLLQ', 'PSRAQ', 'PSRLQ', 'PSRLW' 86 | , 'PSUBB', 'PSUBD', 'PSUBQ', 'PSUBSB', 'PSUBSW', 'PSUBUSB', 'PSUBUSW', 'RCL', 'RCR' 87 | , 'ROL', 'ROR', 'ROUNDPD', 'ROUNDPS', 'ROUNDSD', 'ROUNDSS', 'RSQRTPS' 88 | , 'RSQRTSS', 'SAL', 'SAR', 'SARX', 'SBB', 'inc', 'SHLD', 'SHLX', 'SHR', 'SHRD', 'SHRX', 'SQRTPD', 'SQRTPS', 89 | 'SQRTSD', 'SQRTSS' 90 | , 'SUBPD', 'SUBPS', 'SUBSD', 'SUBSS', 'VFMADD132PD', 'VPSLLVD', 'VPSLLVQ', 'VPSLLVW', 'VPSRAVD', 'VPSRAVQ' 91 | , 'VPSRAVW', 'VPSRLVD', 'VPSRLVQ', 'VPSRLVW', 'VRNDSCALEPD', 'VRNDSCALEPS', 'XADD'] 92 | 93 | arm_mnemonics = ['add', 'adc', 'qadd', 'dadd', 'sub', 'SBC', 'RSB', 'RSC', 'subs', 'qsub', 94 | 'add16', 'SUB16', 'add8', 'sub8', 'ASX', 'sax', 'usad8', 'SSAT', 'MUL' 95 | , 'smul', 'MLA', 
'MLs', 'UMULL', 'UMLAL', 'UMaAL', 'SMULL', 'smlal' 96 | , 'SMULxy', 'SMULWy', 'SMLAxy', 'SMLAWy', 'SMLALxy', 'SMUAD' 97 | , 'SMLAD', 'SMLALD', 'SMUSD', 'SMLSD', 'SMLSLD', 'SMMUL' 98 | , 'SMMLA', 'MIA', 'MIAPH', 'MIAxy', 'SDIV', 'udiv' 99 | , 'ASR', 'LSL', 'LSR', 'ROR', 'RRX'] 100 | 101 | mips_mnemonics = ['add', 'addu', 'addi', 'addiu', 'mult', 'multu', 'div', 'divu' 102 | , 'AUI', 'DAUI', 'DAHI', 'DATI', 'CLO', 'CLZ', 'DADD', 'DADDI' 103 | , 'DADDIU', 'DADDU', 'DCLO', 'DCLZ', 'DDIV', 'DDIVU', 'MOD' 104 | , 'MODU', 'DMOD', 'DMODU', 'DMULTU', 'DROTR', 'DROTR32', 'DSLLV' 105 | , 'DSRA', 'DSRA32', 'DSRAV', 'DSRL', 'DSRL32' 106 | , 'DSRLV', 'DSUB', 'DSUBU', 'DSRL', 'FLOOR', 'MAX', 'MIN', 'MINA', 'MAXA' 107 | , 'MSUB', 'MSUBU', 'MUL', 'MUH', 'MULU', 'MUHU', 'DMUL', 'DMUH' 108 | , 'DMULU', 'DMUHU', 'DMUL', 'NEG' 109 | , 'NMADD', 'NMSUB', 'RECIP', 'RINT', 'ROTR', 'ROUND', 'RSQRT' 110 | , 'SLL', 'SLLV', 'SQRT', 'SRA', 'SRAV', 'SRL', 'SRLV' 111 | , 'SUB', 'SUBU', 'madd', 'maddu', 'msub', 'msubu', 'sll' 112 | , 'srl', 'sra', 'sllv', 'srla', 'srlv'] 113 | 114 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 115 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 116 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 117 | 118 | count = 0 119 | for i in list(self.instructions): 120 | if self.architecture == 'x86': 121 | if str(i['mnemonic']).lower() in x86_mnemonics: 122 | count = count + 1 123 | elif self.architecture == 'mips': 124 | if str(i['mnemonic']).lower() in mips_mnemonics: 125 | count = count + 1 126 | elif self.architecture == 'arm': 127 | if str(i['mnemonic']).lower() in arm_mnemonics: 128 | count = count + 1 129 | elif self.architecture == 'arm': 130 | if str(i['mnemonic']).lower() in arm_mnemonics: 131 | count = count + 1 132 | nop = 0 133 | return count 134 | 135 | # Questa funzione conta le istruzioni logiche all'interno del blocco 136 | def countLogic(self): 137 | x86_mnemonics = ['and', 'andn', 'andnpd', 'andpd', 'andps', 'andnps', 'test', 'xor', 'xorpd', 'pslld' 138 | , 'ANDNPD', 'ANDNPS', 'ANDPD', 'ANDPS', 'KANDB', 'KANDD', 'KANDNB', 'KANDND', 'KANDNQ', 'KANDNW', 'KANDQ', 139 | 'KANDW' 140 | , 'KNOTB', 'KNOTq', 'KNOTD', 'KNOTw', 'korq', 'korb', 'korw', 'kord', 'KTESTB', 'ktestd', 'ktestq', 'ktestw' 141 | , 'KXNORB', 'KXNORd', 'KXNORq', 'KXORB', 'KXORq', 'KXORd', 'KXORw', 'NOT', 'OR', 'ORPD', 'ORPS', 'PAND', 142 | 'PAND' 143 | , 'PCMPEQB', 'PCMPEQD', 'PCMPEQQ', 'PCMPGTB', 'PTEST', 'pxor', 'VPCMPB', 'VPCMPD', 'VPCMPQ', 144 | 'VPTESTMB', 'VPTESTMD', 'VPTESTMQ', 'VPTESTMW', 'VPTESTNMB', 'VPTESTNMD', 'VPTESTNMQ', 145 | 'VPTESTNMW' 146 | , 'XORPD', 'XORPS'] 147 | arm_mnemonics = ['AND', 'EOR', 'ORR', 'ORN', 'BIC'] 148 | mips_mnemonics = ['and', 'andi', 'or', 'ori', 'xor', 'nor', 'slt', 'slti', 'sltu'] 149 | 150 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 151 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 152 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 153 | 154 | count = 0 155 | for i in list(self.instructions): 156 | if self.architecture == 'x86': 157 | if str(i['mnemonic']).lower() in x86_mnemonics: 158 | count = count + 1 159 | elif self.architecture == 'mips': 160 | if str(i['mnemonic']).lower() in mips_mnemonics: 161 | count = count + 1 162 | elif self.architecture == 'arm': 163 | if str(i['mnemonic']).lower() in arm_mnemonics: 164 | count = count + 1 165 | return count 166 | 167 | def countTransfer(self): 168 | x86_mnemonics = ['BNDLDX', 'BNDMK', 'BNDMOV', 'BNDSTX' 169 | , 'CMOVA', 'CMOVZ', 'CMOVPO', 'CMOVPE', 'CMOVP', 'CMOVO', 'CMOVNZ', 
'CMOVNP', 'CMOVNO', 'CMOVNG', 'CMOVL' 170 | , 'FIST', 'FISTP', 'FISTTP', 'FSAVE', 'KMOVB', 'KMOVD', 'KMOVQ', 'KMOVW' 171 | , 'LDDQU', 'LDS', 'LEA', 'LODS', 'LODSB', 'LODSD', 'LODSQ', 'LODSW' 172 | , 'LSS', 'LSL', 'MOV', 'MOVAPD', 'MOVAPS', 'MOVBE', 'MOVD', 'MOVDDUP', 'MOVDQ2Q', 'MOVDQA', 'MOVDQU' 173 | , 'MOVHLPS', 'MOVHPD', 'MOVHPS', 'MOVLHPS', 'MOVLPD', 'MOVLPS', 'MOVQ', 'MOVS', 'MOVSB', 'MOVSD', 'MOVNTQ' 174 | , 'MOVNTDQ', 'MOVMSKPS', 'MOVSQ', 'MOVSS', 'MOVSW', 'MOVSX', 'MOVSXD', 'MOVUPD', 'MOVUPS', 'MOVZX', 175 | 'PMOVMSKB' 176 | , 'PMOVSX', 'PMOVZX', 'PUSH', 'PUSHA', 'PUSHAD', 'PUSHF', 'STOS', 'STOSB', 'STOSD', 'STOSQ', 'STOSW' 177 | , 'VBROADCAST', 'VEXPANDPD', 'VEXPANDPS', 'VMOVDQA32', 'VMOVDQA64', 'VMOVDQU16', 'VMOVDQU32', 'VMOVDQU64', 178 | 'VMOVDQU8' 179 | , 'VPBROADCAST', 'VPBROADCASTB', 'VPEXPANDD', 'VPEXPANDQ', 'movb', 'movq'] 180 | arm_mnemonics = ['MOV', 'MVN', 'MOVT', 'MRA', 'MAR', 'LDR', 'STR', 'PLD', 'PLI', 'PLDW', 'LDM', 'LDREX', 181 | 'LDREXD', 'STM', 'STREX', 'STREXD'] 182 | mips_mnemonics = ['LB', 'LBE', 'LBU', 'LBUE', 'LD', 'LDE', 'LDU', 'LDUE', 'LDC1', 'LDC2' 183 | , 'LDL', 'LDPC', 'LDR', 'LDXC1', 'LH', 'LHE', 'LHU', 'LHUE', 'LL' 184 | , 'LLD', 'LLE', 'LLDP', 'LLWP', 'LLWPE', 'LSA', 'LUXC1', 'LW' 185 | , 'LWC1', 'LWC2', 'LWL', 'LWLE', 'LWPC' 186 | , 'LWR', 'LWRE', 'LWU', 'MOV', 'SB', 'SBE', 'SC' 187 | , 'SCD', 'SCDP', 'SCE', 'SCWP', 'SCWPE' 188 | , 'SD', 'SDBBP', 'SDC1', 'SDC2', 'SDL', 'SDR', 'SDXC1', 'SH', 'SHU', 'SHE' 189 | , 'SW', 'SWE', 'SWC1', 'SWC2', 'SWL', 'SWR', 'SWLE', 'SWRE', 'SWXC1'] 190 | 191 | mips_mnemonics = [s.lower() for s in mips_mnemonics] 192 | arm_mnemonics = [s.lower() for s in arm_mnemonics] 193 | x86_mnemonics = [s.lower() for s in x86_mnemonics] 194 | 195 | count = 0 196 | for i in list(self.instructions): 197 | if self.architecture == 'x86': 198 | if str(i['mnemonic']).lower() in x86_mnemonics: 199 | count = count + 1 200 | elif self.architecture == 'mips': 201 | if str(i['mnemonic']).lower() in mips_mnemonics: 202 | count = count + 1 203 | elif self.architecture == 'arm': 204 | if str(i['mnemonic']).lower() in arm_mnemonics: 205 | count = count + 1 206 | return count 207 | 208 | def extractConstansStrings(self): 209 | constants = 0 210 | strings = 0 211 | for i, ins in enumerate(self.instructions): 212 | if 'opex' not in ins: 213 | continue 214 | for operand in ins['opex']['operands']: 215 | if operand['type'] == 'imm': 216 | if 'disasm' in self.r2_disasm[i] and 'str.' 
in self.r2_disasm[i]['disasm']: 217 | strings += 1 218 | elif operand['value'] in self.string_addr: 219 | strings += 1 220 | else: 221 | constants += 1 222 | 223 | return (constants, strings) 224 | -------------------------------------------------------------------------------- /dataset_creation/DataSplitter.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import json 7 | import random 8 | import sqlite3 9 | from tqdm import tqdm 10 | 11 | 12 | class DataSplitter: 13 | 14 | def __init__(self, db_name): 15 | self.db_name = db_name 16 | 17 | def create_pair_table(self, table_name): 18 | conn = sqlite3.connect(self.db_name) 19 | c = conn.cursor() 20 | c.executescript("DROP TABLE IF EXISTS {} ".format(table_name)) 21 | c.execute("CREATE TABLE {} (id INTEGER PRIMARY KEY, true_pair TEXT, false_pair TEXT)".format(table_name)) 22 | conn.commit() 23 | conn.close() 24 | 25 | def get_ids(self, set_type): 26 | conn = sqlite3.connect(self.db_name) 27 | cur = conn.cursor() 28 | q = cur.execute("SELECT id FROM {}".format(set_type)) 29 | ids = q.fetchall() 30 | conn.close() 31 | return ids 32 | 33 | @staticmethod 34 | def select_similar_cfg(id, provenance, ids, cursor): 35 | q1 = cursor.execute('SELECT id FROM functions WHERE project=? AND file_name=? and function_name=?', provenance) 36 | candidates = [i[0] for i in q1.fetchall() if (i[0] != id and i[0] in ids)] 37 | if len(candidates) == 0: 38 | return None 39 | id_similar = random.choice(candidates) 40 | return id_similar 41 | 42 | @staticmethod 43 | def select_dissimilar_cfg(ids, provenance, cursor): 44 | while True: 45 | id_dissimilar = random.choice(ids) 46 | q2 = cursor.execute('SELECT project, file_name, function_name FROM functions WHERE id=?', (id_dissimilar,)) 47 | res = q2.fetchone() 48 | if res != provenance: 49 | break 50 | return id_dissimilar 51 | 52 | def create_epoch_pairs(self, epoch_number, pairs_table,id_table): 53 | random.seed = epoch_number 54 | 55 | conn = sqlite3.connect(self.db_name) 56 | cur = conn.cursor() 57 | ids = cur.execute("SELECT id FROM "+id_table).fetchall() 58 | ids = [i[0] for i in ids] 59 | id_set = set(ids) 60 | true_pair = [] 61 | false_pair = [] 62 | 63 | for my_id in tqdm(ids): 64 | q = cur.execute('SELECT project, file_name, function_name FROM functions WHERE id =?', (my_id,)) 65 | cfg_0_provenance = q.fetchone() 66 | id_sim = DataSplitter.select_similar_cfg(my_id, cfg_0_provenance, id_set, cur) 67 | id_dissim = DataSplitter.select_dissimilar_cfg(ids, cfg_0_provenance, cur) 68 | if id_sim is not None and id_dissim is not None: 69 | true_pair.append((my_id, id_sim)) 70 | false_pair.append((my_id, id_dissim)) 71 | 72 | true_pair = str(json.dumps(true_pair)) 73 | false_pair = str(json.dumps(false_pair)) 74 | 75 | cur.execute("INSERT INTO {} VALUES (?,?,?)".format(pairs_table), (epoch_number, true_pair, false_pair)) 76 | conn.commit() 77 | conn.close() 78 | 79 | def create_pairs(self, total_epochs): 80 | 81 | self.create_pair_table('train_pairs') 82 | self.create_pair_table('validation_pairs') 83 | self.create_pair_table('test_pairs') 84 | 85 | for i in range(0, total_epochs): 86 | print("Creating training pairs for epoch {} of {}".format(i, total_epochs)) 87 | self.create_epoch_pairs(i, 'train_pairs','train') 88 | 89 | print("Creating validation pairs") 90 | self.create_epoch_pairs(0, 'validation_pairs','validation') 91 | 92 
| print("Creating test pairs") 93 | self.create_epoch_pairs(0, "test_pairs",'test') 94 | 95 | 96 | @staticmethod 97 | def prepare_set(data_to_include, table_name, file_list, cur): 98 | i = 0 99 | while i < data_to_include and len(file_list) > 0: 100 | choice = random.choice(file_list) 101 | file_list.remove(choice) 102 | q = cur.execute("SELECT id FROM functions where project=? AND file_name=?", choice) 103 | data = q.fetchall() 104 | cur.executemany("INSERT INTO {} VALUES (?)".format(table_name), data) 105 | i += len(data) 106 | return file_list, i 107 | 108 | def split_data(self, validation_dim, test_dim): 109 | random.seed = 12345 110 | conn = sqlite3.connect(self.db_name) 111 | c = conn.cursor() 112 | 113 | q = c.execute('''SELECT project, file_name FROM functions ''') 114 | data = q.fetchall() 115 | conn.commit() 116 | 117 | num_data = len(data) 118 | num_test = int(num_data * test_dim) 119 | num_validation = int(num_data * validation_dim) 120 | 121 | filename = list(set(data)) 122 | 123 | c.execute("DROP TABLE IF EXISTS train") 124 | c.execute("DROP TABLE IF EXISTS test") 125 | c.execute("DROP TABLE IF EXISTS validation") 126 | 127 | c.execute("CREATE TABLE IF NOT EXISTS train (id INTEGER PRIMARY KEY)") 128 | c.execute("CREATE TABLE IF NOT EXISTS validation (id INTEGER PRIMARY KEY)") 129 | c.execute("CREATE TABLE IF NOT EXISTS test (id INTEGER PRIMARY KEY)") 130 | 131 | c.execute('''CREATE INDEX IF NOT EXISTS my_index ON functions(project, file_name, function_name)''') 132 | c.execute('''CREATE INDEX IF NOT EXISTS my_index_2 ON functions(project, file_name)''') 133 | 134 | filename, test_num = DataSplitter.prepare_set(num_test, 'test', filename, conn.cursor()) 135 | conn.commit() 136 | assert len(filename) > 0 137 | filename, val_num = self.prepare_set(num_validation, 'validation', filename, conn.cursor()) 138 | conn.commit() 139 | assert len(filename) > 0 140 | _, train_num = self.prepare_set(num_data - num_test - num_validation, 'train', filename, conn.cursor()) 141 | conn.commit() 142 | 143 | print("Train Size: {}".format(train_num)) 144 | print("Validation Size: {}".format(val_num)) 145 | print("Test Size: {}".format(test_num)) 146 | -------------------------------------------------------------------------------- /dataset_creation/DatabaseFactory.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | from FunctionAnalyzerRadare import RadareFunctionAnalyzer 7 | import json 8 | import multiprocessing 9 | from multiprocessing import Pool 10 | from multiprocessing.dummy import Pool as ThreadPool 11 | import os 12 | import random 13 | import signal 14 | import sqlite3 15 | from tqdm import tqdm 16 | from networkx.readwrite import json_graph 17 | 18 | 19 | class DatabaseFactory: 20 | 21 | def __init__(self, db_name, root_path): 22 | self.db_name = db_name 23 | self.root_path = root_path 24 | 25 | @staticmethod 26 | def worker(item): 27 | DatabaseFactory.analyze_file(item) 28 | return 0 29 | 30 | @staticmethod 31 | def extract_function(graph_analyzer): 32 | return graph_analyzer.extractAll() 33 | 34 | @staticmethod 35 | def to_jsongraph(graph): 36 | return json.dumps(json_graph.adjacency_data(graph)) 37 | 38 | @staticmethod 39 | def insert_in_db(db_name, pool_sem, func, filename, function_name): 40 | path = filename.split(os.sep) 41 | if len(path) < 4: 42 | return 43 | pool_sem.acquire() 44 | conn = 
sqlite3.connect(db_name) 45 | cfg = DatabaseFactory.to_jsongraph(func["cfg"]) 46 | cur = conn.cursor() 47 | cur.execute('''INSERT INTO functions VALUES (?,?,?,?,?,?,?)''', (None, # id 48 | path[-4], # project 49 | path[-3], # compiler 50 | path[-2], # optimization 51 | path[-1], # file_name 52 | function_name, # function_name 53 | cfg)) 54 | 55 | inserted_id = cur.lastrowid 56 | acfg = DatabaseFactory.to_jsongraph(func["acfg"]) 57 | lstm_cfg = DatabaseFactory.to_jsongraph(func["lstm_cfg"]) 58 | 59 | cur.execute('''INSERT INTO acfg VALUES (?,?)''', (inserted_id, acfg)) 60 | conn.commit() 61 | cur.execute('''INSERT INTO lstm_cfg VALUES (?,?)''', (inserted_id, lstm_cfg)) 62 | conn.commit() 63 | 64 | conn.close() 65 | pool_sem.release() 66 | 67 | @staticmethod 68 | def analyze_file(item): 69 | global pool_sem 70 | os.setpgrp() 71 | 72 | filename = item[0] 73 | db = item[1] 74 | use_symbol = item[2] 75 | 76 | analyzer = RadareFunctionAnalyzer(filename, use_symbol) 77 | p = ThreadPool(1) 78 | res = p.apply_async(analyzer.analyze) 79 | 80 | try: 81 | result = res.get(120) 82 | except multiprocessing.TimeoutError: 83 | print("Aborting due to timeout:" + str(filename)) 84 | print('Try to modify the timeout value in DatabaseFactory instruction result = res.get(TIMEOUT)') 85 | os.killpg(0, signal.SIGKILL) 86 | except Exception: 87 | print("Aborting due to error:" + str(filename)) 88 | os.killpg(0, signal.SIGKILL) 89 | 90 | for func in result: 91 | DatabaseFactory.insert_in_db(db, pool_sem, result[func], filename, func) 92 | 93 | analyzer.close() 94 | 95 | return 0 96 | 97 | # Create the db where data are stored 98 | def create_db(self): 99 | print('Database creation...') 100 | conn = sqlite3.connect(self.db_name) 101 | conn.execute(''' CREATE TABLE IF NOT EXISTS functions (id INTEGER PRIMARY KEY, 102 | project text, 103 | compiler text, 104 | optimization text, 105 | file_name text, 106 | function_name text, 107 | cfg text)''') 108 | 109 | conn.execute('''CREATE TABLE IF NOT EXISTS acfg (id INTEGER PRIMARY KEY, acfg text)''') 110 | conn.execute('''CREATE TABLE IF NOT EXISTS lstm_cfg (id INTEGER PRIMARY KEY, lstm_cfg text)''') 111 | 112 | conn.commit() 113 | conn.close() 114 | 115 | # Scan the root directory to find all the file to analyze, 116 | # query also the db for already analyzed files. 
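    # scan_for_file below walks the tree recursively and keeps only object files ending
    # in '.o'; the check against already-analyzed files is actually performed afterwards
    # by remove_override, which drops every path whose
    # (project, compiler, optimization, file_name) tuple is already in the functions table.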
117 | def scan_for_file(self, start): 118 | file_list = [] 119 | # Scan recursively all the subdirectory 120 | directories = os.listdir(start) 121 | for item in directories: 122 | item = os.path.join(start,item) 123 | if os.path.isdir(item): 124 | file_list.extend(self.scan_for_file(item + os.sep)) 125 | elif os.path.isfile(item) and item.endswith('.o'): 126 | file_list.append(item) 127 | return file_list 128 | 129 | # Looks for already existing files in the database 130 | # It returns a list of files that are not in the database 131 | def remove_override(self, file_list): 132 | conn = sqlite3.connect(self.db_name) 133 | cur = conn.cursor() 134 | q = cur.execute('''SELECT project, compiler, optimization, file_name FROM functions''') 135 | names = q.fetchall() 136 | names = [os.path.join(self.root_path, n[0], n[1], n[2], n[3]) for n in names] 137 | names = set(names) 138 | # If some files is already in the db remove it from the file list 139 | if len(names) > 0: 140 | print(str(len(names)) + ' Already in the database') 141 | cleaned_file_list = [] 142 | for f in file_list: 143 | if not(f in names): 144 | cleaned_file_list.append(f) 145 | 146 | return cleaned_file_list 147 | 148 | # root function to create the db 149 | def build_db(self, use_symbol): 150 | global pool_sem 151 | 152 | pool_sem = multiprocessing.BoundedSemaphore(value=1) 153 | 154 | self.create_db() 155 | file_list = self.scan_for_file(self.root_path) 156 | 157 | print('Found ' + str(len(file_list)) + ' during the scan') 158 | file_list = self.remove_override(file_list) 159 | print('Find ' + str(len(file_list)) + ' files to analyze') 160 | random.shuffle(file_list) 161 | 162 | t_args = [(f, self.db_name, use_symbol) for f in file_list] 163 | 164 | # Start a parallel pool to analyze files 165 | p = Pool(processes=None, maxtasksperchild=20) 166 | for _ in tqdm(p.imap_unordered(DatabaseFactory.worker, t_args), total=len(file_list)): 167 | pass 168 | 169 | p.close() 170 | p.join() 171 | 172 | 173 | -------------------------------------------------------------------------------- /dataset_creation/ExperimentUtil.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import argparse 7 | from dataset_creation import DatabaseFactory, DataSplitter 8 | 9 | def debug_msg(): 10 | msg = " DATABASE UTILITY" 11 | msg += "-------------------------------------------------\n" 12 | msg += "This program is an utility to save data into an sqlite database with SAFE \n\n" 13 | msg += "There are three main command: \n" 14 | msg += "BUILD: It create a db with two tables: functions, filtered_functions. \n" 15 | msg += " In the first table there are all the functions extracted from the executable with their hex code.\n" 16 | msg += " In the second table functions are converted to i2v representation. \n" 17 | msg += "SPLIT: Data are splitted into train validation and test set. 
" \ 18 | " Then it generate the pairs for the training of the network.\n" 19 | msg += "EMBEDD: Generate the embeddings of each function in the database using a trained SAFE model\n\n" 20 | msg += "If you want to train the network use build + split" 21 | msg += "If you want to create a knowledge base for the binary code search engine use build + embedd" 22 | msg += "This program has been written by the SAFE team.\n" 23 | msg += "-------------------------------------------------" 24 | return msg 25 | 26 | 27 | def build_configuration(db_name, root_dir, use_symbols): 28 | msg = "Database creation options: \n" 29 | msg += " - Database Name: {} \n".format(db_name) 30 | msg += " - Root dir: {} \n".format(root_dir) 31 | msg += " - Use symbols: {} \n".format(use_symbols) 32 | return msg 33 | 34 | 35 | def split_configuration(db_name, val_split, test_split, epochs): 36 | msg = "Splitting options: \n" 37 | msg += " - Database Name: {} \n".format(db_name) 38 | msg += " - Validation Size: {} \n".format(val_split) 39 | msg += " - Test Size: {} \n".format(test_split) 40 | msg += " - Epochs: {} \n".format(epochs) 41 | return msg 42 | 43 | 44 | if __name__ == '__main__': 45 | 46 | parser = argparse.ArgumentParser(description=debug_msg) 47 | 48 | parser.add_argument("-db", "--db", help="Name of the database to create", required=True) 49 | 50 | parser.add_argument("-b", "--build", help="Build db disassebling executables", action="store_true") 51 | parser.add_argument("-s", "--split", help="Perform data splitting for training", action="store_true") 52 | 53 | parser.add_argument("-dir", "--dir", help="Root path of the directory to scan") 54 | parser.add_argument("-sym", "--symbols", help="Use it if you want to use symbols", action="store_true") 55 | 56 | parser.add_argument("-test", "--test_size", help="Test set size [0-1]", type=float, default=0.2) 57 | parser.add_argument("-val", "--val_size", help="Validation set size [0-1]", type=float, default=0.2) 58 | parser.add_argument("-epo", "--epochs", help="# Epochs to generate pairs for", type=int, default=25) 59 | 60 | try: 61 | args = parser.parse_args() 62 | except: 63 | parser.print_help() 64 | print(debug_msg()) 65 | exit(0) 66 | 67 | if args.build: 68 | print("Disassemblying files and creating dataset") 69 | print(build_configuration(args.db, args.dir, args.symbols)) 70 | factory = DatabaseFactory.DatabaseFactory(args.db, args.dir) 71 | factory.build_db(args.symbols) 72 | 73 | if args.split: 74 | print("Splitting data and generating epoch pairs") 75 | print(split_configuration(args.db, args.val_size, args.test_size, args.epochs)) 76 | splitter = DataSplitter.DataSplitter(args.db) 77 | splitter.split_data(args.val_size, args.test_size) 78 | splitter.create_pairs(args.epochs) 79 | 80 | exit(0) 81 | -------------------------------------------------------------------------------- /dataset_creation/FunctionAnalyzerRadare.py: -------------------------------------------------------------------------------- 1 | # SAFE TEAM 2 | # 3 | # 4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) # 5 | # 6 | import json 7 | import r2pipe 8 | import networkx as nx 9 | from dataset_creation.BlockFeaturesExtractor import BlockFeaturesExtractor 10 | 11 | 12 | class Dict2Obj(object): 13 | """ 14 | Turns a dictionary into a class 15 | """ 16 | 17 | # ---------------------------------------------------------------------- 18 | def __init__(self, dictionary): 19 | """Constructor""" 20 | for key in dictionary: 21 | 
--------------------------------------------------------------------------------
/dataset_creation/FunctionAnalyzerRadare.py:
--------------------------------------------------------------------------------
1 | # SAFE TEAM
2 | #
3 | #
4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) #
5 | #
6 | import json
7 | import r2pipe
8 | import networkx as nx
9 | from dataset_creation.BlockFeaturesExtractor import BlockFeaturesExtractor
10 | 
11 | 
12 | class Dict2Obj(object):
13 |     """
14 |     Turns a dictionary into an object whose attributes are the dictionary keys
15 |     """
16 | 
17 |     # ----------------------------------------------------------------------
18 |     def __init__(self, dictionary):
19 |         """Constructor"""
20 |         for key in dictionary:
21 |             setattr(self, key, dictionary[key])
22 | 
23 | class RadareFunctionAnalyzer:
24 | 
25 |     def __init__(self, filename, use_symbol):
26 |         self.r2 = r2pipe.open(filename, flags=['-2'])
27 |         self.filename = filename
28 |         self.arch, _ = self.get_arch()
29 |         self.use_symbol = use_symbol
30 | 
31 |     def __enter__(self):
32 |         return self
33 | 
34 |     @staticmethod
35 |     def filter_reg(op):
36 |         return op["value"]
37 | 
38 |     @staticmethod
39 |     def filter_imm(op):
40 |         imm = int(op["value"])
41 |         if -5000 <= imm <= 5000:
42 |             ret = hex(imm)
43 |         else:
44 |             ret = 'HIMM'
45 |         return ret
46 | 
47 |     @staticmethod
48 |     def filter_mem(op):
49 |         if "base" not in op:
50 |             op["base"] = 0
51 | 
52 |         if op["base"] == 0:
53 |             r = "[" + "MEM" + "]"
54 |         else:
55 |             reg_base = str(op["base"])
56 |             disp = str(op["disp"])
57 |             scale = str(op["scale"])
58 |             r = '[' + reg_base + "*" + scale + "+" + disp + ']'
59 |         return r
60 | 
61 |     @staticmethod
62 |     def filter_memory_references(i):
63 |         inst = "" + i["mnemonic"]
64 | 
65 |         for op in i["opex"]["operands"]:
66 |             if op["type"] == 'reg':
67 |                 inst += " " + RadareFunctionAnalyzer.filter_reg(op)
68 |             elif op["type"] == 'imm':
69 |                 inst += " " + RadareFunctionAnalyzer.filter_imm(op)
70 |             elif op["type"] == 'mem':
71 |                 inst += " " + RadareFunctionAnalyzer.filter_mem(op)
72 |             if len(i["opex"]["operands"]) > 1:
73 |                 inst = inst + ","
74 | 
75 |         if "," in inst:
76 |             inst = inst[:-1]
77 |         inst = inst.replace(" ", "_")
78 | 
79 |         return str(inst)
80 | 
81 |     @staticmethod
82 |     def get_callref(my_function, depth):
83 |         calls = {}
84 |         if 'callrefs' in my_function and depth > 0:
85 |             for cc in my_function['callrefs']:
86 |                 if cc["type"] == "C":
87 |                     calls[cc['at']] = cc['addr']
88 |         return calls
89 | 
90 | 
91 |     def process_instructions(self, instructions):
92 |         filtered_instructions = []
93 |         for insn in instructions:
94 |             #operands = []
95 |             if 'opex' not in insn:
96 |                 continue
97 |             #for op in insn['opex']['operands']:
98 |             #    operands.append(Dict2Obj(op))
99 |             #insn['operands'] = operands
100 |             stringized = RadareFunctionAnalyzer.filter_memory_references(insn)
101 |             if "x86" in self.arch:
102 |                 stringized = "X_" + stringized
103 |             elif "arm" in self.arch:
104 |                 stringized = "A_" + stringized
105 |             else:
106 |                 stringized = "UNK_" + stringized
107 |             filtered_instructions.append(stringized)
108 |         return filtered_instructions
109 | 
110 |     def process_block(self, block):
111 |         block_bytes = ""
112 |         disasm = []
113 |         for op in block['ops']:
114 |             if 'disasm' in op:
115 |                 disasm.append(op['disasm'])
116 |                 block_bytes += str(op['bytes'])
117 | 
118 |         self.r2.cmd("s " + str(block['offset']))
119 |         instructions = json.loads(self.r2.cmd("aoj " + str(len(block['ops']))))
120 |         string_addresses = [s['vaddr'] for s in json.loads(self.r2.cmd("izzj"))]
121 |         bfe = BlockFeaturesExtractor(self.arch, instructions, block['ops'], string_addresses)
122 |         annotations = bfe.getFeatures()
123 |         filtered_instructions = self.process_instructions(instructions)
124 | 
125 |         return disasm, block_bytes, annotations, filtered_instructions
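To make the instruction normalization above concrete, here is a small, self-contained illustration. The operand dictionary is fabricated to mimic the shape of radare2's aoj output; it is not data taken from the repository:

from dataset_creation.FunctionAnalyzerRadare import RadareFunctionAnalyzer

# A made-up x86 "mov rax, 7" in the dictionary shape the static filters expect.
sample = {
    "mnemonic": "mov",
    "opex": {"operands": [
        {"type": "reg", "value": "rax"},
        {"type": "imm", "value": 7},
    ]},
}

print(RadareFunctionAnalyzer.filter_memory_references(sample))
# -> mov_rax,_0x7
# Registers are kept, small immediates are rendered as hex, large ones become HIMM, and
# memory operands collapse to [MEM] or [base*scale+disp]; process_instructions then adds
# the X_/A_/UNK_ architecture prefix.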
126 | 
127 |     def function_to_cfg(self, func):
128 |         if self.use_symbol:
129 |             s = 'vaddr'
130 |         else:
131 |             s = 'offset'
132 | 
133 |         self.r2.cmd('s ' + str(func[s]))
134 |         try:
135 |             cfg = json.loads(self.r2.cmd('agfj ' + str(func[s])))
136 |         except Exception:
137 |             cfg = []
138 | 
139 |         my_cfg = nx.DiGraph()
140 |         acfg = nx.DiGraph()
141 |         lstm_cfg = nx.DiGraph()
142 | 
143 |         if len(cfg) == 0:
144 |             return my_cfg, acfg, lstm_cfg
145 |         else:
146 |             cfg = cfg[0]
147 | 
148 |         for block in cfg['blocks']:
149 |             disasm, block_bytes, annotations, filtered_instructions = self.process_block(block)
150 |             my_cfg.add_node(block['offset'], asm=block_bytes, label=disasm)
151 |             acfg.add_node(block['offset'], features=annotations)
152 |             lstm_cfg.add_node(block['offset'], features=filtered_instructions)
153 | 
154 |         for block in cfg['blocks']:
155 |             if 'jump' in block:
156 |                 if block['jump'] in my_cfg.nodes:
157 |                     my_cfg.add_edge(block['offset'], block['jump'])
158 |                     acfg.add_edge(block['offset'], block['jump'])
159 |                     lstm_cfg.add_edge(block['offset'], block['jump'])
160 |             if 'fail' in block:
161 |                 if block['fail'] in my_cfg.nodes:
162 |                     my_cfg.add_edge(block['offset'], block['fail'])
163 |                     acfg.add_edge(block['offset'], block['fail'])
164 |                     lstm_cfg.add_edge(block['offset'], block['fail'])
165 | 
166 |         between = nx.betweenness_centrality(acfg)
167 |         for n in acfg.nodes(data=True):
168 |             d = n[1]['features']
169 |             d['offspring'] = len(nx.descendants(acfg, n[0]))
170 |             d['betweenness'] = between[n[0]]
171 |             n[1]['features'] = d
172 | 
173 |         return my_cfg, acfg, lstm_cfg
174 | 
175 |     def get_arch(self):
176 |         arch = None
177 |         bits = None
178 |         try:
179 |             info = json.loads(self.r2.cmd('ij'))
180 |             if 'bin' in info:
181 |                 arch = info['bin']['arch']
182 |                 bits = info['bin']['bits']
183 |         except Exception:
184 |             print("Error loading file")
185 |         return arch, bits
186 | 
187 |     def find_functions(self):
188 |         self.r2.cmd('aaa')
189 |         try:
190 |             function_list = json.loads(self.r2.cmd('aflj'))
191 |         except Exception:
192 |             function_list = []
193 |         return function_list
194 | 
195 |     def find_functions_by_symbols(self):
196 |         self.r2.cmd('aa')
197 |         try:
198 |             symbols = json.loads(self.r2.cmd('isj'))
199 |             fcn_symb = [s for s in symbols if s['type'] == 'FUNC']
200 |         except Exception:
201 |             fcn_symb = []
202 |         return fcn_symb
203 | 
204 |     def analyze(self):
205 |         if self.use_symbol:
206 |             function_list = self.find_functions_by_symbols()
207 |         else:
208 |             function_list = self.find_functions()
209 | 
210 |         result = {}
211 |         for my_function in function_list:
212 |             if self.use_symbol:
213 |                 address = my_function['vaddr']
214 |             else:
215 |                 address = my_function['offset']
216 | 
217 |             try:
218 |                 cfg, acfg, lstm_cfg = self.function_to_cfg(my_function)
219 |                 result[my_function['name']] = {'cfg': cfg, "acfg": acfg, "lstm_cfg": lstm_cfg, "address": address}
220 |             except Exception:
221 |                 print("Error in function: {} from {}".format(my_function['name'], self.filename))
222 |                 pass
223 |         return result
224 | 
225 |     def close(self):
226 |         self.r2.quit()
227 | 
228 |     def __exit__(self, exc_type, exc_value, traceback):
229 |         self.r2.quit()
230 | 
231 | 
232 | 
233 | 
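A minimal usage sketch for the analyzer above. The target path is purely illustrative, and radare2 must be installed for r2pipe to work:

from dataset_creation.FunctionAnalyzerRadare import RadareFunctionAnalyzer

# Open a binary, lift every function to its three graph views, then release the r2 session.
with RadareFunctionAnalyzer("/usr/bin/true", use_symbol=False) as analyzer:
    functions = analyzer.analyze()
    for name, data in functions.items():
        # data holds the plain cfg, the annotated acfg and the instruction-level lstm_cfg.
        print(name, hex(data['address']), data['acfg'].number_of_nodes(), 'blocks')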
--------------------------------------------------------------------------------
/dataset_creation/__init__.py:
--------------------------------------------------------------------------------
1 | # SAFE TEAM
2 | #
3 | #
4 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.txt) #
5 | #
6 | 
7 | 
--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
1 | # SAFE TEAM
2 | # distributed under license: CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode)
3 | 
4 | import argparse
5 | import os
6 | import sys
7 | from subprocess import call
8 | 
9 | class Downloader:
10 | 
11 |     def __init__(self):
12 |         parser = argparse.ArgumentParser(description='SAFE downloader')
13 | 
14 |         parser.add_argument("-i2v", "--i2v", dest="i2v", help="Download the i2v dictionary and embedding matrix",
15 |                             action="store_true",
16 |                             required=False)
17 | 
18 |         parser.add_argument("-op", "--openSSL", dest="openSSL",
19 |                             help="Download the OpenSSL dataset",
20 |                             action="store_true",
21 |                             required=False)
22 | 
23 |         parser.add_argument("-rc", "--restricted_compiler", dest="restricted_compiler",
24 |                             help="Download the restricted compiler dataset",
25 |                             action="store_true",
26 |                             required=False)
27 | 
28 |         parser.add_argument("-c", "--compiler", dest="compiler",
29 |                             help="Download the compiler dataset. Be careful, it is very large (about 30 GB).",
30 |                             action="store_true",
31 |                             required=False)
32 | 
33 |         args = parser.parse_args()
34 | 
35 |         self.i2v = args.i2v
36 |         self.openSSL = args.openSSL
37 |         self.restricted_compiler = args.restricted_compiler
38 |         self.compiler = args.compiler
39 | 
40 |         if not (self.i2v or self.openSSL or self.restricted_compiler or self.compiler):
41 |             parser.print_help(sys.__stdout__)
42 | 
43 |         self.url_i2v = "https://drive.google.com/file/d/1ndKVrot5lBPklGGFn-olEt-rCtzjv69z/view?usp=sharing"
44 |         self.url_openSSL = "https://drive.google.com/file/d/1NnC4qCtZUDdb32Yfeq2toa94jvCKTBxZ/view?usp=sharing"
45 |         self.url_restricted_compiler = "https://drive.google.com/file/d/15VUJ3iwj5VHCqAXiUcr4zJgVWSCbaU_d/view?usp=sharing"
46 |         self.url_compiler = "https://drive.google.com/file/d/1fEr9N97fTsAS2NXYpYI3GRTxadaJwhTe/view?usp=sharing"
47 | 
48 |         self.base_path = "data"
49 |         self.path_i2v = os.path.join(self.base_path, "")
50 |         self.path_openSSL = os.path.join(self.base_path, "")
51 |         self.path_restricted_compiler = os.path.join(self.base_path, "")
52 |         self.path_compiler = os.path.join(self.base_path, "")
53 | 
54 |         self.i2v_compress_name = 'i2v.tar.bz2'
55 |         self.openSSL_compress_name = 'openSSL_dataset.tar.bz2'
56 |         self.restricted_compiler_compress_name = 'restricted_compiler_dataset.tar.bz2'
57 |         self.compiler_compress_name = 'compiler_dataset.bz2'
58 | 
59 | 
60 |     @staticmethod
61 |     def download_file(id, path):
62 |         try:
63 |             print("Downloading from " + str(id) + " into " + str(path))
64 |             call(['./godown.pl', id, path])
65 |         except Exception as e:
66 |             print("Error downloading file at url: " + str(id))
67 |             print(e)
68 | 
69 |     @staticmethod
70 |     def decompress_file(file_src, file_path):
71 |         try:
72 |             call(['tar', '-xvf', file_src, '-C', file_path])
73 |         except Exception as e:
74 |             print("Error decompressing file: " + str(file_src))
75 |             print('You need the tar command with bzip2 support')
76 |             print(e)
77 | 
78 |     def download(self):
79 |         print('Making the godown.pl script executable, thanks to: https://github.com/circulosmeos/gdown.pl')
80 |         call(['chmod', '+x', 'godown.pl'])
81 |         print("SAFE --- downloading models")
82 | 
83 |         if self.i2v:
84 |             print("Downloading the i2v model... in the folder data/i2v/")
85 |             if not os.path.exists(self.path_i2v):
86 |                 os.makedirs(self.path_i2v)
87 |             Downloader.download_file(self.url_i2v, os.path.join(self.path_i2v, self.i2v_compress_name))
88 |             print("Decompressing i2v model and placing it in " + str(self.path_i2v))
89 |             Downloader.decompress_file(os.path.join(self.path_i2v, self.i2v_compress_name), self.path_i2v)
90 | 
91 |         if self.openSSL:
92 |             print("Downloading the OpenSSL dataset... in the folder data")
93 |             if not os.path.exists(self.path_openSSL):
94 |                 os.makedirs(self.path_openSSL)
95 |             Downloader.download_file(self.url_openSSL, os.path.join(self.path_openSSL, self.openSSL_compress_name))
96 |             print("Decompressing OpenSSL dataset and placing it in " + str(self.path_openSSL))
97 |             Downloader.decompress_file(os.path.join(self.path_openSSL, self.openSSL_compress_name), self.path_openSSL)
98 | 
99 |         if self.restricted_compiler:
100 |             print("Downloading the restricted compiler dataset... in the folder data")
101 |             if not os.path.exists(self.path_restricted_compiler):
102 |                 os.makedirs(self.path_restricted_compiler)
103 |             Downloader.download_file(self.url_restricted_compiler, os.path.join(self.path_restricted_compiler, self.restricted_compiler_compress_name))
104 |             print("Decompressing restricted compiler dataset and placing it in " + str(self.path_restricted_compiler))
105 |             Downloader.decompress_file(os.path.join(self.path_restricted_compiler, self.restricted_compiler_compress_name), self.path_restricted_compiler)
106 | 
107 |         if self.compiler:
108 |             print("Downloading the compiler dataset... in the folder data")
109 |             if not os.path.exists(self.path_compiler):
110 |                 os.makedirs(self.path_compiler)
111 |             Downloader.download_file(self.url_compiler, os.path.join(self.path_compiler, self.compiler_compress_name))
112 |             print("Decompressing compiler dataset and placing it in " + str(self.path_compiler))
113 |             Downloader.decompress_file(os.path.join(self.path_compiler, self.compiler_compress_name), self.path_compiler)
114 | 
115 | 
116 | if __name__ == '__main__':
117 |     a = Downloader()
118 |     a.download()
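The downloader is meant to be driven from the command line (for example "python downloader.py --i2v"); since the flags are parsed inside the constructor, an equivalent programmatic call is just a matter of setting sys.argv first. A minimal sketch, assuming the working directory is the repository root so that ./godown.pl is found:

import sys

# Simulate "python downloader.py --i2v"; Downloader() reads the flags from sys.argv.
sys.argv = ["downloader.py", "--i2v"]
d = Downloader()
d.download()   # fetches i2v.tar.bz2 through godown.pl and unpacks it under data/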
--------------------------------------------------------------------------------
/godown.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # Google Drive direct download of big files
4 | # ./gdown.pl 'gdrive file url' ['desired file name']
5 | #
6 | # v1.0 by circulosmeos 04-2014.
7 | # v1.1 by circulosmeos 01-2017.
8 | # http://circulosmeos.wordpress.com/2014/04/12/google-drive-direct-download-of-big-files
9 | # Distributed under GPL 3 (http://www.gnu.org/licenses/gpl-3.0.html)
10 | #
11 | use strict;
12 | 
13 | my $TEMP='gdown.cookie.temp';
14 | my $COMMAND;
15 | my $confirm;
16 | my $check;
17 | sub execute_command();
18 | 
19 | my $URL=shift;
20 | die "\n./gdown.pl 'gdrive file url' [desired file name]\n\n" if $URL eq '';
21 | 
22 | my $FILENAME=shift;
23 | $FILENAME='gdown' if $FILENAME eq '';
24 | 
25 | if ($URL=~m#^https?://drive.google.com/file/d/([^/]+)#) {
26 |     $URL="https://docs.google.com/uc?id=$1&export=download";
27 | }
28 | 
29 | execute_command();
30 | 
31 | while (-s $FILENAME < 100000) { # loop until the real file (not an HTML page) has been downloaded
32 |     open fFILENAME, '<', $FILENAME;
33 |     $check=0;
34 |     foreach (<fFILENAME>) {
35 |         if (/href="(\/uc\?export=download[^"]+)/) {
36 |             $URL='https://docs.google.com'.$1;
37 |             $URL=~s/&amp;/&/g;
38 |             $confirm='';
39 |             $check=1;
40 |             last;
41 |         }
42 |         if (/confirm=([^;&]+)/) {
43 |             $confirm=$1;
44 |             $check=1;
45 |             last;
46 |         }
47 |         if (/"downloadUrl":"([^"]+)/) {
48 |             $URL=$1;
49 |             $URL=~s/\\u003d/=/g;
50 |             $URL=~s/\\u0026/&/g;
51 |             $confirm='';
52 |             $check=1;
53 |             last;
54 |         }
55 |     }
56 |     close fFILENAME;
57 |     die "Couldn't download the file :-(\n" if ($check==0);
58 |     $URL=~s/confirm=([^;&]+)/confirm=$confirm/ if $confirm ne '';
59 | 
60 |     execute_command();
61 | }
62 | 
63 | unlink $TEMP;
64 | 
65 | sub execute_command() {
66 |     $COMMAND="wget --no-check-certificate --load-cookie $TEMP --save-cookie $TEMP \"$URL\"";
67 |     $COMMAND.=" -O \"$FILENAME\"" if $FILENAME ne '';
68 |     `$COMMAND`;
69 |     return 1;
70 | }
71 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | tensorflow
3 | networkx
4 | tqdm
5 | matplotlib
6 | sklearn
7 | r2pipe
8 | 
--------------------------------------------------------------------------------