├── LICENSE.txt ├── README.md ├── regex.txt └── src └── main └── matlab ├── ClassificationModel.m ├── LogitLinearModel.m ├── cv_part.m ├── cv_virusclass.m ├── gen_graft_data.m ├── line_fewer_markers.m ├── make_fig1.m ├── make_fig2.m ├── make_fig3.m ├── parseArgs.m ├── read_data.m ├── rotateticklabel.m ├── run_pipeline.m ├── setoptions.m └── subaxis.m /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2012 Konstantin Berlin 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Malicious Behavior Detection using Windows Audit Logs 2 | 3 | This is the GitHub page for the AI-Sec 2015 publication "Malicious Behavior Detection using Windows Audit Logs" by Konstantin Berlin, David Slater, and Joshua Saxe. This work will be presented on __Friday October 16, 2015 at the 2015 ACM Workshop on Artificial Intelligence and Security__, which is co-located with CCS. 4 | 5 | The free pre-print of the publication can be found at http://arxiv.org/abs/1506.04200. 6 | 7 | __If you have any questions or issues, or something is not clear about our data or scripts, please report them, so we can fix them as soon as possible.__ 8 | 9 | ### Synopsis 10 | 11 | We investigated the utility of agentless detection of malicious endpoint behavior, using only the standard built-in Windows audit logging facility as our signal.
We found that Windows audit logs, while emitting manageably sized data streams on the endpoints, provide enough information to allow robust detection of malicious behavior. Audit logs provide an effective, low-cost alternative to deploying additional expensive agent-based breach detection systems in many government and industrial settings, and can be used to detect, in our tests, 83% of malware samples with a 0.1% false positive rate. They can also supplement already existing host signature-based antivirus solutions, like Kaspersky, Symantec, and McAfee, detecting, in our testing environment, 78% of malware missed by those antivirus systems. 12 | 13 | ## Data and Anonymization 14 | 15 | The anonymized version of the data that we used to compute our results can be found [here](https://www.dropbox.com/s/y6zbdgh3t9rl2cd/learn_final_r1.tar.gz?dl=0). The data has been anonymized in order to protect the privacy of the users from whom it was collected. 16 | 17 | The goal of the anonymization was not only to protect privacy, but also to allow the security community to supplement and reuse our data for their own needs. Therefore, part of our data has been anonymized such that if a new set of audit logs is created by someone, it can be added to our dataset without duplicating feature names. 18 | 19 | The following is the description of the anonymization steps: 20 | 21 | 1. Transform all the entries using the regex transformations. 22 | 2. Observe all the paths (including directory and name) and the subpaths for files, process names, and registry entries, for all the sandbox-derived Windows audit logs. Put them in a bag of public paths called __P__. 23 | 3. Encrypt the file/registry/process names in pieces. 24 | * Do not encrypt any logs from the sandbox runs. 25 | * For each audit log entry in the enterprise data, see if all or part of its path is in __P__.
Encrypt each directory/registry using its name if the full path of the directory/registry is not in __P__; otherwise leave it unencrypted. The encrypted name is the text `sha1_` followed by the sha1 of the directory/registry name. For files we leave the extension exposed, and only hash the name. Ex. `[windows]\system32\fake_dir\fake.dll` will be encrypted as `[windows]\system32\sha1_\sha1_.dll`. 26 | * For sensitive file types, like documents, slides, text, etc., we salt the filenames before hashing. 27 | 28 | ### Regex Transformations 29 | 30 | The regex transformations that we use to generate our feature labels are located in the [regex file](regex.txt). The regular expressions must be executed in the order listed to reproduce our results. 31 | 32 | ### Data Content 33 | 34 | The following is the data that we used for our analysis. The file format is specified in Section [File Formats](#ff). 35 | 36 | #### Root directory 37 | 38 | The root directory contains the following: 39 | 40 | * `inter` - directory containing the raw and intermediate file representation of the audit logs (see below) 41 | * `pace_classification.txt` - label classification scores for the cuckoo box data (<0 means unknown label, 0-1 are VirusTotal scores, >1 means malware due to original source of file) 42 | * `pace_column_labels_anon.txt` - string names of the features 43 | * `pace_row_labels_anon.txt` - list of sha1 of the binary files that were run through CuckooBox. They are in the same order as `pace_classification.txt`, and their row number directly maps into `pace_feature_matrix.txt` row numbers 44 | * `pace_created_labels.txt` - time from epoch when the file was created based on the compile time stamp.
If not detected, the file creation time stamp is used 45 | * `pace_feature_matrix.txt` - the feature matrix in the format described at the bottom 46 | * `pace_malware_kaspersky_labels.txt` - Kaspersky labels for the CuckooBox data 47 | * `pace_malware_mcafee_labels.txt` - McAfee labels for the CuckooBox data 48 | * `pace_malware_symantec_labels.txt` - Symantec labels for the CuckooBox data 49 | 50 | These are the same types of files as described above, but for our enterprise dataset (3 users) and splunk dataset (1 user): 51 | * `pace_enterprise_feature_matrix.txt` 52 | * `pace_enterprise_row_labels_anon.txt` 53 | * `pace_splunk_feature_matrix.txt` 54 | * `pace_splunk_row_labels_anon.txt` 55 | 56 | #### The `inter` Directory 57 | 58 | The `inter` directory contains the intermediate files that we used to form our n-grams. Their names match the *\_row_labels.txt content. 59 | 60 | * `*.log` - the JSON-formatted intermediate file format. This was the file that we used to create n-grams, and has abstractions of paths and file deletions. The ignored entries, as given by the JSON entry `ignored`, are removed before forming the n-grams. 61 | 62 | ##### File Formats 63 | The matrix format written in text is as follows: 64 | * First line: `<#number of rows> <#number of columns> <#number of non-zero elements in the matrix>` 65 | * This is followed by a list of non-zero entries: ` ` 66 | 67 | ## Build 68 | 69 | In order to reproduce the plots from the paper using our MATLAB scripts you will need MATLAB 2014 or higher, with the Statistics and Machine Learning Toolbox. In addition, you will need the [Glmnet](http://web.stanford.edu/~hastie/glmnet_matlab/) MATLAB package in your MATLAB path.
70 | 71 | To load the data from disk into MATLAB, type: 72 | 73 | ``` 74 | [A, y, names, virus_kasp, virus_mcafee, virus_symantec, column_labels, t_created, cuckoo_idx, splunk_idx] = read_data(); 75 | ``` 76 | To create Figure 1 from the manuscript, type: 77 | 78 | `make_fig1(A, y, column_labels, virus_kasp, cuckoo_idx, t_created);` 79 | 80 | To create Figure 2 from the manuscript, type: 81 | 82 | `make_fig2(A, y, cuckoo_idx, splunk_idx, t_created, virus_kasp);` 83 | 84 | To create Figure 3 from the manuscript, type: 85 | 86 | `make_fig3(A, y, cuckoo_idx, splunk_idx, virus_kasp, virus_mcafee, virus_symantec, t_created);` 87 | 88 | ## Copyright and License 89 | 90 | Code, documentation, and data copyright 2014-2015 Invincea Labs, LLC. Release is governed by [Apache 2.0](LICENSE.txt) license. 91 | 92 | -------------------------------------------------------------------------------- /regex.txt: -------------------------------------------------------------------------------- 1 | ALL WINDOWS_FILE ^(?:[abe-z]:) c: 2 | ALL ALL s-1-5-[0-9]{1,2}(?:(?:-[0-9]{10}){3}-[0-9]{3,4})? 3 | ALL ALL [{]?[a-f0-9]{8}-(?:[a-f0-9]{4}-){3}[a-f0-9]{12}[}]? 
4 | ALL ALL c:\\users\\[^\\]*\\ c:\\users\\\\ 5 | ALL ALL kb[0-9]{6,7} 6 | ALL WINDOWS_FILE c:\\_\d{6}_ [msi installer] 7 | CUCKOO WINDOWS_FILE c:\\[a-z]{5,10}\\dll\\[a-z]{6}\.dll$ [cuckoo] 8 | CUCKOO WINDOWS_FILE c:\\[a-z]{5,10}\\bin\\execsc\.[a-z]{3}$ [cuckoo] 9 | ALL ALL c:\\users\\\\appdata\\local\\microsoft\\windows\\wer\\reportqueue\\.*$ [windows error reporting report queue] 10 | ALL WINDOWS_FILE ^(?:c\:\\\$recycle\.bin)($|\\) [recycle bin]$1 11 | ALL WINDOWS_FILE ^(?:c\:\\python27)($|\\) [python]$1 12 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\internet\ explorer)($|\\) [internet explorer x86]$1 13 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\internet\ explorer)($|\\) [internet explorer]$1 14 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\microsoft\ office)($|\\) [microsoft office]$1 15 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\mcafee)($|\\) [mcafee]$1 16 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\adobe\ reader)($|\\) [adobe reader]$1 17 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\microsoft\ silverlight)($|\\) [silverlight]$1 18 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\java)($|\\) [java]$1 19 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\adobe)($|\\) [adobe]$1 20 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\adobe)($|\\) [adobe]$1 21 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\mozilla\ firefox)($|\\) [firefox]$1 22 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\google\\chrome)($|\\) [chrome]$1 23 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\google\\chrome)($|\\) [chrome]$1 24 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\microsoft\ office)($|\\) [office]$1 25 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\quicktime)($|\\) [quicktime]$1 26 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\quicktime)($|\\) [quicktime]$1 27 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\windows\ media\ player)($|\\) [windows media player]$1 28 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\\common\ files)($|\\) [program files common]$1 29 | ALL 
WINDOWS_FILE ^(?:c\:\\program\ files)($|\\) [program files]$1 30 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\)\\common\ files)($|\\) [program files common x86]$1 31 | ALL WINDOWS_FILE ^(?:c\:\\program\ files\ \(x86\))($|\\) [program files x86]$1 32 | ALL WINDOWS_FILE ^(?:c\:\\programdata\\microsoft\\windows\\start\ menu\\programs\\startup)($|\\) [startup (common)]$1 33 | ALL WINDOWS_FILE ^(?:c\:\\programdata\\microsoft\\windows\\start\ menu\\programs\\administrative\ tools)($|\\) [admin tools (common)]$1 34 | ALL WINDOWS_FILE ^(?:c\:\\programdata\\microsoft\\windows\\start\ menu)($|\\) [start menu (common)]$1 35 | ALL WINDOWS_FILE ^(?:c\:\\programdata\\microsoft\\windows\\templates)($|\\) [templates (common)]$1 36 | ALL WINDOWS_FILE ^(?:c\:\\programdata)($|\\) [program data]$1 37 | ALL WINDOWS_FILE ^(?:c\:\\perflogs)($|\\) [perf logs]$1 38 | ALL WINDOWS_FILE ^(?:c\:\\recovery)($|\\) [recovery]$1 39 | ALL WINDOWS_FILE ^(?:c\:\\testperms)($|\\) [test perms]$1 40 | ALL WINDOWS_FILE ^(?:c\:\\temp)($|\\) [temp]$1 41 | ALL WINDOWS_FILE ^(?:c\:\\tmp)($|\\) [temp]$1 42 | ALL WINDOWS_FILE ^(?:c\:\\users\\public)($|\\) [public]$1 43 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming\\microsoft\\windows\\cookies\\low)($|\\) [cookies (low)]$1 44 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming\\microsoft\\windows\\cookies)($|\\) [cookies]$1 45 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming\\microsoft\\windows\\start\ menu\\programs\\administrative\ tools)($|\\) [admin tools]$1 46 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming\\microsoft\\windows\\start\ menu\\programs\\startup)($|\\) [startup]$1 47 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming\\microsoft\\windows\\start\ menu)($|\\) [start menu]$1 48 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming\\microsoft\\windows\\templates)($|\\) [templates]$1 49 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\roaming)($|\\) [appdata]$1 50 | ALL WINDOWS_FILE 
^(?:c\:\\users\\\\\appdata\\local\\temp)($|\\) [temp]$1 51 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\local)($|\\) [appdata (local)]$1 52 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\appdata\\locallow)($|\\) [appdata (local low)]$1 53 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\desktop)($|\\) [desktop]$1 54 | ALL WINDOWS_FILE ^(?:c\:\\users\\\\\documents)($|\\) [documents]$1 55 | ALL WINDOWS_FILE ^(?:c\:\\users)($|\\) [users]$1 56 | ALL WINDOWS_FILE ^(?:c\:\\windows\\serviceprofiles\\networkservice\\appdata\\roaming\\microsoft\\windows\\start\ menu)($|\\) [start menu (network service)]$1 57 | ALL WINDOWS_FILE ^(?:c\:\\windows\\serviceprofiles\\localservice\\appdata\\roaming\\microsoft\\windows\\start\ menu)($|\\) [start menu (local service)]$1 58 | ALL WINDOWS_FILE ^(?:c\:\\windows\\system32\\config\\systemprofile)($|\\) [system profile]$1 59 | ALL WINDOWS_FILE ^(?:c\:\\windows\\syswow64\\config\\systemprofile)($|\\) [system profile x86]$1 60 | ALL WINDOWS_FILE ^(?:c\:\\windows\\fonts)($|\\) [fonts]$1 61 | ALL WINDOWS_FILE ^(?:c\:\\windows\\system)($|\\) [system 16]$1 62 | ALL WINDOWS_FILE ^(?:c\:\\windows\\system32)($|\\) [system]$1 63 | ALL WINDOWS_FILE ^(?:c\:\\windows\\syswow64)($|\\) [system x86]$1 64 | ALL WINDOWS_FILE ^(?:c\:\\windows\\winsxs)($|\\) [winsxs]$1 65 | ALL WINDOWS_FILE ^(?:c\:\\windows)($|\\) [windows]$1 66 | ALL WINDOWS_REGISTRY \\registry\\a [registry (apps)] 67 | ALL WINDOWS_REGISTRY \\registry\\machine\\system\\controlset00\d [control set] 68 | ALL WINDOWS_REGISTRY \\registry\\machine [registry (machine)] 69 | ALL WINDOWS_REGISTRY \\registry\\user\\s-1-5-18 [registry (user)]\\[local system] 70 | ALL WINDOWS_REGISTRY \\registry\\user\\s-1-5-19 [registry (user)]\\[local service] 71 | ALL WINDOWS_REGISTRY \\registry\\user\\s-1-5-20 [registry (user)]\\[network service] 72 | ALL WINDOWS_REGISTRY \\registry\\user\\\.default [registry (user)]\\ 73 | ALL WINDOWS_REGISTRY \\registry\\user\\s-1-5-[0-9]{1,2}(?:(?:-[0-9]{10}){3}-[0-9]{3,4})? 
[registry (user)]\\ 74 | ALL WINDOWS_REGISTRY \\registry\\user\\ [registry (user)]\\ 75 | ALL WINDOWS_REGISTRY \\registry\\user\\nt authority\\local system [registry (user)]\\[local system] 76 | ALL WINDOWS_REGISTRY \\registry\\user\\nt authority\\local service [registry (user)]\\[local service] 77 | ALL WINDOWS_REGISTRY \\registry\\user\\nt authority\\network service [registry (user)]\\[network service] 78 | ALL WINDOWS_REGISTRY \\registry\\user\\[^\\]+\\[^\\]*(_classes|)($|\\) [registry (user)]\\$1$2 79 | ALL WINDOWS_REGISTRY \\registry\\user [registry (user)] 80 | ALL WINDOWS_REGISTRY \\wow6432node 81 | ALL WINDOWS_FILE ^\[program\ data\]\\microsoft\\windows\\wer\\reportqueue($|\\).*$ [program data]\\microsoft\\windows\\wer\\reportqueue\\\\<*> 82 | ALL WINDOWS_REGISTRY ^\[registry\ \(user\)\]\\\\\software\\classes\\software\\microsoft\\windows\\currentversion\\deployment\\sidebyside($|\\).*$ [registry (user)]\\\\software\\classes\\software\\microsoft\\windows\\currentversion\\deployment\\sidebyside\\\\<*> 83 | ALL WINDOWS_FILE ^\[appdata\ \(local\)\]\\apps($|\\).*$ [appdata (local)]\\apps\\\\<*> 84 | ALL WINDOWS_FILE ^\[windows\]\\microsoft\.net\\framework($|\\).*$ [windows]\\microsoft.net\\framework\\\\<*> 85 | ALL WINDOWS_FILE \[windows\]\\assembly(?:\\.*)*(\\.*$) [windows]\\assembly\\<*>$1 86 | ALL WINDOWS_FILE \[windows\]\\prefetch\\(.*)-.*\.pf$ [windows]\\prefetch\\$1<*>.pf 87 | ALL WINDOWS_FILE \[winsxs\](?:\\.*)*(\\.*$) [winsxs]\\<*>$1 -------------------------------------------------------------------------------- /src/main/matlab/ClassificationModel.m: -------------------------------------------------------------------------------- 1 | classdef ClassificationModel < handle 2 | properties(SetAccess = private) 3 | sm; 4 | num_features; 5 | class_options; 6 | A_islogical; 7 | end 8 | 9 | methods(Static) 10 | 11 | function [sm_out] = column_corr(A,y,w,k,tol) 12 | 13 | %remove rare events for speed 14 | sm_valid = sum(A~=0,1)>0; 15 | A = A(:,sm_valid); 16 | 
17 | y = -1.*(y==0)+1.*(y>0); 18 | 19 | %make a sparse diagnal 20 | diagw = spdiags(w, 0, length(w), length(w)); 21 | 22 | %take the weight into account; 23 | y = w.*y; 24 | A = diagw*A; 25 | 26 | %compute the column norms 27 | d = sqrt(sum(A.*A,1))'; 28 | d(d==0)=1; 29 | 30 | %make sparse diagnal 31 | %d = spdiags(1./d, 0, size(A,2), size(A,2)); 32 | 33 | c = (A.'*y)./(d.*norm(y)); 34 | c = abs(c); 35 | 36 | sm_curr = find(c>=tol); 37 | 38 | if (length(sm_curr)>k) 39 | [~,I] = sort(c,'descend'); 40 | sm_curr = I(1:k); 41 | end 42 | 43 | if isempty(sm_curr) 44 | error('Could not find any correlated columns.'); 45 | end 46 | 47 | sm_out = false(1,size(A,2)); 48 | 49 | sm_valid = find(sm_valid); 50 | sm_out(sm_valid(sm_curr)) = true; 51 | end 52 | 53 | function w = getDefaultWeights(y) 54 | 55 | good = sum(y==0); 56 | bad = sum(y>0); 57 | w = .98/good.*(y==0)+0.02/bad.*(y>0); 58 | end 59 | 60 | function w = getClassUniformWeights(y) 61 | 62 | good = sum(y==0); 63 | bad = sum(y>0); 64 | w = .5/good.*(y==0)+.5/bad.*(y>0); 65 | end 66 | 67 | function w = getUniformWeights(y) 68 | 69 | w = ones(length(y),1); 70 | end 71 | 72 | function [A,y,w,p] = subsample(A,y,w,options) 73 | 74 | if islogical(options.subsample) && ~options.subsample 75 | return 76 | end 77 | 78 | n = size(A,1); 79 | if (ischar(options.subsample) && strcmpi(options.subsample,'sqrt')) 80 | n = round(sqrt(size(A,1))); 81 | elseif (isfloat(options.subsample) && options.subsample>=0 && options.subsample<=1.0) 82 | n = round(options.subsample*size(A,1)); 83 | elseif (isfloat(options.subsample) && options.subsample>=1.0) 84 | n = min(n,round(options.subsample)); 85 | end 86 | 87 | if (n=1 && size(A,2)>=1, ['Invalid dimension of the feature matrix: ', num2str(size(A))]); 114 | assert(size(A,1)==length(y),'Feature matrix must have same number of rows as label vector.'); 115 | assert(length(unique(y))==2,'Can only process two labels classification.'); 116 | 117 | obj.A_islogical = islogical(A); 118 | 
obj.class_options = options; 119 | obj.num_features = size(A,2); 120 | 121 | if (~islogical(options.feature_select) && numel(options.feature_select)==1 && options.feature_select) 122 | 123 | %good = sum(y==0); 124 | %bad = sum(y>0); 125 | %w0 = .5/good.*(y==0)+.5/bad.*(y>0); 126 | 127 | w0 = ones(size(y)); 128 | 129 | %cut down the number of features 130 | obj.sm = ClassificationModel.column_corr(A, y, w0, options.feature_select, 0.0); 131 | elseif (islogical(options.feature_select) && numel(options.feature_select)==1 && ~options.feature_select) 132 | obj.sm = []; 133 | else 134 | if islogical(options.feature_select) 135 | assert(length(options.feature_select)==size(A,2), 'Column indicies must match size of matrix.'); 136 | obj.sm = options.feature_select; 137 | elseif isvector(options.feature_select) 138 | assert(max(options.feature_select)<=size(A,2), 'Column indicies must match size of matrix.'); 139 | 140 | idx = false(1,size(A,2)); 141 | idx(options.feature_select) = true; 142 | 143 | obj.sm = idx; 144 | else 145 | error('Unknown feature selection vector type.'); 146 | end 147 | end 148 | 149 | if (sum(obj.sm)cutoff) = cutoff; 160 | 161 | end 162 | 163 | function y = predict(obj, A) 164 | 165 | assert(size(A,2)==obj.getNumFeatures(),'Feature matrix does not have same number of features as original training matrix.'); 166 | assert(islogical(A)==obj.A_islogical,'Test A must be of same type as training A.'); 167 | 168 | y = []; 169 | end 170 | 171 | function v = getNumFeatures(obj) 172 | v = obj.num_features; 173 | end 174 | 175 | function [X,Y,T,auc] = getROCPoints(obj, A, y, Xcrit, Ycrit) 176 | 177 | if (nargin<5) 178 | Xcrit = 'fpr'; 179 | Ycrit = 'tpr'; 180 | end 181 | 182 | y_pred = obj.predict(A); 183 | 184 | [X,Y,T,auc] = perfcurve(y>0,y_pred,true,'UseNearest','off','TVals',[0.0, logspace(-5,-2, 2000), linspace(0.01+10*eps,1.0+10*eps, 2000)], 'Xcrit',Xcrit, 'Ycrit',Ycrit); 185 | end 186 | 187 | function options = getOptions(obj) 188 | options = 
obj.class_options;
        end

        function value = getActiveFeatures(obj)
            % Return the column indices of the features the model uses.
            % An empty selection (obj.sm) means every feature is active; a
            % logical mask is converted to indices; anything else is
            % returned unchanged.
            if isempty(obj.sm)
                value = 1:obj.getNumFeatures();
            elseif islogical(obj.sm)
                value = find(obj.sm);
            else
                value = obj.sm;
            end
        end

        function [] = plotROC(obj,A,y)
            % Plot the ROC curve of the model on (A,y) in the current axes,
            % zoomed to FPR in [0, 0.2], with the AUC shown in the title.
            [X,Y,~,auc] = getROCPoints(obj, A, y);

            plot(X ,Y);
            xlim([0, .2]);
            ylim([0, 1.0]);
            xlabel('FPR');
            ylabel('TPR');
            title(['AUC=',num2str(auc)])
        end


    end % methods


end % classdef
--------------------------------------------------------------------------------
/src/main/matlab/LogitLinearModel.m:
--------------------------------------------------------------------------------
classdef LogitLinearModel < ClassificationModel
    % LogitLinearModel  Regularised logistic-regression classifier.
    %
    % The coefficients are fitted with cross-validated glmnet (cvglmnet) on
    % the active feature columns selected by the ClassificationModel base
    % class; the solution at lambda_1se is kept.

    properties(SetAccess = private)
        B1 = [];        % [intercept; coefficients of the active features]
        nonzeros = [];  % logical mask (over active features) of non-zero coefficients
    end

    methods(Static)
        function model = computeModel(A,y,w,options)
            % Fit a binomial glmnet path with cross-validation.
            %
            %   A        feature matrix (already restricted to active columns)
            %   y        labels passed to cvglmnet with family 'binomial'
            %   w        per-observation weights
            %   options  struct; options.lasso_alpha in [0,1] is the
            %            elastic-net mixing parameter
            %
            % Returns the struct produced by cvglmnet.

            assert(nnz(A)<1e8,['A has too many non-zeros: ', num2str(nnz(A))]);

            % The former "adj = false; if (adj) ... end" label-cleaning
            % experiment was unreachable and has been removed.

            lasso_alpha = options.lasso_alpha;
            assert(isscalar(lasso_alpha) && lasso_alpha>=0.0 && lasso_alpha<=1.0,'Invalid LassoAlpha value.');

            options_glm.weights = w;
            options_glm.alpha = lasso_alpha;
            options_glm.thresh = 1.0e-5;
            options_glm.maxit = 2e3;
            options_glm.ltype = 'modified.Newton';
            options_glm.lambda_min = 1.0e-4;

            % NOTE(review): positional arguments presumably are
            % (type=[], nfolds=20, foldid=[], parallel=true, keep=false,
            % grouped=true) -- confirm against the installed cvglmnet.
            [model] = cvglmnet(double(A), y, 'binomial', glmnetSet(options_glm), [], 20, [], true, false, true);
        end
    end

    methods

        function obj = LogitLinearModel(A,y,w,options)
            % Train the model.
            %
            %   A        feature matrix with all original feature columns
            %   y        labels
            %   w        observation weights; [] selects
            %            ClassificationModel.getUniformWeights(y)
            %   options  option struct forwarded to the base class and to
            %            computeModel

            obj@ClassificationModel(A,y,options);

            if (isempty(w))
                w = ClassificationModel.getUniformWeights(y);
            end

            assert(length(w)==length(y),'Number of weights must match number of observations.');

            %fit on the active columns only
            model = LogitLinearModel.computeModel(A(:,obj.getActiveFeatures()),y,w,options);

            %keep the solution at lambda_1se (not lambda_min)
            indx = find(model.lambda==model.lambda_1se);

            cnst = model.glmnet_fit.a0(indx);
            obj.B1 = [cnst;model.glmnet_fit.beta(:,indx)];

            obj.nonzeros = false(1, length(obj.B1)-1);
            obj.nonzeros(abs(obj.B1(2:end)) > 0) = true;

            coef = obj.B1(2:end);
            maxval = max(abs(coef));

            % numel() instead of size(...,2): the active-feature list may be
            % a column vector, for which size(...,2) would report 1.
            % full() guards fprintf against sparse scalars.
            fprintf('Logit: Model contains %d non-zeros, and %d significant non-zeros, out of %d active features.\n', ...
                full(sum(obj.nonzeros)), full(sum(abs(coef)>maxval*1.0e-4)), numel(obj.getActiveFeatures()));
        end

        function z = margin(obj, A)
            % Linear score (log-odds) A*beta + intercept.
            %
            % A may be given with all original feature columns (it is then
            % restricted to the active ones, exactly as predict does) or
            % already restricted to the active columns.
            %
            % The previous check was inverted (it asserted that the sizes
            % were NOT equal) and the active-feature restriction was
            % missing, so a full-width A failed in the product below.
            coef = obj.B1(2:end);

            if (size(A,2)==obj.getNumFeatures())
                A = A(:,obj.getActiveFeatures());
            end
            assert(size(A,2)==length(coef),'Feature matrix not same size as original training matrix.');

            z = A*coef+obj.B1(1);
        end

        function y = predict(obj, A)
            % Predicted probability for every row of A (A has all original
            % feature columns; the base-class predict validates it).
            predict@ClassificationModel(obj, A);

            y = glmval(obj.B1, double(A(:,obj.getActiveFeatures())),'logit');
        end

        function count = numberNonZeros(obj)
            % Number of non-zero coefficients. obj.nonzeros may be a logical
            % mask or a list of indices.
            if (islogical(obj.nonzeros))
                count = length(find(obj.nonzeros));
            else
                count = length(obj.nonzeros);
            end
        end

        function offset = getModelOffset(obj)
            % Intercept term of the model.
            offset = obj.B1(1);
        end

        function weights = getModelWeights(obj)
            % Coefficients scattered back to the original feature positions
            % as a 1-by-numFeatures sparse row; inactive features are 0.
            coef = obj.B1(2:end);

            features = obj.getActiveFeatures();

            weights = sparse(1,obj.getNumFeatures());
            weights(features) = coef;
        end

        function [] = explain(obj, A, names, column_labels, max_score, virus)
            % Print, for every row of A whose predicted probability exceeds
            % max_score, the features contributing most to its score, and
            % finally the features with the largest summed contribution.
            %
            %   A              feature matrix with all original columns
            %   names          cell array, one display name per row of A
            %   column_labels  cell array, one label per original column
            %   max_score      probability threshold (default 0.0)
            %   virus          cell array of labels per row (default: empty
            %                  strings)

            if (nargin<6)
                % was cell(size(A,1)), which allocates an n-by-n cell array
                virus = repmat({''}, size(A,1), 1);
            end
            if (nargin<5)
                max_score = 0.0;
            end

            p = obj.predict(A);

            A = A(:,obj.getActiveFeatures());
            column_labels = column_labels(obj.getActiveFeatures());

            weights = obj.B1(2:end)';
            max_weight = max(abs(weights));

            total_val = zeros(1, size(A,2));
            for iter=1:size(A,1)
                flagged = p(iter)>max_score;
                if (flagged)
                    fprintf('P=%.3f, %s, label=%s\n', p(iter), names{iter}, virus{iter});
                end

                %per-feature contribution of this row, largest magnitude first
                val = full(A(iter,:)).*weights;
                [~,I] = sort(abs(val),'descend');

                total_val = total_val+val;

                %figure out why
                if (flagged)
                    % NOTE(review): the order is by |val| but the test is on
                    % the signed value, so listing stops at the first
                    % negative contribution -- confirm this is intended.
                    iter2 = 1;
                    while (iter2<=length(I) && val(I(iter2))>max_weight*1.0e-4)
                        fprintf('\tW=%.4f, %s\n', val(I(iter2)), column_labels{I(iter2)});
                        iter2 = iter2+1;
                    end

                    fprintf('\n');
                end

            end

            %now output the most common values
            [~,I] = sort(abs(total_val),'descend');

            fprintf('Most important observed features:\n');
            if (isempty(I))
                return;
            end

            max_val = abs(total_val(I(1)));
            iter = 1;
            % the bound check prevents indexing past the end when every
            % feature is above the 1% threshold
            while (iter<=length(I) && abs(total_val(I(iter)))>max_val*1.0e-2)
                fprintf('\tW=%.4f, %s\n', total_val(I(iter)), column_labels{I(iter)});
                iter = iter+1;
            end

        end

        function [] = makeJson(obj, column_labels, filename)
            % Write the intercept and the significant weights (magnitude
            % above 1e-4 of the largest), sorted by decreasing magnitude,
            % to a JSON file using savejson.

            weights = obj.getModelWeights();
            offset = obj.getModelOffset();

            nzeros = find(abs(weights) > 1.0e-4*max(abs(weights)));

            [~,I] = sort(abs(weights(nzeros)),'descend');

            w_sorted = weights(nzeros(I));
            index_sorted = nzeros(I);

            % The variable name "struct" is kept on purpose: savejson may
            % derive a root name from the caller's variable name.
            struct.offset = offset;
            struct.weights = [];
            for iter=1:length(I)
                struct.weights(iter).weight = full(w_sorted(iter));
                struct.weights(iter).name = strtrim(strsplit(column_labels{index_sorted(iter)},',\t'));
            end

            savejson('',struct,'FileName',filename);

        end

    end % methods
end % classdef
--------------------------------------------------------------------------------
/src/main/matlab/cv_part.m:
--------------------------------------------------------------------------------
function [cv, valid] = cv_part(y, kfold, c, method, t_created, cuckoo_idx, splunk_idx, virus)
%CV_PART Build k-fold training/test masks over the observations in y.
%
%   y           scores; 0 marks benign samples, y>=c marks malware
%               (assumed to be a column vector -- the masks below are
%               columns)
%   kfold       number of folds
%   c           score cutoff (default 0.3); samples with 0<y<c are invalid
%   method      'standard' (default), 'creation', 'type' or
%               'cuckoo_validate'
%   t_created   creation times in seconds since 1970 ('creation' only)
%   cuckoo_idx  logical mask or index list ('cuckoo_validate' only)
%   splunk_idx  logical mask or index list (validated, otherwise unused)
%   virus       cell array of family labels ('type' only)
%
%   cv     1-by-kfold cell array of structs with logical fields
%          .training and .test
%   valid  logical mask of the samples with y==0 or y>=c

    if (nargin<3), c = 0.3; end
    if (nargin<4), method = 'standard'; end
    % the trailing arguments used to be read unconditionally, so short
    % calls failed although the first defaults suggested they were allowed
    if (nargin<5), t_created = []; end
    if (nargin<6), cuckoo_idx = false(length(y),1); end
    if (nargin<7), splunk_idx = false(length(y),1); end
    if (nargin<8), virus = {}; end

    valid = y==0 | y>=c;

    cuckoo_idx = to_mask(cuckoo_idx, length(y));
    splunk_idx = to_mask(splunk_idx, length(y)); %#ok<NASGU>

    cv = cell(1,kfold);

    switch method
        case 'creation'
            assert(~isempty(t_created), 'Method ''creation'' requires t_created.');

            t_smallest = round(86400 * (datenum('1995', 'yyyy') - datenum('1970', 'yyyy')));
            t_largest = round(86400 * (now - datenum('1970', 'yyyy')));

            %adjust so the dates make sense
            % (comparison restored: the extraction had dropped everything
            % between '<' and '>')
            t_created(t_created<t_smallest | t_created>t_largest) = 0;

            t = sort(t_created(t_created>0 & y>=c));
            if (isempty(t))
                error('No malware samples with a usable creation time.');
            end

            %malware created before the cutoff is trained on, the rest tested
            per = 1-1/kfold;
            cutoff = t(max(1, round(length(t)*per)));

            %index of benignware
            I_good = valid & t_created>0 & y==0;
            cv_good = cvpartition(sum(I_good),'KFold',kfold);

            %index of malware
            I_old = valid & t_created>0 & t_created<cutoff & y>0;
            I_new = valid & t_created>0 & t_created>=cutoff & y>0;

            %now partition old
            cv_old = cvpartition(sum(I_old),'KFold',kfold);

            %now combine the data
            for iter=1:kfold
                [good_train, good_test] = expand_fold(cv_good, iter, I_good);
                old_train = expand_fold(cv_old, iter, I_old);

                cv{iter}.training = old_train | good_train;
                cv{iter}.test = I_new | good_test;
            end

        case 'type'
            assert(~isempty(virus), 'Method ''type'' requires the virus labels.');

            %index of benignware
            I_good = valid & y==0;
            cv_good = cvpartition(sum(I_good),'KFold',kfold);

            unique_names = unique(virus);

            %remove no label
            unlabeled = strcmp(virus,'') | strcmp(virus,'Trojan.Win32.Generic');
            keep = ~strcmp(unique_names,'') & ~strcmp(unique_names,'Trojan.Win32.Generic');
            unique_names = unique_names(keep);

            I_malware = valid & y>0;

            %folds are formed over family names, not over samples
            cv_malware = cvpartition(length(unique_names),'KFold',kfold);

            %now combine the data
            for iter=1:kfold

                I_malware_test = false(size(y));
                p = find(cv_malware.test(iter));
                for iter2=1:length(p)
                    I_malware_test = I_malware_test | (I_malware & strcmp(virus, unique_names(p(iter2))));
                end
                I_malware_train = I_malware & ~I_malware_test & ~unlabeled;

                [good_train, good_test] = expand_fold(cv_good, iter, I_good);

                cv{iter}.training = good_train | I_malware_train;
                cv{iter}.test = good_test | I_malware_test;
            end

        case 'standard'
            cv_all = cvpartition(sum(valid),'KFold',kfold);

            for iter=1:kfold
                [cv{iter}.training, cv{iter}.test] = expand_fold(cv_all, iter, valid);
            end

        case 'cuckoo_validate'
            cv_all = cvpartition(sum(valid),'KFold',kfold);

            for iter=1:kfold
                [adj_train, adj_test] = expand_fold(cv_all, iter, valid);

                %non-cuckoo test samples are moved into the training set
                cv{iter}.training = adj_train | (adj_test & ~cuckoo_idx);
                cv{iter}.test = adj_test & cuckoo_idx;
            end

        otherwise
            % previously an unknown method left cv unassigned
            error('Unknown partition method ''%s''.', method);
    end
end

function mask = to_mask(idx, n)
    % Convert an index list into an n-by-1 logical mask; masks pass through.
    if (islogical(idx))
        mask = idx;
    else
        mask = false(n,1);
        mask(idx) = true;
    end
end

function [train, test] = expand_fold(part, fold, mask)
    % Scatter fold number "fold" of the cvpartition "part" (defined over
    % the sum(mask) selected samples) back onto full-length logical masks.
    train = false(size(mask));
    train(mask) = training(part, fold);
    test = false(size(mask));
    test(mask) = test_of(part, fold);
end

function t = test_of(part, fold)
    % Thin wrapper so that the local variable "test" above does not shadow
    % the cvpartition method of the same name.
    t = test(part, fold);
end

function [cv_all] = get_parition(y, selected, kfold)
    % Draw up to 99 random k-fold partitions of the selected samples until
    % every training fold holds at least two samples of each class.
    % (Currently not called from cv_part.)

    y = y(selected);

    good = false;
    count = 1;
    while (~good && count<100)
        cv_all = cvpartition(sum(selected),'KFold',kfold);
        good = true;

        for iter=1:kfold

            if (sum(y(cv_all.training(iter))==0)<2 || sum(y(cv_all.training(iter))>0)<2)
                good = false;
                break;
            end
        end

        count = count+1;
    end

    if (~good)
        % was warn(...), which is not a MATLAB function
        warning('Could not find a split with all classes in them.');
    end
end
--------------------------------------------------------------------------------
/src/main/matlab/cv_virusclass.m:
--------------------------------------------------------------------------------
function [I] = cv_virusclass(y, virus_label, kfold)
%CV_VIRUSCLASS Training masks for k folds that hold out whole virus families.
%
%   y            scores, 0 = benign, >0 = malware
%   virus_label  cell array of label strings, one per sample; the family is
%                the lower-cased text before the first '.' and '-'
%   kfold        number of folds
%
%   I  1-by-kfold cell array; I{k} is a 1-by-numel(y) logical row that is
%      true for the samples to TRAIN on in fold k. Each fold leaves out one
%      contiguous chunk of the (sorted) family names and one chunk of the
%      (shuffled) benign samples; unlabelled malware is never trained on.
%
% Exactly kfold folds are always returned. The previous fixed-step chunking
% could yield a different number of malware and benign chunks (e.g. 7
% families with kfold=3 gave 4 chunks), which either failed with an index
% error or returned folds without benign samples. When the counts divide
% evenly the chunks are unchanged.

assert(isscalar(kfold) && kfold>=1 && kfold==round(kfold), 'kfold must be a positive integer.');

y = y(:);
virus_label = virus_label(:);

%reduce every label to its family name
for iter=1:length(virus_label)
    x = strsplit(virus_label{iter},'.');
    virus_label{iter} = lower(x{1});
    x = strsplit(virus_label{iter},'-');
    virus_label{iter} = lower(x{1});
end

%shuffle the samples
n = randperm(length(y));
y = y(n);
virus_label = virus_label(n);

[label,~,z] = unique(virus_label);

%bad index
idx = find(strcmp(label,''));
idx = idx(:)';

%get the good labels
idx_good = find(y==0)';

%chunk boundaries: chunk k covers (edges(k)+1):edges(k+1)
edges = round(linspace(0, length(label), kfold+1));
edges_good = round(linspace(0, length(idx_good), kfold+1));

I = cell(1,kfold);
for iter=1:kfold

    %malware: train on every labelled family outside the held-out chunk
    exl_label = (edges(iter)+1):edges(iter+1);
    part = ~ismember(z', [exl_label, idx]) & y'>0;

    %benign: train on every benign sample outside the held-out chunk
    exl_good = false(1,length(idx_good));
    exl_good((edges_good(iter)+1):edges_good(iter+1)) = true;
    part = part | ismember(1:length(y), idx_good(~exl_good));

    %get back the order
    x = part;
    x(n) = part;
    I{iter} = x;
end
--------------------------------------------------------------------------------
/src/main/matlab/gen_graft_data.m:
--------------------------------------------------------------------------------
function [A, y, I] = gen_graft_data(A,y,cuckoo_idx,splunk_idx, I, model)

good_cuckoo = cuckoo_idx & y==0 & I;
bad_cuckoo = cuckoo_idx & y>0 & I;

%adjust the cuckoo data
if (nargin>5)

    %figure out positive cuckoo benign features
    good_cuckoo_pop = (sum(A(cuckoo_idx & y==0,:),1)>0.01*sum(cuckoo_idx & y==0, 1)) & sum(A(splunk_idx,:),1)==0;

    %now find the ones that also have positive weight
    w = model.getModelWeights();

    filter_idx = good_cuckoo_pop & w>0;

    fprintf('Found %d cuckoo only features, %d of 
them with positive weight, forming a total sum of %f.\n', full(sum(good_cuckoo_pop)), full(sum(filter_idx)), full(sum(w(filter_idx)))); 18 | 19 | A(:,filter_idx) = 0; 20 | end; 21 | 22 | 23 | splunk = y==0 & splunk_idx & I; 24 | splunk = find(splunk); 25 | middle = round(length(splunk)/2); 26 | bad_splunk = splunk(1:middle); 27 | good_splunk = splunk((middle+1):length(splunk)); 28 | 29 | if (isempty(bad_splunk)) 30 | error('No splunk data found.'); 31 | end 32 | 33 | bad_other_perm = []; 34 | while length(bad_other_perm)0); 50 | 51 | 52 | 53 | end -------------------------------------------------------------------------------- /src/main/matlab/line_fewer_markers.m: -------------------------------------------------------------------------------- 1 | % line_fewer_markers - line with controlled amount of markers and correct legend behaviour 2 | % 3 | % LINE_FEWER_MARKERS(X,Y,NUM_MARKERS) adds the line in vectors X and Y to the current axes 4 | % with exactly NUM_MARKERS markers drawn. 5 | % 6 | % LINE_FEWER_MARKERS(X,Y,NUM_MARKERS,'PropertyName',PropertyValue,...) plots the data 7 | % stored in the vectors X and Y. 8 | % 9 | % LINE_FEWER_MARKERS returns handles to LINE/MARKER objects. 10 | % 11 | % [H1,H2,H3] = LINE_FEWER_MARKERS(X,Y,NUM_MARKERS,'PropertyName',PropertyValue,...) 12 | % performs the actions as above and returns the handles of all the plotted lines/markers. 
%   H1 = handle to the main marker(1 point); it may be put in array and used with legend
%   H2 = handle to the continuous line (as in H2=plot())
%   H3 = handle to all other markers
%
% Property/Value pairs and descriptions:
%
%   Spacing    - 'x'     : ordinary uniform along x
%              - 'curve' : equal lengths along curve y(x)
%              - 'logx'  : to be used with logarithmic x scale
%
%   LockOnMax  - 0 : first marker on 1st data point
%              - 1 : offset all markers such that one marker on first max of y(x)
%              NOTE: this option is still parsed, but the code that applied
%              it is commented out in this version, so it has no effect.
%
%   LegendLine - 'on'  : default, reproduce linestyle also in legend
%              - 'off' : shows only marker in legend
%
%   LineSpec: same as for LINE: LineStyle,LineWidth,Marker,MarkerSize,MarkerFaceColor...
%
%
% Example: plot 3 curves with 9,9, and 15 markers each, using different input styles
%
%   figure; hold on;
%   t = 0:0.005:pi;
%   line_fewer_markers(t*180/pi,cos(t) ,9, '--bs','spacing','curve');
%   line_fewer_markers(t*180/pi,sin(t) ,9, '-.ro','MarkerFaceColor','g', ...
%                      'markersize',6,'linewidth',2);
%   grey1 = [1 1 1]*0.5;
%   line_fewer_markers(t*180/pi,sin(t).*cos(t) ,15, ':','marker','h','color',grey1, ...
%                      'markerfacecolor',grey1,'linewidth',2,'LockOnMax',1);
%   leg = legend('cos','sin','sin*cos','location','best');
%
% Inspired by Ioannis Filippidis's answer:
% http://www.mathworks.com/matlabcentral/answers/2165-too-many-markers
%
% rev.4, Massimo Ciacci, October 17, 2014
%
function [H1,H2,H3] = line_fewer_markers(x,y,num_Markers, varargin)

%% find marker spec in varargin and remove it; extract special params: Spacing, LegendLine
if mod(length(varargin),2)
    % odd number of extra arguments: the first must be a LineSpec string
    if ischar(varargin{1})
        linspec   = varargin{1};
        extraArgs = varargin(2:end);
        [varargInNoMk,varargInNoMkNoLn,lm,ms,mfc,~,Spacing,LegendLine] = parseargsLineSpec(linspec,extraArgs);
    else
        error('odd sized [param | val] list, missing one param ?');
    end
else
    [varargInNoMk,varargInNoMkNoLn,lm,ms,mfc,~,Spacing,LegendLine] = parseargs(varargin{:});
end

%% input size check
if isvector(x) && isvector(y)
    % make x,y row vectors
    if iscolumn(x), x = x.'; end
    if iscolumn(y), y = y.'; end
else
    error('line_fewer_markers: input arguments must be 1D vectors');
end
if numel(x) ~= numel(y)
    error('line_fewer_markers: x and y must have the same number of elements');
end
if ~(isscalar(num_Markers) && num_Markers >= 1)
    error('line_fewer_markers: num_Markers must be a positive scalar');
end

% How the method works: plots 3 times:
% a) once only the line with all points with the style 'r--' and invisible handle,
% b) last time the markers, using fewer points with style 'ro' and again invisible handle.
% c) once with a visible handle, only the first point, using the complete style you specified (e.g. 'r--o')

%% a) once only the line with all points with the style
H2 = line(x ,y ,varargInNoMk{:});   %no markers here
hasbehavior(H2,'legend',0);         %keep it out of legends

%% b) last time the markers, using fewer points with style
% markers are picked from the x-sorted, de-duplicated samples
[x_t,order] = sort(x);
y_t = y(order);
[x_t,order] = unique(x_t);
y_t = y_t(order);

nPts    = length(x_t);
uniform = round(linspace(1,nPts,num_Markers));   %index-uniform fallback

switch lower(Spacing)
    case 'x'
        ti = uniform;

    case {'logx','log'}
        if nPts < 3
            ti = uniform;
        else
            if x_t(2) <= 0
                error('line_fewer_markers: ''logx'' spacing requires positive x values');
            end
            % first and last sample are skipped, as in the original code
            xi = logspace(log10(x_t(2)),log10(x_t(end-1)),num_Markers);
            ti = floor(interp1(x_t,(1:nPts),xi));
        end

    case 'curve'
        % The arc length is measured on the same sorted/unique samples that
        % are indexed below. The previous code measured it on the raw x,y,
        % so duplicate x values produced repeated arc lengths (interp1
        % error) or indices beyond the end of x_t.
        yRange = max(y_t)-min(y_t);
        xRange = max(x_t)-min(x_t);
        % spacing along the curve is impossible with non-finite data, a
        % single point or a single marker (division by num_Markers-1)
        if ~all(isfinite(y_t)) || ~all(isfinite(x_t)) || xRange == 0 || num_Markers < 2
            ti = uniform;
        else
            scaleY = 3/4;                          % 1/1 figure aspect ratio
            if yRange == 0
                yNrm = zeros(size(y_t));           % flat curve: length is along x only
            else
                yNrm = (y_t-min(y_t))./yRange*scaleY;  %NORMALIZE y scale in [0 scaleY]
            end
            xNrm = (x_t-min(x_t))./xRange;         %NORMALIZE x scale in [0 1]

            s  = [0 cumsum(sqrt(diff(xNrm).^2+diff(yNrm).^2))]; %length along the curve
            si = (0:num_Markers-1)*s(end)/(num_Markers-1);      %equally spaced lengths
            si(end) = s(end);                                   %fix last point to be within the curve
            ti = round(interp1(s,1:nPts,si));                   %sample index of each marker
        end

    otherwise
        error('invalid spacing parameter');
end

xi = x_t(ti);
yi = y_t(ti);
H3 = line(xi,yi,varargInNoMkNoLn{:},'Marker',lm,'MarkerSize',ms,'MarkerFaceColor',mfc,'LineStyle','none'); %plot markers only
hasbehavior(H3,'legend',0);         %keep it out of legends

%% c) once with a visible handle, only the first point, using the complete style you specified
if strcmp(LegendLine,'on')
    H1 = line(xi(1),yi(1),varargInNoMk{:},'Marker',lm,'MarkerSize',ms,'MarkerFaceColor',mfc);
else
    H1 = line(xi(1),yi(1),varargInNoMk{:},'linestyle','none','Marker',lm,'MarkerSize',ms,'MarkerFaceColor',mfc);
end


%-------------------------------------------------------------
% PARSE FUNCTIONS
%-------------------------------------------------------------
% Split a property/value list.
% varargInNoMk     = list of property pairs, marker specs removed
% varargInNoMkNoLn = list of property pairs, marker specs and line specs removed
% lm, ms, mfc      = marker, marker size, marker face colour (defaults 'o', 10, 'none')
% Property names are matched case-insensitively (a superset of the
% spellings accepted before).
function [varargInNoMk,varargInNoMkNoLn,lm,ms,mfc,LockOnMax,Spacing,LegendLine] = parseargs(varargin)
lm =[]; ms =[]; mfc=[]; LockOnMax=[]; Spacing=[]; LegendLine=[];
varargInNoMk = {};
varargInNoMkNoLn = {};
for arg_index = 1:2:length(varargin)
    arg   = varargin{arg_index};
    value = varargin{arg_index+1};
    if ischar(arg), key = lower(arg); else key = ''; end
    % extract special params and marker specs from arg list
    switch key
        case {'marker','mk'}
            lm = value;
        case {'markersize','mks'}
            ms = value;
        case {'markerfacecolor','mfc'}
            mfc = value;
        case 'lockonmax'
            LockOnMax = value;
        case 'spacing'
            Spacing = value;
        case 'legendline'
            LegendLine = value;
        otherwise
            % keep other params in arg list for line command
            varargInNoMk = [varargInNoMk, {arg, value}]; %#ok<AGROW>
            if ~strcmp(key,'linestyle')
                % exclude line params for marker only plot
                varargInNoMkNoLn = [varargInNoMkNoLn, {arg, value}]; %#ok<AGROW>
            end
    end
end
%EXTRA DEFAULTS ARE SET HERE
if isempty(lm), lm = 'o' ; end
if isempty(ms), ms = 10 ; end
if isempty(mfc), mfc = 'none'; end
if isempty(LockOnMax), LockOnMax = 1 ; end
if isempty(Spacing), Spacing = 'x' ; end %%'x' -> marker delta-x constant; 'curve' : spacing constant along the curve length
if isempty(LegendLine), LegendLine = 'on' ; end

%-------------------------------------------------------------
% Parse LineSpec string and other arguments
% varargInNoMk     = list of property pairs, marker specs removed
% varargInNoMkNoLn = list of property pairs, marker specs and line specs removed
function [varargInNoMk,varargInNoMkNoLn,lm,ms,mfc,LockOnMax,Spacing,LegendLine] = parseargsLineSpec(linspec, extraArgs)
% b blue      . point              -  solid
% g green     o circle             :  dotted
% r red       x x-mark             -. dashdot
% c cyan      + plus               -- dashed
% m magenta   * star               (none) no line
% y yellow    s square
% k black     d diamond
% w white     v triangle (down)
%             ^ triangle (up)
%             < triangle (left)
%             > triangle (right)
%             p pentagram
%             h hexagram

% line style: the two-character styles are tried first. The matched text is
% removed with strrep; the previous setdiff removed every occurrence of
% each character, so '-.' also swallowed a '.' marker.
ls = '-';                                   %default when no style is given
lineStyles = {'-.','--','-',':'};
for ii=1:numel(lineStyles)
    if ~isempty(strfind(linspec, lineStyles{ii}))
        ls = lineStyles{ii};
        linspec = strrep(linspec,ls,'');
        break
    end
end
varargInNoMk = {'lineStyle',ls};
varargInNoMkNoLn = {};

% colour
colours = {'b','g','r','c','m','y','k','w'};
for ii=1:numel(colours)
    if ~isempty(strfind(linspec, colours{ii}))
        colspec = colours{ii};
        linspec = strrep(linspec,colspec,'');
        varargInNoMk = [varargInNoMk, {'color',colspec}]; %#ok<AGROW>
        varargInNoMkNoLn = [varargInNoMkNoLn, {'color',colspec}]; %#ok<AGROW>
        break
    end
end

% marker
lm = 'none';
markers = {'.','o','x','+','*','s','d','v','^','<','>','p','h'};
for ii=1:numel(markers)
    if ~isempty(strfind(linspec, markers{ii}))
        lm = markers{ii};
        break
    end
end

[extraNoMk,extraNoMkNoLn,lm2,ms,mfc,LockOnMax,Spacing,LegendLine] = parseargs(extraArgs{:});
if strcmp(lm,'none') && ~strcmp(lm2,'none') %if other marker specified in Property Pairs take that one
    lm = lm2;
end
varargInNoMk = [varargInNoMk, extraNoMk];
% the marker-only list now really excludes LineStyle pairs (it used to
% receive the list that still contained them)
varargInNoMkNoLn = [varargInNoMkNoLn, extraNoMkNoLn];


--------------------------------------------------------------------------------
/src/main/matlab/make_fig1.m:
--------------------------------------------------------------------------------
function make_fig1(A, y, column_labels, virus, cuckoo_idx, t_created)
%MAKE_FIG1 Draw the 2x2 data-overview figure.
%   (A) histogram of the scores of classified cuckoo samples
%   (B) histogram derived from the virus family labels
%   (C) histogram of creation dates
%   (D) rank/occurrence plot of the feature columns

h = figure;
set(h,'Renderer','painters','Position',[100 900 600 550]);

%first call only configures the margins/padding of the subaxis grid
subaxis(2,2,1,'Margin',0.0,'Padding',-0.00,'PaddingBottom',0.05,'PaddingTop',0.00,'PaddingLeft',-0.00,'PaddingRight',0.05,'MarginBottom',0.05,'MarginLeft',0.09,'MarginRight',0.00, 'MarginTop',0.05);

subaxis(2,2,1,1);
plot_classify(y, virus, cuckoo_idx);
tag_panel(h, '(A)');
ylim([0, .1]);

subaxis(2,2,2,1);
plot_dist(y, virus, cuckoo_idx)
tag_panel(h, '(B)');
ylim([0, .1]);

subaxis(2,2,1,2);
plot_time(t_created);
tag_panel(h, '(C)');

subaxis(2,2,2,2);
plot_features(A, column_labels);
tag_panel(h, '(D)');

end

function tag_panel(fig, tag)
%Place the panel letter just above the top-left corner of the current axes.

pos1 = get(gca,'Position');
annotation(fig,'textbox',[pos1(1)-0.09,pos1(2)+pos1(4)+0.00,.1,.05],'String',tag,'fontsize',14,'EdgeColor','none');

end

function plot_dist(y, virus, cuckoo_idx)
%Histogram over the labelled malware samples (empty and
%'Trojan.Win32.Generic' labels are dropped). y and cuckoo_idx are unused.

cmp = ~strcmp(virus, '') & ~strcmp(virus, 'Trojan.Win32.Generic');
virus = virus(cmp);

[unique_names,~,z] = unique(virus);
% NOTE(review): z is the index of each sample's family in the sorted name
% list, not the size of that family, so this plots family index / sample
% count although the axis is labelled as a relative family size -- confirm.
z = z/length(virus);

histogram(z, 50, 'Normalization', 'probability');

set(gca, 'YMinorTick','on', 'XMinorTick','on','TickLength',[0.025 0.025]);
ylabel('Fraction of Malware Binaries');
xlabel('Relative Size of Malware Family');

end

function plot_features(A, column_labels)
%Log-log plot of how many rows contain each feature, against the feature's
%popularity rank. Only columns whose label contains no comma are counted.

has_comma = ~cellfun(@isempty, strfind(column_labels,','));
idx = ~has_comma;

%number of rows in which each selected feature is present
s = sum(A(:,idx)>0,1);

% (five unsuppressed debug expressions that echoed the most frequent
% feature to the console were removed)

val = sort(s,'descend');
len = length(val);

%thin out the ranks progressively so that the plot stays small. Every range
%is capped at len; the fixed upper bounds used to index past the end of
%val when there were fewer than ~100000 features.
x = [1:min(500,len), 501:10:min(10000,len), 10001:500:min(100000,len), ...
     100001:5000:min(1000000,len), 1000001:50000:len];

val = val(x);

loglog(x, val, '.');

set(gca, 'YMinorTick','on', 'XMinorTick','on','TickLength',[0.025 0.025]);
ylabel('Number of Occurances');
xlabel('Popularity Rank');

end

function plot_time(t_created)
%Histogram of creation dates, shown for 1995-2015.
%t_created is used as seconds since 1970.

t_smallest = round(86400 * (datenum('1995', 'yyyy') - datenum('1970', 'yyyy')));
t_largest = round(86400 * (datenum('2015', 'yyyy') - datenum('1970', 'yyyy')));

%adjust so the dates make sense
% (comparison restored: the extraction had dropped everything between '<'
% and '>'). Out-of-range stamps are set to 0 rather than removed, so they
% still count in the probability normalisation.
t_created(t_created<t_smallest | t_created>t_largest) = 0;

d = t_created/86400+datenum('1970', 'yyyy');

histogram(d, 100, 'Normalization', 'probability');
datetick('x','yy','keeplimits');

set(gca, 'YMinorTick','on', 'XMinorTick','on','TickLength',[0.025 0.025]);
ylabel('Fraction of Binaries');
xlabel('Year Created');
xlim([datenum('1995', 'yyyy'), datenum('2015', 'yyyy')]);
ylim([0, .1]);

end

function plot_classify(y, virus, cuckoo_idx)
%Histogram of the scores in (0,1] of the cuckoo samples. virus is unused.

y = y(y>0 & y<=1 & cuckoo_idx);

histogram(y, 50, 'Normalization', 'probability');

set(gca, 'YMinorTick','on', 'XMinorTick','on','TickLength',[0.025 0.025]);
ylabel('Fraction of Classified Binaries');
xlabel('VirusTotal Score ({\it{s}})');

end
--------------------------------------------------------------------------------
/src/main/matlab/make_fig2.m:
--------------------------------------------------------------------------------
function make_fig2(A, y, cuckoo_idx, splunk_idx, t_created, virus, names)
%MAKE_FIG2 Draw the 2x4 evaluation figure.
%   Top row (A-C): run() on the 'standard', 'creation' and 'type' splits
%   from cv_part with cuckoo_only=false; (D) virus_names() on the
%   predictions of (A).
%   Bottom row (E-G): the same splits with cuckoo_only=true, reusing the
%   models trained for the top row; (H) virus_names() on the predictions
%   of (E).
%   names is unused.

fold = 2;
cutoff = .3;

I_1 = cv_part(y, fold, cutoff, 'standard', t_created, cuckoo_idx, splunk_idx);
I_2 = cv_part(y, fold, cutoff, 'creation', t_created, cuckoo_idx, splunk_idx);
I_3 = cv_part(y, fold, cutoff, 'type', t_created, cuckoo_idx, splunk_idx, virus);

h = figure;
set(h,'Renderer','painters','Position',[100 900 1000 400]);

%first call only configures the margins/padding of the subaxis grid
subaxis(2,4,1,'Margin',0.00,'Padding',-0.00,'PaddingBottom',0.0,'PaddingTop',0.02,'PaddingLeft',0.02,'PaddingRight',-0.01,'MarginBottom',0.08,'MarginLeft',0.03,'MarginRight',0.10,'MarginTop',0.03);

%---- top row ----
subaxis(2,4,1,1);
[pred02, pred05, logit_model1] = run(A, y, I_1, false, false, t_created, cuckoo_idx, splunk_idx);
tag_panel2(h, '(A)');
ylabel('TPR');

subaxis(2,4,2,1);
[~, ~, logit_model2] = run(A, y, I_2, false, true, t_created, cuckoo_idx, splunk_idx);
tag_panel2(h, '(B)');

subaxis(2,4,3,1);
[~, ~, logit_model3] = run(A, y, I_3, false, false, t_created, cuckoo_idx, splunk_idx);
tag_panel2(h, '(C)');

subaxis(2,4,4,1);
[h1, h2] = virus_names(y, pred02, pred05, virus);
tag_panel2(h, '(D)');
shift_right(h1, h2);

%---- bottom row ----
subaxis(2,4,1,2);
[pred02, pred05] = run(A, y, I_1, true, false, t_created, cuckoo_idx, splunk_idx, logit_model1);
tag_panel2(h, '(E)');
ylabel('TPR');
xlabel('FPR');

subaxis(2,4,2,2);
run(A, y, I_2, true, true, t_created, cuckoo_idx, splunk_idx, logit_model2);
tag_panel2(h, '(F)');
xlabel('FPR');

subaxis(2,4,3,2);
run(A, y, I_3, true, false, t_created, cuckoo_idx, splunk_idx, logit_model3);
tag_panel2(h, '(G)');
xlabel('FPR');

subaxis(2,4,4,2);
[h1, h2] = virus_names(y, pred02, pred05, virus);
tag_panel2(h, '(H)');
shift_right(h1, h2);
xlabel('Fraction Detected');

end

function tag_panel2(fig, tag)
%Place the panel letter at the top-left corner of the current axes.

pos1 = get(gca,'Position');
annotation(fig,'textbox',[pos1(1)-0.05,pos1(2)+pos1(4)-0.01,.1,.05],'String',tag,'fontsize',14,'EdgeColor','none');

end

function shift_right(h1, h2)
%Move the object(s) returned by virus_names 0.05 to the right; h2 is moved
%only when it differs from h1. Panel H used to read h2's position outside
%this guard; both panels now follow the same guarded sequence.

pos1 = get(h1,'Position');
set(h1,'Position', [pos1(1)+0.05, pos1(2), pos1(3), pos1(4)]);
if (h1~=h2)
    pos1 = get(h2,'Position');
    set(h2,'Position', [pos1(1)+0.05, pos1(2), pos1(3), pos1(4)]);
end

end

function Ic = valid_cuckoo_only(I, cuckoo_idx)
%Move the non-cuckoo test samples of every fold into its training set, so
%that only cuckoo samples remain in the test sets.

Ic = {};
for iter=1:length(I)
    fold = I{iter};
    Ic{iter}.training = fold.training | (fold.test & ~cuckoo_idx);
    Ic{iter}.test = fold.test & cuckoo_idx;
end
end

function [pred02, pred05, logit_model_out] = run(A, y, I, cuckoo_only, time_split, t_created, cuckoo_idx, splunk_idx, logit_model_in)

A = double(A>0);
y = double(y>0);

logit = init();
bagged = init();
logit2 = init();
bagged2 = init();
logit3 = init();
bagged3 = init();

pred02 = NaN*ones(size(y));
pred05 = NaN*ones(size(y));

%compute valid
valid = false(size(y));
for iter=1:length(I)
    valid = valid | I{iter}.training | I{iter}.test;
end

logit_model_out = {};

for iter=1:length(I)

    I_train = I{iter}.training;
    I_test = I{iter}.test;

    if (time_split)

        t_smallest = min(t_created(I_test & t_created>0 & y>0 & cuckoo_idx));

        I_train = valid & I_train & (y==0 | ~cuckoo_idx | t_created<(t_smallest-86400 * 365 *2));
        I_test = valid & ~I_train & (y==0 | ~cuckoo_idx | t_created<(t_smallest-86400 * 365 *2));
    end

    if (sum(I_train & I_test)>0)
        error('Testing and training intersecting.');
    end

    A_train = A(I_train, :);
    y_train = y(I_train, :);

    if (nargin<9)
        [logit_model, boost_model] = run_pipeline(A_train, y_train);
        logit_model_out{iter} = logit_model;
    else
        logit_model = logit_model_in{iter};
| boost_model = logit_model; 135 | logit_model_out{iter} = logit_model; 136 | end 137 | 138 | %graft data 139 | if (cuckoo_only) 140 | [A_test, y_test, I_test] = gen_graft_data(A,y,cuckoo_idx,splunk_idx, I_test, logit_model); 141 | else 142 | I_test = I_test & ~splunk_idx; 143 | A_test = A(I_test, :); 144 | y_test = y(I_test, :); 145 | end 146 | 147 | [logit, T02, T05] = record_run(A_test, y_test, logit_model, logit); 148 | [bagged] = record_run(A_test, y_test, boost_model, bagged); 149 | 150 | %record the predictions 151 | pred02(I_test) = logit_model.predict(A_test)>=T02; 152 | pred05(I_test) = logit_model.predict(A_test)>=T05; 153 | 154 | if (time_split) 155 | 156 | I_test = valid & ~I_train & (y==0 | ~cuckoo_idx | t_created>=(t_smallest-86400 * 365 * 1)); 157 | 158 | if (sum(I_train & I_test)>0) 159 | error('Testing and training intersecting.'); 160 | end 161 | 162 | %A_train = A(I_train, :); 163 | %y_train = y(I_train, :); 164 | 165 | %[logit_model2, boost_model2] = run_pipeline(A_train, y_train); 166 | logit_model2 = logit_model; 167 | boost_model2 = boost_model; 168 | 169 | %graft data 170 | if (cuckoo_only) 171 | [A_test, y_test,I_test] = gen_graft_data(A,y,cuckoo_idx,splunk_idx, I_test, logit_model2); 172 | else 173 | I_test = I_test & ~splunk_idx; 174 | A_test = A(I_test, :); 175 | y_test = y(I_test, :); 176 | end 177 | 178 | 179 | [logit2] = record_run(A_test, y_test, logit_model2, logit2); 180 | [bagged2] = record_run(A_test, y_test, boost_model2, bagged2); 181 | 182 | I_test = valid & ~I_train & (y==0 | ~cuckoo_idx | t_created>=(t_smallest-86400 * 365 * 0)); 183 | 184 | if (sum(I_train & I_test)>0) 185 | error('Testing and training intersecting.'); 186 | end 187 | 188 | %A_train = A(I_train, :); 189 | %y_train = y(I_train, :); 190 | 191 | %[logit_model3, boost_model3] = run_pipeline(A_train, y_train); 192 | logit_model3 = logit_model; 193 | boost_model3 = boost_model; 194 | 195 | 196 | %graft data 197 | if (cuckoo_only) 198 | [A_test, y_test,I_test] = 
gen_graft_data(A,y,cuckoo_idx,splunk_idx, I_test, logit_model3); 199 | else 200 | I_test = I_test & ~splunk_idx; 201 | A_test = A(I_test, :); 202 | y_test = y(I_test, :); 203 | end 204 | 205 | [logit3] = record_run(A_test, y_test, logit_model3, logit3); 206 | [bagged3] = record_run(A_test, y_test, boost_model3, bagged3); 207 | 208 | end 209 | 210 | end 211 | 212 | if (time_split) 213 | semilogx(mean(logit.X) , mean(logit.Y),'-k', mean(logit2.X) , mean(logit2.Y),'-b', mean(logit3.X) , mean(logit3.Y),'-r', mean(bagged.X) , mean(bagged.Y), '--k', mean(bagged2.X) , mean(bagged2.Y), '--b', mean(bagged3.X) , mean(bagged3.Y), '--r'); 214 | legend(sprintf('0 year, AUC=%.2f', mean(logit.AUC)), sprintf('1 year, AUC=%.2f', mean(logit2.AUC)), sprintf('2 year, AUC=%.2f', mean(logit3.AUC)), 'Location','SouthEast'); 215 | %plot(mean(logit.X) , mean(logit.Y),'-k', mean(logit2.X) , mean(logit2.Y),'-r'); 216 | %legend(sprintf('LR, AUC=%.2f, 1 year', mean(logit.AUC)), sprintf('LR, AUC=%.2f, 2 years', mean(logit2.AUC)), 'Location','SouthEast'); 217 | else 218 | 219 | semilogx(mean(logit.X) , mean(logit.Y),'-k', mean(bagged.X) , mean(bagged.Y), '--k'); 220 | legend(sprintf('AUC=%.2f', mean(logit.AUC)), 'Location','SouthEast'); 221 | %plot(mean(logit.X) , mean(logit.Y),'-k'); 222 | end 223 | 224 | xlim([2e-4, 0.1]); 225 | ylim([0, 1.0]); 226 | set(gca, 'XMinorTick','on','YMinorTick','on','TickLength',[0.025 0.025]); 227 | 228 | end 229 |
function [base] = init()
% INIT  Create an empty accumulator for per-fold ROC results.
%   The returned struct has the empty fields X, Y, AUC and AUC_splunk, which
%   record_run appends to.
base = struct('X', [], 'Y', [], 'AUC', [], 'AUC_splunk', []);
end

function [base,T02, T05] = record_run(A, y, model, base)
% RECORD_RUN  Append one model's ROC curve to an accumulator.
%
%   [base, T02, T05] = record_run(A, y, model, base)
%
%   Calls model.getROCPoints(A, y) and stores the returned X and Y as a new
%   row of base.X / base.Y and the returned AUC as a new element of base.AUC.
%   base.AUC_splunk receives a constant 0 for every run.
%
%   T02 - entry of the returned threshold vector at the first X >= 0.01
%   T05 - entry of the returned threshold vector at the first X >= 0.001

[xs, ys, thresholds, auc] = model.getROCPoints(A, y);

first_at_1e2 = find(xs>=.01,1,'first');
first_at_1e3 = find(xs>=.001,1,'first');
T02 = thresholds(first_at_1e2);
T05 = thresholds(first_at_1e3);

% one more row / element per recorded run
base.X(end+1,:) = xs;
base.Y(end+1,:) = ys;
base.AUC(end+1) = auc;
base.AUC_splunk(end+1) = 0;

end
251 | 252 | 253 | function [h1, h2] = virus_names(y,
pred02, pred05, virus) 254 | 255 | if (sum(~isnan(pred02))==0) 256 | h1 = gca; 257 | %h2 = axes(); 258 | h2 = gca; 259 | return; 260 | end 261 | 262 | 263 | for iter=1:length(virus) 264 | if (strcmp(virus{iter}, 'Trojan.Win32.Generic')) 265 | virus{iter} = 'Generic'; 266 | else 267 | x = strsplit(virus{iter},'.'); 268 | virus{iter} = lower(x{1}); 269 | x = strsplit(virus{iter},'-'); 270 | virus{iter} = lower(x{1}); 271 | end 272 | end 273 | 274 | [unique_names,~,z] = unique(virus); 275 | d = hist(max(0, z(y>0 & ~isnan(pred02))),length(unique_names)); 276 | 277 | d_after = hist(z(y>0 & pred05>0),length(unique_names)); 278 | d_after2 = hist(z(y>0 & pred02>0),length(unique_names)); 279 | 280 | [~,I] = sort(d,'descend'); 281 | 282 | I_label = []; 283 | for iter=1:length(I) 284 | if (strcmp(unique_names(I(iter)),'')==0 && strcmp(unique_names(I(iter)),'Generic')==0) 285 | I_label = [I_label I(iter)]; 286 | end 287 | 288 | if (length(I_label)>=10) 289 | break; 290 | end 291 | end 292 | 293 | barh([d_after(I_label)'./d(I_label)', d_after2(I_label)'./d(I_label)']); 294 | 295 | hold on; 296 | 297 | set(gca, 'YDir','reverse'); 298 | set(gca,'YTickLabel',unique_names(I_label), 'XMinorTick','on','TickLength',[0.025 0.025]); 299 | ylabel('Kaspersky Classification'); 300 | xlim([0, 1.0]); 301 | h1 = gca; 302 | 303 | %legend('FPR=10^{-2}', 'FPR=10^{-3}','Location','SouthWest' ); 304 | %legend('boxoff'); 305 | 306 | h2 = axes('Position',get(gca,'Position'),... 307 | 'XAxisLocation','top',... 308 | 'YAxisLocation','right',... 
309 | 'Color','none'); 310 | 311 | set(h2, 'Ylim', get(h1,'Ylim'), 'YDir','reverse', 'XTickLabel', [], 'YTickLabel', d(I_label), 'YTick', 1:length(I_label)); 312 | ylabel(h2, '# Observations'); 313 | axes(h1); 314 | 315 | end 316 | -------------------------------------------------------------------------------- /src/main/matlab/make_fig3.m: -------------------------------------------------------------------------------- 1 | function make_fig3(A, y, cuckoo_idx, splunk_idx, virus_kasp, virus_mcafee, virus_symantec, t_created) 2 | 3 | fold = 2; 4 | cutoff = .3; 5 | 6 | I = cv_part(y, fold, cutoff, 'standard', t_created, cuckoo_idx, splunk_idx, []); 7 | 8 | valid = I{1}.training | I{1}.test; 9 | 10 | kasp = get_active(virus_kasp); 11 | mcafee = get_active(virus_mcafee); 12 | symantec = get_active(virus_symantec); 13 | 14 | num_mal = sum(valid & y>0); 15 | fprintf('Number of malware: %d.\n', num_mal); 16 | fprintf('kasp fraction detected: %.3f.\n', sum(kasp & valid & y>0)/num_mal); 17 | fprintf('mcafee fraction detected: %.3f.\n', sum(mcafee & valid & y>0)/num_mal); 18 | fprintf('symantec fraction detected: %.3f.\n', sum(symantec & valid & y>0)/num_mal); 19 | fprintf('mcafee or symantec fraction detected: %.3f.\n', sum((mcafee | symantec) & valid & y>0)/num_mal); 20 | fprintf('mcafee or symantec or kasp fraction detected: %.3f.\n', sum((kasp | mcafee | symantec) & valid & y>0)/num_mal); 21 | 22 | h = figure; 23 | set(h,'Renderer','painters','Position',[100 900 650 200]); 24 | 25 | subaxis(1,3,1,'Margin',0.00,'Padding',-0.00,'PaddingBottom',0.0,'PaddingTop',0.07,'PaddingLeft',0.03,'PaddingRight',-0.00,'MarginBottom',0.15,'MarginLeft',0.04,'MarginRight',0.03,'MarginTop',0.00); 26 | 27 | ha = subaxis(1,3,1,1); 28 | [pred2, pred5] = run(A,y,I, kasp, mcafee, symantec, cuckoo_idx, splunk_idx); 29 | pos1 = get(gca,'Position'); 30 | annotation(h,'textbox',[pos1(1)-0.07,pos1(2)+pos1(4)+0.02,.1,.05],'String','(A)','fontsize',14,'EdgeColor','none'); 31 | ylabel('TPR'); 32 | 
xlabel('FPR'); 33 | 34 | ha = subaxis(1,3,2,1); 35 | run2(A,y,I, kasp, mcafee, symantec, cuckoo_idx, splunk_idx); 36 | pos1 = get(gca,'Position'); 37 | annotation(h,'textbox',[pos1(1)-0.07,pos1(2)+pos1(4)+0.03,.1,.05],'String','(B)','fontsize',14,'EdgeColor','none'); 38 | ylabel('TPR'); 39 | xlabel('FPR'); 40 | 41 | ha = subaxis(1,3,3,1); 42 | label_score(y, pred2, pred5); 43 | pos1 = get(gca,'Position'); 44 | annotation(h,'textbox',[pos1(1)-0.07,pos1(2)+pos1(4)+0.03,.1,.05],'String','(C)','fontsize',14,'EdgeColor','none'); 45 | ylabel('Fraction Detected'); 46 | xlabel('VirusTotal Score'); 47 | 48 | end 49 |
function label = get_active(virus)
% GET_ACTIVE  Flag the entries of a label list in which the pattern is absent.
%
%   label = get_active(virus)
%
%   virus - list of strings handed to strfind
%   label - logical array of size(virus); true where strfind reports no match
%
%   NOTE(review): the search pattern is an empty string literal in this copy
%   of the file; presumably a placeholder token was lost when the file was
%   extracted - confirm against the repository before relying on the result.

matches = strfind(virus, '');
label = false(size(virus));
k = 1;
while (k <= length(matches))
    label(k) = isempty(matches{k});
    k = k + 1;
end
end
60 | 61 | function [pred2, pred5] = run2(A,y,I, kasp, mcafee, symantec, cuckoo_idx, splunk_idx) 62 | 63 | [k1, k2, pred2, pred5] = get_roc(A,y,I, kasp | mcafee | symantec, cuckoo_idx, splunk_idx); 64 | [m1, m2] = get_roc(A,y,I, mcafee | symantec, cuckoo_idx, splunk_idx); 65 | 66 | %line_fewer_markers(mean(k1.X) , mean(k1.Y),5, '--sk', 'Spacing', 'logx', 'markersize',5); 67 | %hold on; 68 | line_fewer_markers(mean(k2.X) , mean(k2.Y),5, '-sk', 'Spacing', 'logx', 'markersize',5); 69 | hold on; 70 | %line_fewer_markers(mean(m1.X) , mean(m1.Y),5, '--xb', 'Spacing', 'logx', 'markersize',5); 71 | line_fewer_markers(mean(m2.X) , mean(m2.Y),5, '-xb', 'Spacing', 'logx', 'markersize',5); 72 | hold off; 73 | 74 | % legend(... 75 | % sprintf('AUC=%.2f', mean(k1.AUC)), sprintf('AUC=%.2f', mean(k2.AUC)),... 76 | % sprintf('AUC=%.2f', mean(m1.AUC)), sprintf('AUC=%.2f', mean(m2.AUC)),... 77 | % 'Location','SouthEast'); 78 | 79 | legend(... 80 | sprintf('AUC=%.2f', mean(k2.AUC)),... 81 | sprintf('AUC=%.2f', mean(m2.AUC)),...
82 | 'Location','SouthEast'); 83 | 84 | xlim([0, 0.1]); 85 | ylim([0, 1.0]); 86 | set(gca, 'XScale','log', 'XMinorTick','on','YMinorTick','on','TickLength',[0.025 0.025]); 87 | 88 | end 89 | 90 | function [pred2, pred5] = run(A,y,I, kasp, mcafee, symantec, cuckoo_idx, splunk_idx) 91 | 92 | [k1, k2, pred2, pred5] = get_roc(A,y,I, kasp, cuckoo_idx, splunk_idx); 93 | [m1, m2] = get_roc(A,y,I, mcafee, cuckoo_idx, splunk_idx); 94 | [s1, s2] = get_roc(A,y,I, symantec, cuckoo_idx, splunk_idx); 95 | 96 | %line_fewer_markers(mean(k1.X) , mean(k1.Y),5, '--sk', 'Spacing', 'logx', 'markersize',5); 97 | %hold on; 98 | line_fewer_markers(mean(k2.X) , mean(k2.Y),5, '-sk', 'Spacing', 'logx', 'markersize',5); 99 | hold on; 100 | %line_fewer_markers(mean(m1.X) , mean(m1.Y),5, '--xr', 'Spacing', 'logx', 'markersize',5); 101 | line_fewer_markers(mean(m2.X) , mean(m2.Y),5, '-xr', 'Spacing', 'logx', 'markersize',5); 102 | %line_fewer_markers(mean(s1.X) , mean(s1.Y),5, '--^b', 'Spacing', 'logx', 'markersize',5); 103 | line_fewer_markers(mean(s2.X) , mean(s2.Y),5, '-^b', 'Spacing', 'logx', 'markersize',5); 104 | hold off; 105 | 106 | % legend(... 107 | % sprintf('AUC=%.2f', mean(k1.AUC)), sprintf('AUC=%.2f', mean(k2.AUC)),... 108 | % sprintf('AUC=%.2f', mean(m1.AUC)), sprintf('AUC=%.2f', mean(m2.AUC)),... 109 | % sprintf('AUC=%.2f', mean(s1.AUC)), sprintf('AUC=%.2f', mean(s2.AUC)),... 110 | % 'Location','SouthEast'); 111 | 112 | legend(... 113 | sprintf('AUC=%.2f', mean(k2.AUC)),... 114 | sprintf('AUC=%.2f', mean(m2.AUC)),... 115 | sprintf('AUC=%.2f', mean(s2.AUC)),... 
116 | 'Location','SouthEast'); 117 | 118 | 119 | xlim([0, 0.1]); 120 | ylim([0, 1.0]); 121 | set(gca, 'XScale','log', 'XMinorTick','on','YMinorTick','on','TickLength',[0.025 0.025]); 122 | 123 | end 124 |
function [r1, r2, pred2, pred5] = get_roc(A,y,I, virus, cuckoo_idx, splunk_idx)
% GET_ROC  Train per fold on rows selected by VIRUS and score the complement.
%
%   [r1, r2, pred2, pred5] = get_roc(A, y, I, virus, cuckoo_idx, splunk_idx)
%
%   A and y are binarised (>0) first. For every fold in I:
%     * the model is fitted by run_pipeline on training rows where y==0 or
%       virus is true;
%     * the candidate test rows are the fold's test rows where y==0 or virus
%       is false; gen_graft_data turns them into the actual test set and
%       returns the final row mask;
%     * record_run appends the resulting curve to r2 and yields the two
%       thresholds T02 and T05.
%
%   r1 is a copy of r2 (kept so callers can still request both outputs).
%   pred2 / pred5 are NaN except at the evaluated test rows, where they hold
%   predict(A_test)>=T02 and predict(A_test)>=T05. They are only filled in
%   when the caller asks for more than two outputs.

A = double(A>0);
y = double(y>0);

r1 = init();
r2 = init();
pred2 = nan(size(y));
pred5 = nan(size(y));
want_predictions = (nargout>2);

for fold=1:length(I)

    train_rows = I{fold}.training & (y==0 | virus);
    logit_model = run_pipeline(A(train_rows,:), y(train_rows));

    test_rows = I{fold}.test & (y==0 | ~virus);
    [A_test, y_test, test_rows] = gen_graft_data(A,y,cuckoo_idx,splunk_idx, test_rows, logit_model);

    [r2, T02, T05] = record_run(A_test,y_test, logit_model, r2);
    r1 = r2;

    if (want_predictions)
        % thresholded predictions for the rows that were actually tested
        pred2(test_rows) = logit_model.predict(A_test)>=T02;
        pred5(test_rows) = logit_model.predict(A_test)>=T05;
    end
end

end
163 | 164 | function [h1] = label_score(y, pred2, pred5) 165 | 166 | edges = [.3, .4, .5, .6, .7, .8, .9, 1.0]; 167 | 168 | val = zeros(2, length(edges)-1); 169 | s = zeros(2, length(edges)-1); 170 | for iter=1:(length(edges)-1) 171 | 172 | v1 = ~isnan(pred2) & y>edges(iter) & y<=edges(iter+1); 173 | v2 = ~isnan(pred5) & y>edges(iter) & y<=edges(iter+1); 174 | 175 | t2 = pred2; 176 | t5 = pred5; 177 | 178 | t2(~v1) = 0; 179 | t5(~v2) = 0; 180 | 181 | d1 = sum(v1 & t2)/sum(v1); 182 | d2 = sum(v2 & t5)/sum(v2); 183 | 184 | val(2,iter) = d1; 185 | val(1,iter) = d2; 186 | 187 | s(2,iter) = sum(v1); 188 | s(1,iter) = sum(v2); 189 | end 190 | 191 | v1 =
~isnan(pred5); 192 | v2 = ~isnan(pred2); 193 | 194 | fprintf('TPR for known malware at 0.001 FPR: %.4f\n', sum(y(v1)>1 & pred5(v1))/sum(y(v1)>1)); 195 | fprintf('TPR for known malware at 0.01 FPR: %.4f\n', sum(y(v2)>1 & pred2(v2))/sum(y(v2)>1)); 196 | 197 | %val = ones(2,length(edges)-1); 198 | 199 | bar(val'); 200 | 201 | set(gca,'XTickLabel',edges(1:end-1), 'YMinorTick','on','TickLength',[0.025 0.025]); 202 | ylabel('VirusTotal Score'); 203 | h1 = gca; 204 | %xlim([edges(1), 1.1]); 205 | 206 | %legend('FPR=10^{-2}', 'FPR=10^{-3}','Location','SouthWest' ); 207 | %legend('boxoff'); 208 | 209 | end 210 |
function [base] = init()
% INIT  Return a result accumulator whose fields X, Y, AUC and AUC_splunk
%   are all empty; record_run grows them by one entry per call.
base = struct('X', [], 'Y', [], 'AUC', [], 'AUC_splunk', []);
end

function [base,T02, T05] = record_run(A, y, model, base)
% RECORD_RUN  Evaluate MODEL on (A, y) and append the outcome to BASE.
%
%   [base, T02, T05] = record_run(A, y, model, base)
%
%   The curve comes from model.getROCPoints(A, y). Its X and Y vectors are
%   appended as new rows of base.X and base.Y, its AUC as a new element of
%   base.AUC, and a constant 0 as a new element of base.AUC_splunk.
%
%   T02 - threshold at the first curve point with X >= 0.01
%   T05 - threshold at the first curve point with X >= 0.001

[curve_x, curve_y, curve_t, auc] = model.getROCPoints(A, y);

pos_1e2 = find(curve_x>=.01,1,'first');
pos_1e3 = find(curve_x>=.001,1,'first');
T02 = curve_t(pos_1e2);
T05 = curve_t(pos_1e3);

base.X(end+1,:) = curve_x;
base.Y(end+1,:) = curve_y;
base.AUC(end+1) = auc;
base.AUC_splunk(end+1) = 0;   % placeholder value, always zero

end
-------------------------------------------------------------------------------- /src/main/matlab/parseArgs.m: -------------------------------------------------------------------------------- 1 | function ArgStruct=parseArgs(args,ArgStruct,varargin) 2 | % Helper function for parsing varargin. 3 | % 4 | % 5 | % ArgStruct=parseArgs(varargin,ArgStruct[,FlagtypeParams[,Aliases]]) 6 | % 7 | % * ArgStruct is the structure full of named arguments with default values. 8 | % * Flagtype params is params that don't require a value. (the value will be set to 1 if it is present) 9 | % * Aliases can be used to map one argument-name to several argstruct fields 10 | % 11 | % 12 | % example usage: 13 | % -------------- 14 | % function parseargtest(varargin) 15 | % 16 | % %define the acceptable named arguments and assign default values 17 | % Args=struct('Holdaxis',0, ...
18 | % 'SpacingVertical',0.05,'SpacingHorizontal',0.05, ... 19 | % 'PaddingLeft',0,'PaddingRight',0,'PaddingTop',0,'PaddingBottom',0, ... 20 | % 'MarginLeft',.1,'MarginRight',.1,'MarginTop',.1,'MarginBottom',.1, ... 21 | % 'rows',[],'cols',[]); 22 | % 23 | % %The capital letters define abrreviations. 24 | % % Eg. parseargtest('spacingvertical',0) is equivalent to parseargtest('sv',0) 25 | % 26 | % Args=parseArgs(varargin,Args, ... % fill the arg-struct with values entered by the user 27 | % {'Holdaxis'}, ... %this argument has no value (flag-type) 28 | % {'Spacing' {'sh','sv'}; 'Padding' {'pl','pr','pt','pb'}; 'Margin' {'ml','mr','mt','mb'}}); 29 | % 30 | % disp(Args) 31 | % 32 | % 33 | % 34 | % 35 | % % Aslak Grinsted 2003 36 | 37 | Aliases={}; 38 | FlagTypeParams=''; 39 | 40 | if (length(varargin)>0) 41 | FlagTypeParams=strvcat(varargin{1}); 42 | if length(varargin)>1 43 | Aliases=varargin{2}; 44 | end 45 | end 46 | 47 | 48 | %---------------Get "numeric" arguments 49 | NumArgCount=1; 50 | while (NumArgCount<=size(args,2))&(~ischar(args{NumArgCount})) 51 | NumArgCount=NumArgCount+1; 52 | end 53 | NumArgCount=NumArgCount-1; 54 | if (NumArgCount>0) 55 | ArgStruct.NumericArguments={args{1:NumArgCount}}; 56 | else 57 | ArgStruct.NumericArguments={}; 58 | end 59 | 60 | 61 | %--------------Make an accepted fieldname matrix (case insensitive) 62 | Fnames=fieldnames(ArgStruct); 63 | for i=1:length(Fnames) 64 | name=lower(Fnames{i,1}); 65 | Fnames{i,2}=name; %col2=lower 66 | AbbrevIdx=find(Fnames{i,1}~=name); 67 | Fnames{i,3}=[name(AbbrevIdx) ' ']; %col3=abreviation letters (those that are uppercase in the ArgStruct) e.g. SpacingHoriz->sh 68 | %the space prevents strvcat from removing empty lines 69 | Fnames{i,4}=isempty(strmatch(Fnames{i,2},FlagTypeParams)); %Does this parameter have a value? (e.g. 
not flagtype) 70 | end 71 | FnamesFull=strvcat(Fnames{:,2}); 72 | FnamesAbbr=strvcat(Fnames{:,3}); 73 | 74 | if length(Aliases)>0 75 | for i=1:length(Aliases) 76 | name=lower(Aliases{i,1}); 77 | FieldIdx=strmatch(name,FnamesAbbr,'exact'); %try abbreviations (must be exact) 78 | if isempty(FieldIdx) 79 | FieldIdx=strmatch(name,FnamesFull); %&??????? exact or not? 80 | end 81 | Aliases{i,2}=FieldIdx; 82 | AbbrevIdx=find(Aliases{i,1}~=name); 83 | Aliases{i,3}=[name(AbbrevIdx) ' ']; %the space prevents strvcat from removing empty lines 84 | Aliases{i,1}=name; %dont need the name in uppercase anymore for aliases 85 | end 86 | %Append aliases to the end of FnamesFull and FnamesAbbr 87 | FnamesFull=strvcat(FnamesFull,strvcat(Aliases{:,1})); 88 | FnamesAbbr=strvcat(FnamesAbbr,strvcat(Aliases{:,3})); 89 | end 90 | 91 | %--------------get parameters-------------------- 92 | l=NumArgCount+1; 93 | while (l<=length(args)) 94 | a=args{l}; 95 | if ischar(a) 96 | paramHasValue=1; % assume that the parameter has is of type 'param',value 97 | a=lower(a); 98 | FieldIdx=strmatch(a,FnamesAbbr,'exact'); %try abbreviations (must be exact) 99 | if isempty(FieldIdx) 100 | FieldIdx=strmatch(a,FnamesFull); 101 | end 102 | if (length(FieldIdx)>1) %shortest fieldname should win 103 | [mx,mxi]=max(sum(FnamesFull(FieldIdx,:)==' ',2)); 104 | FieldIdx=FieldIdx(mxi); 105 | end 106 | if FieldIdx>length(Fnames) %then it's an alias type. 107 | FieldIdx=Aliases{FieldIdx-length(Fnames),2}; 108 | end 109 | 110 | if isempty(FieldIdx) 111 | error(['Unknown named parameter: ' a]) 112 | end 113 | for curField=FieldIdx' %if it is an alias it could be more than one. 114 | if (Fnames{curField,4}) 115 | val=args{l+1}; 116 | else 117 | val=1; %parameter is of flag type and is set (1=true).... 
118 | end 119 | ArgStruct.(Fnames{curField,1})=val; 120 | end 121 | l=l+1+Fnames{FieldIdx(1),4}; %if a wildcard matches more than one 122 | else 123 | error(['Expected a named parameter: ' num2str(a)]) 124 | end 125 | end 126 | 127 | -------------------------------------------------------------------------------- /src/main/matlab/read_data.m: -------------------------------------------------------------------------------- 1 | function [A, y, names, virus_kasp, virus_mcafee, virus_symantec, column_labels, t_created, cuckoo_idx, splunk_idx] = read_data(dirName, filter) 2 | 3 | if (nargin<2) 4 | filter = {}; 5 | filter{1} = '[cuckoo]'; 6 | %filter = {}; 7 | end 8 | 9 | names = {}; 10 | column_labels = {}; 11 | splunk_idx = []; 12 | enterprise_idx = []; 13 | t_created = []; 14 | 15 | A = read_A(fullfile(dirName, 'pace_feature_matrix.txt')); 16 | y = dlmread(fullfile(dirName, 'pace_classification.txt')); 17 | cuckoo_idx = 1:length(y); 18 | 19 | try 20 | t_created = dlmread(fullfile(dirName, 'pace_created_labels.txt')); 21 | catch 22 | fprintf('Could not read time stamps.\n'); 23 | end 24 | 25 | try 26 | A_splunk = read_A(fullfile(dirName, 'pace_splunk_feature_matrix.txt')); 27 | y_splunk = zeros(size(A_splunk,1),1); 28 | 29 | splunk_idx = (1:size(A_splunk,1))+size(A,1); 30 | 31 | A = [A; A_splunk]; 32 | y = [y; y_splunk]; 33 | 34 | %adjust also the timestamps for splunk 35 | t_largest = round(86400 * (now - datenum('1970', 'yyyy'))); 36 | t_created = [t_created; t_largest*ones(length(splunk_idx), 1)]; 37 | catch 38 | fprintf('Could not read splunk data.\n'); 39 | end 40 | 41 | try 42 | A_enterprise = read_A(fullfile(dirName, 'pace_enterprise_feature_matrix.txt')); 43 | y_enterprise = zeros(size(A_enterprise,1),1); 44 | 45 | enterprise_idx = (1:size(A_enterprise,1))+size(A,1); 46 | 47 | A = [A; A_enterprise]; 48 | y = [y; y_enterprise]; 49 | 50 | %adjust also the timestamps for splunk 51 | t_largest = round(86400 * (now - datenum('1970', 'yyyy'))); 52 | t_created = 
[t_created; t_largest*ones(length(enterprise_idx), 1)]; 53 | catch 54 | fprintf('Could not read enterprise data.\n'); 55 | end 56 | 57 | try 58 | names = read_lines(fullfile(dirName, 'pace_row_labels.txt')); 59 | names{size(A,1),1} = []; 60 | try 61 | names_splunk = read_lines(fullfile(dirName, 'pace_splunk_row_labels_anon.txt')); 62 | names(splunk_idx) = names_splunk; 63 | catch 64 | fprintf('Could not read splunk ware names.\n'); 65 | end 66 | 67 | try 68 | names_enterprise = read_lines(fullfile(dirName, 'pace_enterprise_row_labels_anon.txt')); 69 | names(enterprise_idx) = names_enterprise; 70 | catch 71 | fprintf('Could not read enterprise ware names.\n'); 72 | end 73 | catch 74 | fprintf('Could not read ware names.\n'); 75 | end 76 | 77 | try 78 | virus_kasp = read_lines(fullfile(dirName, 'pace_malware_kaspersky_labels.txt')); 79 | s = size(virus_kasp,1); 80 | if (s=2; 131 | A = A(:,em); 132 | if (~isempty(column_labels)) 133 | column_labels = column_labels(em); 134 | end 135 | 136 | %now apply the filter 137 | if (~isempty(column_labels) && ~isempty(filter)) 138 | I_bad = []; 139 | %filter the columns 140 | for iter=1:length(filter) 141 | v = strfind(column_labels, filter{iter}); 142 | for iter2=1:length(v) 143 | if (~isempty(v{iter2})) 144 | I_bad = [I_bad, iter2]; 145 | end 146 | end 147 | end 148 | I_bad = unique(I_bad); 149 | indx = setdiff(1:length(column_labels), I_bad); 150 | 151 | A = A(:,indx); 152 | column_labels = column_labels(indx); 153 | end 154 | 155 | 156 | if (~islogical(cuckoo_idx)) 157 | cuckoo_idx_temp = false(length(y),1); 158 | cuckoo_idx_temp(cuckoo_idx) = true; 159 | cuckoo_idx = cuckoo_idx_temp; 160 | end 161 | if (~islogical(splunk_idx)) 162 | splunk_idx_temp = false(length(y),1); 163 | splunk_idx_temp(splunk_idx) = true; 164 | splunk_idx = splunk_idx_temp; 165 | end 166 | 167 | valid = true(size(A,1),1); 168 | 169 | %remove empty rows from cuckoo results 170 | num_items = sum(A(cuckoo_idx,:),2); 171 | valid(cuckoo_idx) = 
num_items>=(mean(num_items)-3*std(num_items)); 172 | 173 | A = A(valid,:); 174 | y = y(valid,:); 175 | t_created = t_created(valid); 176 | if (~isempty(virus_kasp)) 177 | virus_kasp = virus_kasp(valid); 178 | end 179 | if (~isempty(virus_symantec)) 180 | virus_symantec = virus_symantec(valid); 181 | end 182 | if (~isempty(virus_mcafee)) 183 | virus_mcafee = virus_mcafee(valid); 184 | end 185 | if (~isempty(names)) 186 | names = names(valid); 187 | end 188 | 189 | cuckoo_idx = cuckoo_idx(valid); 190 | splunk_idx = splunk_idx(valid); 191 | 192 | 193 | function [A] = read_A(fileName) 194 | 195 | S = dlmread(fileName,'\t'); 196 | 197 | if (size(S,2)~=3) 198 | A = S; 199 | return; 200 | end 201 | 202 | %create the matrix 203 | A = spconvert([S(2:end,1)+1, S(2:end,2)+1, S(2:end,3)]); 204 | 205 | %check that the sizes are valid 206 | if (size(A,2)size(txt,1)) 222 | % txt{count*2,1} = []; 223 | % end 224 | % txt{count,1} = tline; 225 | % tline = fgetl(fid); 226 | % count = count+1; 227 | % end 228 | % 229 | % %now cut 230 | % txt = txt(1:(count-1), :); 231 | 232 | txt = textscan(fid,'%s','Delimiter','\n'); 233 | 234 | txt = txt{1}; 235 | 236 | fclose(fid); -------------------------------------------------------------------------------- /src/main/matlab/rotateticklabel.m: -------------------------------------------------------------------------------- 1 | function th=rotateticklabel(h,rot,demo) 2 | %ROTATETICKLABEL rotates tick labels 3 | % TH=ROTATETICKLABEL(H,ROT) is the calling form where H is a handle to 4 | % the axis that contains the XTickLabels that are to be rotated. ROT is 5 | % an optional parameter that specifies the angle of rotation. The default 6 | % angle is 90. TH is a handle to the text objects created. For long 7 | % strings such as those produced by datetick, you may have to adjust the 8 | % position of the axes so the labels don't get cut off. 9 | % 10 | % Of course, GCA can be substituted for H if desired. 
11 | % 12 | % TH=ROTATETICKLABEL([],[],'demo') shows a demo figure. 13 | % 14 | % Known deficiencies: if tick labels are raised to a power, the power 15 | % will be lost after rotation. 16 | % 17 | % See also datetick. 18 | 19 | % Written Oct 14, 2005 by Andy Bliss 20 | % Copyright 2005 by Andy Bliss 21 | 22 | %DEMO: 23 | if nargin==3 24 | x=[now-.7 now-.3 now]; 25 | y=[20 35 15]; 26 | figure 27 | plot(x,y,'.-') 28 | datetick('x',0,'keepticks') 29 | h=gca; 30 | set(h,'position',[0.13 0.35 0.775 0.55]) 31 | rot=90; 32 | end 33 | 34 | %set the default rotation if user doesn't specify 35 | if nargin==1 36 | rot=90; 37 | end 38 | %make sure the rotation is in the range 0:360 (brute force method) 39 | while rot>360 40 | rot=rot-360; 41 | end 42 | while rot<0 43 | rot=rot+360; 44 | end 45 | %get current tick labels 46 | a=get(h,'XTickLabel'); 47 | %erase current tick labels from figure 48 | set(h,'XTickLabel',[]); 49 | %get tick label positions 50 | b=get(h,'XTick'); 51 | c=get(h,'YTick'); 52 | %make new tick labels 53 | if rot<180 54 | th=text(b,repmat(c(1)-.1*(c(2)-c(1)),length(b),1),a,'HorizontalAlignment','right','rotation',rot); 55 | else 56 | th=text(b,repmat(c(1)-.1*(c(2)-c(1)),length(b),1),a,'HorizontalAlignment','left','rotation',rot); 57 | end 58 | 59 | -------------------------------------------------------------------------------- /src/main/matlab/run_pipeline.m: --------------------------------------------------------------------------------
function [logit_model, boost_model] = run_pipeline(A, y)
% RUN_PIPELINE  Fit the classification model used by the figure scripts.
%
%   [logit_model, boost_model] = run_pipeline(A, y)
%
%   Builds a LogitLinearModel from (A, y) with the 'SelectFeatures' option
%   set to 50000. The ensemble path (ensemble_model) is not called here, so
%   boost_model is simply assigned from logit_model.

fit_options = setoptions('SelectFeatures', 50000);

logit_model = LogitLinearModel(A, y,[],fit_options);
boost_model = logit_model;

fprintf('Finished running pipeline.\n');
end
12 | 13 | function [model] = ensemble_model(A, y) 14 | 15 | model = BoostModel(A, y,[],setoptions('Subsample',.40,'SelectFeatures', 50000)); 16 |
model.addBagModels(3,'logit'); 17 | 18 | models = model.getModels(); 19 | 20 | w = models{1}.getModelWeights(); 21 | total = abs(w); 22 | for iter=2:length(models) 23 | w = models{iter}.getModelWeights(); 24 | total = total+abs(w); 25 | end 26 | 27 | maxw = max(total); 28 | 29 | model = LogitLinearModel(A, y,[],setoptions('SelectFeatures', abs(total)>maxw*1.0e-6)); 30 | 31 | end -------------------------------------------------------------------------------- /src/main/matlab/setoptions.m: --------------------------------------------------------------------------------
function options = setoptions(varargin)
% SETOPTIONS  Build or update the options struct from name/value pairs.
%
%   options = setoptions('Name', value, ...)
%   options = setoptions(options, 'Name', value, ...)
%
%   When the first argument is a struct it is used as the starting point;
%   otherwise the defaults below are used. Names are case-insensitive:
%
%     'SelectFeatures'  -> options.feature_select    (default 10000)
%     'Subsample'       -> options.subsample         (default 0.2)
%     'OutBagEstimate'  -> options.out_bag_estimate  (default false)
%     'LassoAlpha'      -> options.lasso_alpha       (default 1.0)
%
%   Raises an error for an unknown option name, for a name that is not a
%   character vector, and for a name that is not followed by a value.

start = 1;
if (~isempty(varargin) && isstruct(varargin{1}))
    options = varargin{1};
    start = start+1;
else
    %defaults
    options.feature_select = 10000;
    options.subsample = 0.2;
    options.out_bag_estimate = false;
    options.lasso_alpha = 1.0;
end

for iter=start:2:length(varargin)

    name = varargin{iter};
    if (~ischar(name))
        error('Option names must be character vectors.');
    end

    switch lower(name)
        case 'selectfeatures'
            field = 'feature_select';
        case 'subsample'
            field = 'subsample';
        case 'lassoalpha'
            field = 'lasso_alpha';
        case 'outbagestimate'
            % previously this branch overwrote options.subsample
            field = 'out_bag_estimate';
        otherwise
            error(['Unknown option ',name]);
    end

    % a trailing name without a value used to fail with an index error
    if (iter == length(varargin))
        error(['Missing value for option ',name]);
    end

    options.(field) = varargin{iter+1};

end;
-------------------------------------------------------------------------------- /src/main/matlab/subaxis.m: -------------------------------------------------------------------------------- 1 | function h=subaxis(varargin) 2 | %SUBAXIS Create axes in tiled positions.
(just like subplot) 3 | % Usage: 4 | % h=subaxis(rows,cols,cellno[,settings]) 5 | % h=subaxis(rows,cols,cellx,celly[,settings]) 6 | % h=subaxis(rows,cols,cellx,celly,spanx,spany[,settings]) 7 | % 8 | % SETTINGS: Spacing,SpacingHoriz,SpacingVert 9 | % Padding,PaddingRight,PaddingLeft,PaddingTop,PaddingBottom 10 | % Margin,MarginRight,MarginLeft,MarginTop,MarginBottom 11 | % Holdaxis 12 | % 13 | % all units are relative (e.g from 0 to 1) 14 | % 15 | % Abbreviations of parameters can be used.. (Eg MR instead of MarginRight) 16 | % (holdaxis means that it wont delete any axes below.) 17 | % 18 | % 19 | % Example: 20 | % 21 | % >> subaxis(2,1,1,'SpacingVert',0,'MR',0); 22 | % >> imagesc(magic(3)) 23 | % >> subaxis(2,'p',.02); 24 | % >> imagesc(magic(4)) 25 | % 26 | % 2001 / Aslak Grinsted (Feel free to modify this code.) 27 | f=gcf; 28 | 29 | 30 | Args=[]; 31 | UserDataArgsOK=0; 32 | Args=get(f,'UserData'); 33 | if isstruct(Args) 34 | UserDataArgsOK=isfield(Args,'SpacingHorizontal')&isfield(Args,'Holdaxis')&isfield(Args,'rows')&isfield(Args,'cols'); 35 | end 36 | OKToStoreArgs=isempty(Args)|UserDataArgsOK; 37 | 38 | if isempty(Args)&(~UserDataArgsOK) 39 | Args=struct('Holdaxis',0, ... 40 | 'SpacingVertical',0.05,'SpacingHorizontal',0.05, ... 41 | 'PaddingLeft',0,'PaddingRight',0,'PaddingTop',0,'PaddingBottom',0, ... 42 | 'MarginLeft',.1,'MarginRight',.1,'MarginTop',.1,'MarginBottom',.1, ... 
43 | 'rows',[],'cols',[]); 44 | end 45 | Args=parseArgs(varargin,Args,{'Holdaxis'},{'Spacing' {'sh','sv'}; 'Padding' {'pl','pr','pt','pb'}; 'Margin' {'ml','mr','mt','mb'}}); 46 | 47 | if (length(Args.NumericArguments)>1) 48 | Args.rows=Args.NumericArguments{1}; 49 | Args.cols=Args.NumericArguments{2}; 50 | %remove these 2 numerical arguments 51 | Args.NumericArguments={Args.NumericArguments{3:end}}; 52 | end 53 | 54 | if OKToStoreArgs 55 | set(f,'UserData',Args); 56 | end 57 | 58 | 59 | 60 | 61 | switch length(Args.NumericArguments) 62 | case 0 63 | return % no arguments but rows/cols.... 64 | case 1 65 | x1=mod((Args.NumericArguments{1}-1),Args.cols)+1; x2=x1; 66 | y1=floor((Args.NumericArguments{1}-1)/Args.cols)+1; y2=y1; 67 | case 2 68 | x1=Args.NumericArguments{1};x2=x1; 69 | y1=Args.NumericArguments{2};y2=y1; 70 | case 4 71 | x1=Args.NumericArguments{1};x2=x1+Args.NumericArguments{3}-1; 72 | y1=Args.NumericArguments{2};y2=y1+Args.NumericArguments{4}-1; 73 | otherwise 74 | error('subaxis argument error') 75 | end 76 | 77 | 78 | cellwidth=((1-Args.MarginLeft-Args.MarginRight)-(Args.cols-1)*Args.SpacingHorizontal)/Args.cols; 79 | cellheight=((1-Args.MarginTop-Args.MarginBottom)-(Args.rows-1)*Args.SpacingVertical)/Args.rows; 80 | xpos1=Args.MarginLeft+Args.PaddingLeft+cellwidth*(x1-1)+Args.SpacingHorizontal*(x1-1); 81 | xpos2=Args.MarginLeft-Args.PaddingRight+cellwidth*x2+Args.SpacingHorizontal*(x2-1); 82 | ypos1=Args.MarginTop+Args.PaddingTop+cellheight*(y1-1)+Args.SpacingVertical*(y1-1); 83 | ypos2=Args.MarginTop-Args.PaddingBottom+cellheight*y2+Args.SpacingVertical*(y2-1); 84 | 85 | if Args.Holdaxis 86 | h=axes('position',[xpos1 1-ypos2 xpos2-xpos1 ypos2-ypos1]); 87 | else 88 | h=subplot('position',[xpos1 1-ypos2 xpos2-xpos1 ypos2-ypos1]); 89 | end 90 | 91 | 92 | set(h,'box','on'); 93 | %h=axes('position',[x1 1-y2 x2-x1 y2-y1]); 94 | set(h,'units',get(gcf,'defaultaxesunits')); 95 | set(h,'tag','subaxis'); 96 | 97 | 98 | 99 | if (nargout==0) clear h; end; 100 | 
101 | --------------------------------------------------------------------------------