├── LICENSE ├── README.md ├── code ├── equality.py ├── forests.py ├── malware.py └── symbols.py └── data ├── pg23428.txt └── pg5711.txt /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 George Danezis (g.danezis@ucl.ac.uk) 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | trees 2 | ===== 3 | 4 | A quick educational implementation of a random forest classifier and a decision jungle classifier. 5 | 6 | References: 7 | 8 | * A. Criminisi, J. Shotton, and E. Konukoglu, Decision Forests: 9 | A Unified Framework for Classification, Regression, Density Estimation, 10 | Manifold Learning and Semi-Supervised Learning. Foundations and Trends in 11 | Computer Graphics and Computer Vision. NOW Publishers. Vol.7: No 2-3, pp 81-227. 2012. 12 | 13 | * Jamie Shotton, Toby Sharp, Pushmeet Kohli, Sebastian Nowozin, John Winn, 14 | and Antonio Criminisi, Decision Jungles: Compact and Rich Models for 15 | Classification, in Proc. NIPS, 2013. 
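A minimal usage sketch (not part of the original README; it assumes Python 2, which this code targets, a working directory of `code/`, and a few hand-picked substring features — the `__main__` block of `forests.py` instead samples features automatically from the training chunks):

```python
from forests import split_data, build_tree, classify

# Chunk two texts into fixed-length strings labelled 0 (English) and 1 (French).
test_en, train_en = split_data(open("../data/pg23428.txt").read(), label=0, length=200)
test_fr, train_fr = split_data(open("../data/pg5711.txt").read(), label=1, length=200)
train = train_en + train_fr

features = ["the ", " and ", " les ", " de ", "tion"]   # illustrative substring features
tree = build_tree(train, features, levels=5, numfeatures=3)

chunk, true_label = test_en[0]
votes = classify(tree, chunk)                  # Counter of label counts at the leaf reached
predicted = max(votes, key=lambda k: votes[k])
```

A forest is simply a list of such trees trained on random subsamples of the data, with the per-tree vote Counters summed before taking the majority label, as in the `__main__` section of `forests.py`.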
16 | -------------------------------------------------------------------------------- /code/equality.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter, defaultdict 3 | import numpy as np 4 | import copy 5 | import traceback 6 | 7 | from malware import Forest, traverse 8 | 9 | def split_data(data, length=50): 10 | 'Take a large text and divide it into chunks' 11 | strings = [data[i:i+length] for i in range(0, len(data) - length, length)] 12 | 13 | string_data = dict([(s, prefixes(s)) for s in strings]) 14 | all_features = defaultdict(int) 15 | for k,v in string_data.iteritems(): 16 | for vi in v.keys(): 17 | all_features[vi] += 1 18 | 19 | Sz = len(string_data) 20 | print "Samples length = ", Sz 21 | MINS = 200 22 | new_features = dict([(f,v) for f,v in all_features.iteritems() if MINS < v < Sz - MINS ]) 23 | #for f, v in new_features.iteritems(): 24 | # print v, f 25 | 26 | for s in string_data: 27 | for vi in string_data[s].keys(): 28 | if vi not in new_features: 29 | del string_data[s][vi] 30 | 31 | spairs = [] 32 | for i in range(int(len(strings) / 2)): 33 | spairs += [(strings[2*i], strings[2*i+1])] 34 | 35 | random.shuffle(spairs) 36 | train0 = spairs[:len(spairs) / 2] 37 | train1 = [(t0, random.choice(train0)[1]) for t0, _ in train0] 38 | train_labels = [0] * len(train0) + [1] * len(train1) 39 | train = (train0 + train1, train_labels) 40 | 41 | test0 = spairs[len(spairs) / 2:] 42 | test1 = [(t0, random.choice(test0)[1]) for t0, _ in test0] 43 | test_labels = [0] * len(test0) + [1] * len(test1) 44 | test = (test0 + test1, test_labels) 45 | 46 | return string_data, train, test 47 | 48 | 49 | def prefixes(string, min_len=3, max_len=10): 50 | pfx = defaultdict(int) 51 | for i in range(len(string)): 52 | for j in range(min_len, max_len): 53 | if i+j <= len(string): 54 | pfx[string[i:i+j]] += 1 55 | return pfx 56 | 57 | 58 | def process_data(data, pairs, labels): 59 | items = [] 60 | labs = [] 61 | for (p1, p2), l in zip(pairs, labels): 62 | assert p1 in data 63 | assert p2 in data 64 | items += [p1, p2] 65 | labs += [(p2, l), (p1, l)] 66 | return items, labs 67 | 68 | 69 | class EqRecords(): 70 | 71 | def __init__(self, data, items, labels=None, sID=1): 72 | self.data = data 73 | self.items = items 74 | self.item_set = set(items) 75 | if labels: 76 | assert len(items) == len(labels) 77 | self.labels = labels 78 | self.sID = sID 79 | 80 | def _filter(self, f, b): 81 | new_items = [(idx, i) for idx, i in enumerate(self.items) if (f in self.data[i]) == b ] 82 | new_labels = None 83 | if self.labels: 84 | new_labels = [self.labels[idx] for idx, _ in new_items] 85 | 86 | new_items = [i for _, i in new_items] 87 | return EqRecords(self.data, new_items, new_labels, 2*self.sID + [0, 1][b]) 88 | 89 | 90 | def size(self): 91 | return len(self.items) 92 | 93 | def indexes(self): 94 | return self.items 95 | 96 | def label_distribution(self): 97 | assert self.labels is not None 98 | d = {0:0, 1:0} 99 | for (s, l) in self.labels: 100 | res = 0 if s in self.item_set else 1 101 | d[int(res == l)] += 1 102 | return d, self.sID 103 | 104 | def H(self): 105 | d, _ = self.label_distribution() 106 | S = d[1] + d[0] 107 | if S == 0: 108 | return -1 109 | return (float((d[1] - d[0])) / S) - 1.0 110 | 111 | def get_random_feature(self): 112 | while True: 113 | try: 114 | i = random.choice(self.items) 115 | return random.choice(self.data[i].keys()) 116 | except: 117 | print "No feature!!" 
118 | 119 | def split_on_feature(self, feature): 120 | L = self._filter(feature, False) 121 | R = self._filter(feature, True) 122 | 123 | dH = self.H() 124 | S = float(self.size()) 125 | dNew = (L.size() / S) * L.H() + (R.size() / S) * R.H() 126 | return dNew - dH, L, R 127 | 128 | def test_init(): 129 | dataEN = file("../data/pg42671.txt").read() 130 | 131 | features, train, test = split_data(dataEN, length=200) 132 | train_data, train_labels = train 133 | 134 | items, labs = process_data(features, train_data, train_labels) 135 | rec = EqRecords(features, items, labs) 136 | assert rec.labels 137 | 138 | assert rec.size() == len(train_data) * 2 139 | 140 | d, _ = rec.label_distribution() 141 | 142 | #print d 143 | #print rec.H() 144 | 145 | #for _ in range(100): 146 | # f = rec.get_random_feature() 147 | # dh, L, R = rec.split_on_feature(f) 148 | # print "%f\t\"%s\"\t%s" % (dh, f, (L.size(), R.size())) 149 | 150 | F = Forest(trees = 14, numfeatures = 100, levels=10) 151 | # R = Record(training_labels, training_records) 152 | F.train(rec, multicore=False) 153 | 154 | for t in F.root: 155 | print "-" * 30 156 | for xxx in traverse(t): 157 | terms, (labels, sID) = xxx 158 | s = " ".join(["%s\"%s\"" % (["-", "+"][b], term) for term, b in terms]) 159 | s += " (-%s, +%s)" % (labels[0], labels[1]) 160 | print s 161 | 162 | if __name__ == "__main__": 163 | # dataEN = file("../data/pg23428.txt").read() 164 | # dataFR = file("../data/pg5711.txt").read() 165 | pass -------------------------------------------------------------------------------- /code/forests.py: -------------------------------------------------------------------------------- 1 | ## This is an educational random forest implementation 2 | 3 | ## References: 4 | ## * A. Criminisi, J. Shotton, and E. Konukoglu, Decision Forests: 5 | ## A Unified Framework for Classification, Regression, Density Estimation, 6 | ## Manifold Learning and Semi-Supervised Learning. Foundations and Trends in 7 | ## Computer Graphics and Computer Vision. NOW Publishers. Vol.7: No 2-3, pp 81-227. 2012. 8 | ## 9 | ## * Jamie Shotton, Toby Sharp, Pushmeet Kohli, Sebastian Nowozin, John Winn, 10 | ## and Antonio Criminisi, Decision Jungles: Compact and Rich Models for 11 | ## Classification, in Proc. 
NIPS, 2013 12 | 13 | import random 14 | from collections import Counter 15 | import numpy as np 16 | import copy 17 | 18 | def split_data(data, label=0, length=50): 19 | 'Take a large text and divide it into chunks' 20 | strings = [data[i:i+length] for i in range(0, len(data) - length, length)] 21 | random.shuffle(strings) 22 | strings = [(s, label) for s in strings] 23 | 24 | test = strings[:len(strings) * 10 / 100] 25 | training = strings[len(strings) * 10 / 100:] 26 | return test, training 27 | 28 | 29 | def entropy(data): 30 | 'Computes the binary entropy of labelled data' 31 | v = Counter([b for _, b in data]).values() 32 | d = np.array(v) / float(sum(v)) 33 | return - sum(d * np.log(d)) 34 | 35 | 36 | def split(train, feat): 37 | 'Split data according to an infromation gain criterium' 38 | ## first compute the entropy 39 | Hx = entropy(train) 40 | if Hx < 0.000001: 41 | raise Exception("Entropy very low") 42 | L1 = [] 43 | L2 = [] 44 | for t in train: 45 | if feat in t[0]: 46 | L1 += [t] 47 | else: 48 | L2 += [t] 49 | 50 | E1 = entropy(L1) 51 | E2 = entropy(L2) 52 | L = float(len(train)) 53 | 54 | H = Hx - E1 * len(L1)/L - E2 * len(L2)/L 55 | return H, L1, L2, feat 56 | 57 | ## -------------------------- 58 | ## - The random forest code - 59 | ## -------------------------- 60 | 61 | 62 | def build_tree(train, features, levels=5, numfeatures=100): 63 | 'Train a decision tree based on labeled data and features' 64 | if levels == 0: 65 | C1 = Counter([b for _, b in train]) 66 | Leaf = (None, C1) 67 | return Leaf 68 | else: 69 | try: 70 | X = (split(train, F) for F in random.sample(features, numfeatures)) 71 | H, L1, L2, F = max(X) 72 | M1 = build_tree(L1, features, levels - 1, numfeatures) 73 | M2 = build_tree(L2, features, levels - 1, numfeatures) 74 | Branch = (F, M1, M2) 75 | return Branch 76 | except: 77 | return build_tree(train, features, levels=0) 78 | 79 | 80 | def classify(tree, item): 81 | 'Get a decision for an item using a tree' 82 | if len(tree) == 2: 83 | assert tree[0] is None 84 | return tree[1] 85 | else: 86 | fet, L1, L2 = tree 87 | if fet in item: 88 | return classify(L1, item) 89 | else: 90 | return classify(L2, item) 91 | 92 | ## ---------------------------- 93 | ## - The decision jungle code - 94 | ## ---------------------------- 95 | 96 | 97 | def build_jungle(train, features, levels=20, numfeatures=100): 98 | DAG = {0: copy.copy(train)} 99 | Candidate_sets = [0] 100 | next_ID = 0 101 | M = 20 102 | 103 | for level in range(levels): 104 | result_sets = [] 105 | for tdata_idx in Candidate_sets: 106 | tdata = DAG[tdata_idx] 107 | 108 | if entropy(tdata) == 0.0: 109 | next_ID += 1 110 | idx1 = next_ID 111 | result_sets += [idx1] 112 | DAG[idx1] = tdata + [] 113 | del DAG[tdata_idx][:] 114 | DAG[tdata_idx] += [True, idx1, idx1] 115 | continue 116 | 117 | X = (split(tdata, F) for F in random.sample(features, numfeatures)) 118 | H, L1, L2, F = max(X) 119 | 120 | # Branch = (F, M1, M2) 121 | next_ID += 1 122 | idx1 = next_ID 123 | DAG[idx1] = L1 124 | next_ID += 1 125 | idx2 = next_ID 126 | DAG[idx2] = L2 127 | 128 | result_sets += [idx1, idx2] 129 | del DAG[tdata_idx][:] 130 | DAG[tdata_idx] += [F, idx1, idx2] 131 | 132 | ## Now optimize the result sets here 133 | random.shuffle(result_sets) 134 | 135 | basic = result_sets[:M] 136 | for r in result_sets[M:]: 137 | maxv = None 138 | maxi = None 139 | for b in basic: 140 | L = float(len(DAG[r] + DAG[b])) 141 | e1 = len(DAG[r]) * entropy(DAG[r]) 142 | e2 = len(DAG[b]) * entropy(DAG[b]) 143 | newe = L * entropy(DAG[r] + 
DAG[b]) 144 | score = abs(e1 + e2 - newe) 145 | if maxv is None: 146 | maxv = score 147 | maxi = b 148 | continue 149 | if score < maxv: 150 | maxv = score 151 | maxi = b 152 | DAG[maxi] += DAG[r] 153 | del DAG[r] 154 | DAG[r] = DAG[maxi] 155 | 156 | Candidate_sets = basic 157 | 158 | for tdata_idx in Candidate_sets: 159 | tdata = DAG[tdata_idx] 160 | C1 = Counter([b for _, b in tdata]) 161 | del DAG[tdata_idx][:] 162 | DAG[tdata_idx] += [None, C1] 163 | 164 | return DAG 165 | 166 | 167 | def classify_jungle(DAG, item): 168 | branch = DAG[0] 169 | while branch[0] is not None: 170 | try: 171 | fet, L1, L2 = branch 172 | if fet == True or fet in item: 173 | branch = DAG[L1] 174 | else: 175 | branch = DAG[L2] 176 | except: 177 | print len(branch) 178 | raise 179 | return branch[1] 180 | 181 | ## ------------------------- 182 | ## - Sample classification - 183 | ## ------------------------- 184 | 185 | if __name__ == "__main__": 186 | # dataEN = file("../data/pg23428.txt").read() 187 | # dataFR = file("../data/pg5711.txt").read() 188 | dataEN = file("../data/pg110.txt").read() 189 | dataFR = file("../data/pg42671.txt").read() 190 | 191 | length = 200 192 | 193 | testEN, trainEN = split_data(dataEN, label=0, length=length) 194 | testFR, trainFR = split_data(dataFR, label=1, length=length) 195 | 196 | print "training: EN=%s FR=%s" % (len(trainEN), len(trainFR)) 197 | 198 | train = trainEN + trainFR 199 | random.shuffle(train) 200 | test = testEN + testFR 201 | random.shuffle(test) 202 | 203 | ## Now make a bunch of features 204 | ## A feature is in at least 10% of strings 205 | ## but also at most in 90% of strings 206 | 207 | sometrain = random.sample(train, 1000) 208 | features = set() 209 | while len(features) < 700: 210 | fragment, _ = random.choice(sometrain) 211 | l = int(round(random.expovariate(0.20))) 212 | b = random.randint(0, max(0, length - l)) 213 | feat = fragment[b:b+l] 214 | 215 | ## Test 216 | C = 0 217 | for st, _ in sometrain: 218 | if feat in st: 219 | C += 1 220 | 221 | f = float(C) / 1000 222 | if f > 0.01 and f < 0.99 and feat not in features: 223 | features.add(feat) 224 | 225 | features = list(features) 226 | 227 | manytrees = [] 228 | jungle = [] 229 | for i in range(10): 230 | print "Build tree %s" % i 231 | size = len(train) / 3 232 | training_sample = random.sample(train, size) 233 | 234 | tree = build_jungle(training_sample, features, numfeatures=100) 235 | jungle += [tree] 236 | 237 | tree = build_tree(training_sample, features, numfeatures=100) 238 | manytrees += [tree] 239 | 240 | testdata = test 241 | results_tree = Counter() 242 | results_jungle = Counter() 243 | for item, cat in testdata: 244 | # Trees 245 | c = Counter() 246 | for tree in manytrees: 247 | c += classify(tree, item) 248 | res = (max(c, key=lambda x: c[x]), cat) 249 | results_tree.update([res]) 250 | 251 | # Jungle 252 | c = Counter() 253 | for tree in jungle: 254 | c += classify_jungle(tree, item) 255 | res = (max(c, key=lambda x: c[x]), cat) 256 | results_jungle.update([res]) 257 | 258 | print 259 | print "Results Tree Jungle" 260 | print "True positives: %4d %4d" \ 261 | % (results_tree[(1, 1)], results_jungle[(1, 1)]) 262 | print "True negatives: %4d %4d" \ 263 | % (results_tree[(0, 0)], results_jungle[(0, 0)]) 264 | print "False positives: %4d %4d" \ 265 | % (results_tree[(1, 0)], results_jungle[(1, 0)]) 266 | print "False negatives: %4d %4d" \ 267 | % (results_tree[(0, 1)], results_jungle[(0, 1)]) 268 | -------------------------------------------------------------------------------- 
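As a sanity check on the information-gain criterion that `split()` in `forests.py` implements, an illustrative example (toy chunks, not repository data; Python 2, run from inside `code/`; the numbers follow directly from the `entropy()` definition):

from forests import entropy, split

# Four toy labelled chunks: label 0 for English-like text, 1 for French-like text.
train = [("the cat sat", 0), ("the dog ran", 0), ("le chat dort", 1), ("le chien court", 1)]

print entropy(train)                  # ln(2) ~ 0.693: the two labels are perfectly mixed
gain, left, right, feat = split(train, "the ")
print gain, len(left), len(right)     # ~0.693 2 2: "the " yields two pure halves, the maximum gain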
/code/malware.py: -------------------------------------------------------------------------------- 1 | # Standard library 2 | import unittest 3 | from collections import Counter, defaultdict 4 | from multiprocessing import Pool 5 | 6 | # External libraries 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | class Record: 12 | 'The aim of this class is to hide the representation of labels and' 13 | 'records from the random forest building and classification algorithm' 14 | 15 | def __init__(self, labels, features, names=None): 16 | 'Initialize with labels and records' 17 | self.labels = labels 18 | self.features = features 19 | if names is None: 20 | self.names = np.arange(len(labels), dtype=int) 21 | else: 22 | self.names = names 23 | 24 | assert len(self.labels) == self.features.shape[0] 25 | 26 | def size(self): 27 | 'The number of records' 28 | return self.features.shape[0] 29 | 30 | def indexes(self): 31 | return self.names 32 | 33 | def label_distribution(self): 34 | 'The count of labels' 35 | return Counter(self.labels) 36 | 37 | def H(self): 38 | 'Computes the binary entropy of labelled data' 39 | v = np.array(self.label_distribution().values()) 40 | d = np.array(v) / float(sum(v)) 41 | return - sum(d * np.log(d)) 42 | 43 | def get_random_feature(self): 44 | 'Select a random feature and a random threshold' 45 | rec_num, f_num = self.features.shape 46 | fID = np.random.random_integers(0, f_num-1) 47 | record = np.random.random_integers(0, rec_num-1) 48 | value = self.features[record, fID] 49 | return (fID, value) 50 | 51 | def split_on_feature(self, feature): 52 | 'Split the records according to a feature' 53 | 54 | # Define the indicator 55 | fID, value = feature 56 | indicator = (self.features[:, fID] <= value) 57 | 58 | ## Split into two recrod sets 59 | labelsLeft = self.labels[indicator] 60 | featuresLeft = self.features[indicator, :] 61 | namesLeft = None 62 | if self.names is not None: 63 | namesLeft = self.names[indicator] 64 | 65 | labelsRight = self.labels[~indicator] 66 | featuresRight = self.features[~indicator, :] 67 | namesRight = None 68 | if self.names is not None: 69 | namesRight = self.names[~indicator] 70 | 71 | L = Record(labelsLeft, featuresLeft, namesLeft) 72 | R = Record(labelsRight, featuresRight, namesRight) 73 | 74 | ## What is the info gain? 
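        # The gain computed below is the drop in label entropy from this split:
        #   dH = H(parent) - (S_L / S) * H(L) - (S_R / S) * H(R)
        # For example, a perfectly mixed parent (H = ln 2) split into two pure
        # children (each H = 0) gives the maximum possible gain dH = ln 2.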
75 | HL, SL = L.H(), L.size() 76 | HR, SR = R.H(), R.size() 77 | Ha, Sa = self.H(), float(self.size()) 78 | 79 | dH = Ha - HL * (SL/Sa) - HR * (SR/Sa) 80 | return dH, L, R 81 | 82 | ## -------------------------- 83 | ## - The random forest code - 84 | ## -------------------------- 85 | 86 | 87 | def build_tree(records, levels=5, numfeatures=100, cutoff=0.001): 88 | 'Train a decision tree based on labeled data and features' 89 | 90 | tree = [None] 91 | i = 0 92 | candidates = [(i, records, levels)] 93 | 94 | while candidates != []: 95 | idx, records, levels = candidates.pop() 96 | 97 | if levels == 0 or records.H() == 0.0: 98 | C1 = records.label_distribution() 99 | Leaf = (None, C1) 100 | tree[idx] = Leaf 101 | else: 102 | gain = 0.0 103 | data = None 104 | for _ in xrange(numfeatures): 105 | F = records.get_random_feature() 106 | dH, L, R = records.split_on_feature(F) 107 | 108 | if gain < dH and cutoff < dH: 109 | gain = dH 110 | data = (F, L, R) 111 | 112 | if data is None: 113 | candidates.append((idx, records, 0)) 114 | continue 115 | 116 | (F, L, R) = data 117 | # print F, gain 118 | 119 | candidates.append((i+1, L, levels - 1)) 120 | candidates.append((i+2, R, levels - 1)) 121 | 122 | tree[idx] = (F, i+1, i+2) 123 | i += 2 124 | tree += [None, None] 125 | 126 | return tree 127 | 128 | def parallel_build_tree(params): 129 | (records, levels, numfeatures, cutoff, num_trees) = params 130 | R = records # Record(labels, features) 131 | root = [] 132 | 133 | for t in xrange(num_trees): 134 | print "Make tree %s" % t 135 | T = build_tree(R, levels, numfeatures, cutoff) 136 | #print T 137 | root += [T] 138 | return root 139 | 140 | 141 | class Forest: 142 | def __init__(self, trees=10, levels=5, numfeatures=100, cutoff=0.001): 143 | self.root = None 144 | 145 | self.trees = trees 146 | self.levels = levels 147 | self.numfeatures = numfeatures 148 | self.cutoff = cutoff 149 | 150 | def train(self, records, workers=7, multicore=True): 151 | p = Pool(workers) 152 | try: 153 | per_worker = int(1 + (float(self.trees) / workers)) 154 | params = [(records, self.levels, self.numfeatures, self.cutoff, per_worker)] * workers 155 | if multicore: 156 | x = p.map(parallel_build_tree, params) 157 | else: 158 | x = map(parallel_build_tree, params) 159 | forest = sum(x,[])[:self.trees] 160 | self.root = forest 161 | finally: 162 | p.close() 163 | p.join() 164 | 165 | 166 | def classify(self, features, show=False): 167 | recs, _ = features.shape 168 | result_shape = (features.shape[0], len(self.root)) 169 | scores = np.zeros(result_shape) 170 | print scores.shape 171 | R = Record(np.arange(recs, dtype=int), features) 172 | 173 | for i, T in enumerate(self.root): 174 | for idxs, result in classify(T, R): 175 | for idx in idxs.indexes(): 176 | scores[idx, i] = float(result[0]) / sum(result.values()) 177 | 178 | 179 | if show: 180 | plt.cla() 181 | plt.clf() 182 | plt.close() 183 | 184 | plt.imshow(scores, cmap=plt.cm.gray) 185 | plt.title('Scores matrix') 186 | plt.savefig(r"../scratch/tree_scores.png", bbox_inches='tight') 187 | 188 | return scores 189 | 190 | 191 | def classify(tree, records): 192 | 'Get a decision for an item using a tree' 193 | 194 | candidates = [(0, records)] 195 | 196 | while candidates != []: 197 | idx, irecords = candidates.pop() 198 | node = tree[idx] 199 | 200 | if len(node) == 2: 201 | assert node[0] is None 202 | yield (irecords, node[1]) 203 | else: 204 | F, LT, RT = tree[idx] 205 | _, L, R = irecords.split_on_feature(F) 206 | 207 | ## Check we did not drop any records 208 | 
assert irecords.size() == (L.size() + R.size()) 209 | 210 | if L.size() > 0: 211 | candidates.append((LT, L)) 212 | if R.size() > 0: 213 | candidates.append((RT, R)) 214 | 215 | def traverse(tree): 216 | candidates = [(0, [])] 217 | 218 | while candidates != []: 219 | idx, path = candidates.pop() 220 | node = tree[idx] 221 | 222 | if len(node) == 2: 223 | yield path, node[1] 224 | else: 225 | F, LT, RT = tree[idx] 226 | 227 | candidates.append((LT, path + [(F, False)])) 228 | candidates.append((RT, path + [(F, True)])) 229 | 230 | 231 | def get_features(tree): 232 | for x in tree: 233 | if len(x) == 3: 234 | yield x[0] 235 | 236 | 237 | def ROC_data(scores, labels, names, name="STD"): 238 | P = len(labels[labels==1]) 239 | N = len(labels[labels==0]) 240 | 241 | ## Make an ROC curve 242 | final_scores = np.mean(scores, axis=1) 243 | TP = [0.0] 244 | FP = [0.0] 245 | # TP = [] 246 | # FP = [] 247 | ACC = [] 248 | for sc in sorted(set(final_scores)): 249 | L = labels[final_scores <= sc] 250 | Pi = len(L[L==1]) 251 | Ni = len(L[L==0]) 252 | tp = float(Pi) / P 253 | fp = float(Ni) / N 254 | 255 | if tp == 0.0 or tp == 1.0: 256 | continue 257 | 258 | while len(TP) > 0 and (tp == TP[-1]): 259 | del TP[-1] 260 | del FP[-1] 261 | 262 | TP += [tp] 263 | FP += [fp] 264 | ACC += [(float(Pi) + (N - float(Ni)))/(P + N)] 265 | 266 | TP += [1.0] 267 | FP += [1.0] 268 | 269 | fp_regular = np.arange(0.0, 1.0, 0.001) 270 | tp_regular = np.zeros(len(fp_regular)) 271 | 272 | j = 0 273 | for i, fp_i in enumerate(fp_regular): 274 | 275 | while not (FP[j] <= fp_i < FP[j+1]): 276 | j += 1 277 | 278 | #print i, fp_i, FP[j], TP[j], FP[j+1], TP[j+1] 279 | tp_i = (1 - ((FP[j+1] - fp_i) / (FP[j+1] - FP[j]))) * (TP[j+1] - TP[j]) + TP[j] 280 | tp_regular[i] = tp_i 281 | #print tp_i 282 | 283 | return max(ACC), tp_regular, fp_regular 284 | 285 | def graph_ROC(max_ACC, TP, FP, name="STD"): 286 | aTP = np.vstack(TP) 287 | n = len(TP) 288 | mean_TP = np.mean(aTP, axis=0) 289 | stderr_TP = np.std(aTP, axis=0) / (n ** 0.5) 290 | var_TP = np.var(aTP, axis=0) 291 | max_TP = mean_TP + 3 * stderr_TP 292 | min_TP = mean_TP - 3 * stderr_TP 293 | 294 | # sTP = sum(TP) / len(TP) 295 | sFP = FP[0] 296 | print len(sFP), len(mean_TP), len(TP[0]) 297 | smax_ACC = np.mean(max_ACC) 298 | 299 | plt.cla() 300 | plt.clf() 301 | plt.close() 302 | 303 | plt.plot(sFP, mean_TP) 304 | plt.fill_between(sFP, min_TP, max_TP, color='black', alpha=0.2) 305 | plt.xlim((0,0.1)) 306 | plt.ylim((0,1)) 307 | plt.title('ROC Curve (accuracy=%.3f)' % smax_ACC) 308 | plt.xlabel('False Positive Rate') 309 | plt.ylabel('True Positive Rate') 310 | plt.savefig(r"../scratch/"+name+"_ROC_curve.pdf", bbox_inches='tight') 311 | 312 | # Write the data to the file 313 | f = file(r"../scratch/"+name+"_ROC_curve.csv", "w") 314 | f.write("FalsePositive,TruePositive,std_err, var, n\n") 315 | for fp, tp, err, var in zip(sFP, mean_TP, stderr_TP, var_TP): 316 | f.write("%s, %s, %s, %s, %s\n" % (fp, tp, err, var, n)) 317 | f.close() 318 | 319 | 320 | def ROC(scores, labels, names, name="STD"): 321 | 322 | max_ACC, TP, FP = ROC_data(scores, labels, names, name) 323 | graph_ROC([max_ACC], [TP], [FP], name) 324 | 325 | #P = len(labels[labels==1]) 326 | #N = len(labels[labels==0]) 327 | 328 | ## Save raw results in a file: 329 | #fr = file(r"../scratch/"+name+"_results.txt","w") 330 | #for s, l, n in sorted(zip(scores,labels, names), key=lambda x: np.mean(x[0])): 331 | # fr.write("%.4f\t%s\t%s\n" % (np.mean(s), int(l), n)) 332 | #fr.close() 333 | 334 | ## Make an ROC curve 335 | 336 
| # acc_max = "%.2f" % max(ACC) 337 | 338 | #plt.cla() 339 | #plt.clf() 340 | #plt.close() 341 | 342 | #plt.plot(FP, TP) 343 | #plt.xlim((0,0.1)) 344 | #plt.ylim((0,1)) 345 | #plt.title('ROC Curve (accuracy=%.2f)' % max_ACC) 346 | #plt.xlabel('False Positive Rate') 347 | #plt.ylabel('True Positive Rate') 348 | #plt.savefig(r"../scratch/"+name+"_ROC_curve.png", bbox_inches='tight') 349 | 350 | #f = file(r"../scratch/"+name+"_ROC_curve.csv", "w") 351 | #f.write("FalsePositive,TruePositive,Accuracy\n") 352 | #for fp, tp, acc in zip(FP,TP, ACC): 353 | # f.write("%s,%s,%s\n" % (fp, tp, acc)) 354 | #f.close() 355 | 356 | 357 | 358 | ## Read the csv files 359 | def read_data(labelsname, distancename): 360 | ## Extract labels 361 | rawlabels = np.genfromtxt(labelsname, delimiter=',', dtype=None) 362 | labelmap = {} 363 | row_len = 0 364 | for row in rawlabels: 365 | row_len = max(row_len, len(row)-1) 366 | name = row[0] 367 | labelmap[name] = list(row)[1:] 368 | 369 | ## Extract distances 370 | rawdistances = np.genfromtxt(distancename, delimiter=',', dtype=None) 371 | names = rawdistances[0][1:] 372 | distances = np.array(rawdistances[1:, 1:], dtype=float) 373 | labels = np.zeros((len(names), row_len)) 374 | 375 | for i, name in enumerate(names): 376 | labels[i, 0:(len(row))] = labelmap[name] 377 | 378 | del labelmap 379 | return distances, labels, names 380 | 381 | 382 | def visualizedistances(data, figname=None): 383 | D, L, N = data 384 | sorted_indexes = np.argsort(L[:,0]) 385 | 386 | D2 = D[sorted_indexes, :] 387 | D2 = D2[:, sorted_indexes] 388 | 389 | plt.cla() 390 | plt.clf() 391 | plt.close() 392 | 393 | plt.imshow(D2, cmap=plt.cm.gray) 394 | plt.title('Distance matrix') 395 | plt.savefig(figname, bbox_inches='tight') 396 | 397 | 398 | def selectsubsets(data, features=200, training=400, testing=100, fraction_negative = 0.5): 399 | D, L, N = data 400 | 401 | ## First identify the indexes of positives and negatives 402 | negatives = np.where(L[:,0] == 0)[0] 403 | positives = np.where(L[:,0] == 1)[0] 404 | 405 | Neg_training = 2 * int(training * fraction_negative) + features 406 | Pos_training = 2 * int(training * (1-fraction_negative)) + features 407 | 408 | np.random.shuffle(negatives) 409 | np.random.shuffle(positives) 410 | 411 | feature_labels = ["N%s" % n for n in negatives[:features]] + ["P%s" % p for p in positives[:features]] + ["C"] 412 | feature_set = np.hstack((negatives[:features], positives[:features])) 413 | training_set = np.hstack((negatives[features:Neg_training], positives[features:Neg_training])) 414 | 415 | test_size = testing # min(len(negatives[Neg_training:]), len(positives[Pos_training:])) 416 | 417 | test_set = np.hstack((negatives[Neg_training:Neg_training+test_size], positives[Pos_training:Pos_training+test_size])) 418 | 419 | assert len(test_set) == 2 * test_size 420 | print "Feature size: %s Training size: %s Test size: %s" % (len(feature_set), len(training_set), len(test_set)) 421 | 422 | training_records = D[training_set, :] 423 | training_records = training_records[::, feature_set] 424 | training_records = np.hstack((training_records, L[training_set, 1:])) 425 | training_labels = L[training_set, 0] 426 | 427 | #training_labels = np.zeros(len(training_set), dtype=int) 428 | #training_labels[len(training_set)/2:] = np.ones(len(training_set)/2, dtype=int) 429 | 430 | print training_records.shape, training_labels.shape 431 | 432 | test_records = D[test_set, :] 433 | test_records = test_records[::, feature_set] 434 | test_records = np.hstack((test_records, 
L[test_set, 1:])) 435 | test_labels = L[test_set, 0] 436 | 437 | #test_labels = np.zeros(len(test_set), dtype=int) 438 | #test_labels[len(test_set)/2:] = np.ones(len(test_set)/2, dtype=int) 439 | 440 | training_data = (training_set, training_records, training_labels) 441 | test_data = (test_set, test_records, test_labels) 442 | 443 | return feature_labels, feature_set, training_data, test_data 444 | 445 | 446 | class TestSeq(unittest.TestCase): 447 | 448 | def setUp(self): 449 | self.repeats = 3 # 30 450 | self.trees = 10 # 400 451 | self.features = 100 452 | self.training = 300 453 | self.testing = 300 454 | self.bias = 0.5 455 | self.proposed_features = 30 456 | 457 | def test_visualize(self): 458 | data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') 459 | 460 | D, L, N = data 461 | assert D.shape == (2000, 2000) 462 | assert len(L) == 2000 463 | assert len(L[0]) == 1 464 | assert len(N) == 2000 465 | 466 | visualizedistances(data, '../scratch/distances.png') 467 | 468 | def feature_importance_test(self, data, D, L, N, name): 469 | max_ACC, TP, FP = [], [], [] 470 | 471 | L = defaultdict(list) 472 | 473 | for _ in range(30): 474 | feature_labels, feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias) 475 | (training_set, training_records, training_labels) = training_data 476 | (test_set, test_records, test_labels) = test_data 477 | 478 | F = Forest(trees = self.trees, numfeatures = self.proposed_features) 479 | R = Record(training_labels, training_records) 480 | F.train(R) 481 | 482 | features = [] 483 | for t in F.root: 484 | features += [X for X, _ in list(get_features(t))] 485 | 486 | 487 | c = Counter(features) 488 | items = sorted(c.items(), key=lambda x: x[1], reverse=True) 489 | for l, v in items: 490 | L[feature_labels[l]].append(v) 491 | #V = [v for _, v in items] 492 | #L = [l for l, _ in items] 493 | 494 | return L 495 | 496 | # return max_ACC, TP, FP 497 | 498 | def test_features(self): 499 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 500 | 501 | D, L, N = data 502 | assert len(L[0]) == 2 503 | 504 | self.bias = 0.5 505 | self.trees = 100 506 | # self.proposed_features = 75 507 | L = self.feature_importance_test(data, D, L, N, name="FEATURES") 508 | 509 | freq = [(l, np.mean(vs)) for l, vs in L.items()] 510 | freq = sorted(freq, key=lambda x:-x[1]) 511 | 512 | labels = [l for l, _ in freq] 513 | middle = [v for _, v in freq] 514 | top = [max(L[l]) for l, _ in freq] 515 | bottom = [min(L[l]) for l, _ in freq] 516 | 517 | #print V,L 518 | 519 | plt.cla() 520 | plt.clf() 521 | plt.close() 522 | 523 | zero_elem = labels.index("C") 524 | 525 | ticks = range(len(middle)) 526 | plt.plot(ticks, middle, "b.") 527 | plt.fill_between(ticks, bottom, top, color='black', alpha=0.2) 528 | #plt.fill_between(sFP, min_TP, max_TP, color='black', alpha=0.2) 529 | plt.xlim((-25,len(middle) + 25)) 530 | plt.ylim((0,max(top)+5)) 531 | #plt.title('ROC Curve (accuracy=%.3f)' % smax_ACC) 532 | 533 | plt.plot([zero_elem], [middle[zero_elem]], 'ro') 534 | 535 | plt.xlabel('Feature') 536 | plt.ylabel('Prevalence') 537 | print "Saving ..." 
538 | #plt.show() 539 | plt.savefig(r"../scratch/Features_BAR.pdf", bbox_inches='tight') 540 | 541 | 542 | 543 | def classifier_test(self, data, D, L, N, name): 544 | repeats = self.repeats 545 | max_ACC, TP, FP = [], [], [] 546 | for _ in xrange(repeats): 547 | _, feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias) 548 | (training_set, training_records, training_labels) = training_data 549 | (test_set, test_records, test_labels) = test_data 550 | 551 | R = Record(training_labels, training_records, training_set) 552 | T = build_tree(R) 553 | 554 | for (r, d) in classify(T, R): 555 | assert r.label_distribution() == d 556 | 557 | F = Forest(trees = self.trees, numfeatures = self.proposed_features) 558 | R = Record(training_labels, training_records) 559 | F.train(R) 560 | scores = F.classify(test_records) 561 | 562 | max_ACCi, TPi, FPi = ROC_data(scores, test_labels, N[test_set], name) 563 | 564 | max_ACC += [ max_ACCi ] 565 | TP += [ TPi ] 566 | FP += [ FPi ] 567 | 568 | graph_ROC(max_ACC, TP, FP, name) 569 | return max_ACC, TP, FP 570 | 571 | 572 | def test_malwareanalysis(self): 573 | data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') 574 | D, L, N = data 575 | self.classifier_test(data, D, L, N, name="MATRIX_ONLY") 576 | 577 | def test_malwareanalysis_unbalanced(self): 578 | data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') 579 | D, L, N = data 580 | 581 | self.bias = 0.9 582 | self.classifier_test(data, D, L, N, name="MATRIX_ONLY_BIASED") 583 | 584 | def test_malwareanalysis_compress(self): 585 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 586 | 587 | D, L, N = data 588 | assert len(L[0]) == 2 589 | 590 | self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS") 591 | 592 | def test_malwareanalysis_compress_unbalanced(self): 593 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 594 | 595 | D, L, N = data 596 | assert len(L[0]) == 2 597 | 598 | self.bias = 0.9 599 | self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS_BIASED") 600 | 601 | 602 | def test_compress_only(self): 603 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 604 | 605 | D, L, N = data 606 | 607 | self.features = 0 608 | self.classifier_test(data, D, L, N, name="COMPRESS_ONLY") 609 | 610 | import sys 611 | if __name__ == '__main__': 612 | # print sys.argv 613 | if len(sys.argv) > 1: 614 | print "Executing test for " + sys.argv[1] 615 | TestSeq("test_"+sys.argv[1]).run() 616 | else: 617 | unittest.main() -------------------------------------------------------------------------------- /code/symbols.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.cluster import SpectralClustering, KMeans 7 | 8 | # Always make a fuss in case of numerical error 9 | np.seterr(all="raise") 10 | 11 | 12 | ## Helper functions 13 | ## Estimate params 14 | def estimate_norm(datas): 15 | if datas.shape[0] < 2: 16 | return None, None, 0.0 17 | 18 | mp = np.mean(datas, axis=0) 19 | sp = np.cov(datas.transpose()) 20 | 21 | sign, logdet = np.linalg.slogdet(sp) 22 | if np.isnan(logdet) or np.isinf(logdet): 23 | return mp, sp, 0.0 24 | 25 | ent = sign * logdet 26 | return mp, sp, ent 27 | 28 | 29 | def split(data, feature, old_ent): 30 | len_data = float(data.shape[0]) 31 | axis, value = feature 32 | L1 = 
data[data[:, axis] <= value] 33 | L2 = data[data[:, axis] > value] 34 | _, _, entL1 = estimate_norm(L1) 35 | _, _, entL2 = estimate_norm(L2) 36 | 37 | ent = (L1.shape[0] / len_data) * entL1 + \ 38 | (L2.shape[0] / len_data) * entL2 39 | D_ent = old_ent - ent 40 | return D_ent, L1, L2, feature 41 | 42 | 43 | ## Constants 44 | PROC, BRANCH, LEAF = 0, 1, 2 45 | 46 | ## Train a clustering tree 47 | def make_tree(datas, feature): 48 | tree = [(PROC, datas)] 49 | process = [0] 50 | 51 | while len(process) > 0: 52 | next_item = process.pop(0) 53 | xtype, dat = tree[next_item] 54 | if dat.shape[0] < 50: 55 | tree[next_item] = (LEAF, dat) 56 | continue 57 | 58 | assert xtype == PROC 59 | _, _, old_ent = estimate_norm(dat) 60 | 61 | sample_features = random.sample(feature, 100) 62 | lot = [split(dat, f, old_ent) for f in sample_features] 63 | ret = max(lot, key=lambda x: x[0]) 64 | D_ent, L1, L2, feat = ret 65 | 66 | if D_ent < 1.0: 67 | tree[next_item] = (LEAF, dat) 68 | continue 69 | 70 | newID_L1 = len(tree) 71 | tree += [(PROC, L1)] 72 | newID_L2 = newID_L1 + 1 73 | tree += [(PROC, L2)] 74 | process += [newID_L1, newID_L2] 75 | 76 | tree[next_item] = (BRANCH, feat, L1, L2) 77 | return tree 78 | 79 | 80 | # Set some undelying structure here 81 | points = [(0, 2), 82 | (1, 1), 83 | (2, 0), 84 | (2, 2), 85 | (3, 1)] 86 | 87 | ## The spread of the samples around the points 88 | var = 0.03 89 | cov = np.array([[var, 0], [0, var]]) 90 | 91 | # Generate some synthetic data around the points 92 | datas = None 93 | for p in points: 94 | samples = np.random.multivariate_normal(p, cov, 100) 95 | if datas is None: 96 | datas = samples 97 | else: 98 | datas = np.concatenate([datas, samples]) 99 | 100 | # Add a splat across all data points to simulate noise 101 | mu, sig, _ = estimate_norm(datas) 102 | splat = np.random.multivariate_normal(mu, sig, 500) 103 | datas = np.concatenate([datas, splat]) 104 | 105 | # Make up some features we could split on 106 | feature = [] 107 | len_datas = datas.shape[0] 108 | for _ in range(1000): 109 | i = random.randint(0, len_datas-1) 110 | j = random.choice([0, 1]) 111 | feature += [(j, datas[i, j])] 112 | 113 | 114 | def tokey(x): 115 | return tuple(x) 116 | 117 | ## Make a profile for each data point 118 | profiles = {} 119 | keys = [] 120 | for i in range(datas.shape[0]): 121 | k = tokey(datas[i, :]) 122 | keys += [k] 123 | profiles[k] = [] 124 | 125 | ## Train a number of clustering trees 126 | NUM_TREES = 200 127 | for j in range(NUM_TREES): 128 | print "Training tree %s" % j 129 | t = make_tree(datas, feature) 130 | cluster_id = 0 131 | for item in t: 132 | if item[0] == LEAF: 133 | dat = item[1] 134 | for i in range(dat.shape[0]): 135 | k = tokey(dat[i, :]) 136 | profiles[k] += [cluster_id] 137 | cluster_id += 1 138 | 139 | ## Build a affinity matrix from co-occupancy of clusters 140 | for p in profiles: 141 | profiles[p] = np.array(profiles[p], dtype=int) 142 | 143 | covar = np.zeros((len(keys), len(keys))) 144 | for ik1, k1 in enumerate(keys): 145 | for ik2, k2 in enumerate(keys): 146 | D = float(np.sum(profiles[k1] == profiles[k2])) / NUM_TREES 147 | covar[ik1, ik2] = D 148 | 149 | ## Perform clustering on the affinity matrix 150 | clustering = SpectralClustering(affinity="precomputed", n_clusters=5) 151 | X = clustering.fit(covar) 152 | 153 | # Example of using K-means 154 | # clustering = KMeans(n_clusters=5) 155 | # X = clustering.fit(covar) 156 | 157 | ## Make a picture 158 | colors = ["r", "g", "b", "k", "c", "y", "m"] 159 | cols = [colors[i % 7] for i in 
X.labels_] 160 | plt.scatter(datas[:, 0], datas[:, 1], c=cols) 161 | plt.show() 162 | --------------------------------------------------------------------------------
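For intuition, a small illustration (made-up leaf assignments, not output of the script above) of the co-occupancy affinity that `code/symbols.py` builds before spectral clustering:

import numpy as np

# One leaf id per tree for each of three points, across four clustering trees.
profiles = {
    "a": np.array([0, 2, 1, 3]),
    "b": np.array([0, 2, 1, 0]),
    "c": np.array([5, 4, 7, 3]),
}
keys = list(profiles)
affinity = np.zeros((len(keys), len(keys)))
for i, k1 in enumerate(keys):
    for j, k2 in enumerate(keys):
        # Fraction of trees in which the two points land in the same leaf.
        affinity[i, j] = np.mean(profiles[k1] == profiles[k2])
# a and b share a leaf in 3 of 4 trees (0.75), a and c in 1 of 4 (0.25),
# b and c never (0.0); the diagonal is 1.0. SpectralClustering with
# affinity="precomputed" then clusters the points from this matrix, as above.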