├── LICENSE ├── README.md ├── code ├── equality.py ├── forests.py ├── malware.py └── symbols.py └── data ├── pg23428.txt └── pg5711.txt /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 George Danezis (g.danezis@ucl.ac.uk) 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | trees 2 | ===== 3 | 4 | A quick educational implementation of a random forest classifier and a decision jungle classifier. 5 | 6 | References: 7 | 8 | * A. Criminisi, J. Shotton, and E. Konukoglu, Decision Forests: 9 | A Unified Framework for Classification, Regression, Density Estimation, 10 | Manifold Learning and Semi-Supervised Learning. Foundations and Trends in 11 | Computer Graphics and Computer Vision. NOW Publishers. Vol.7: No 2-3, pp 81-227. 2012. 12 | 13 | * Jamie Shotton, Toby Sharp, Pushmeet Kohli, Sebastian Nowozin, John Winn, 14 | and Antonio Criminisi, Decision Jungles: Compact and Rich Models for 15 | Classification, in Proc. NIPS, 2013. 
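A minimal usage sketch (not part of the original README; it assumes Python 2, which this code targets, a working directory of `code/`, and a few hand-picked substring features — the `__main__` block of `forests.py` instead samples features automatically from the training chunks):

```python
from forests import split_data, build_tree, classify

# Chunk two texts into fixed-length strings labelled 0 (English) and 1 (French).
test_en, train_en = split_data(open("../data/pg23428.txt").read(), label=0, length=200)
test_fr, train_fr = split_data(open("../data/pg5711.txt").read(), label=1, length=200)
train = train_en + train_fr

features = ["the ", " and ", " les ", " de ", "tion"]   # illustrative substring features
tree = build_tree(train, features, levels=5, numfeatures=3)

chunk, true_label = test_en[0]
votes = classify(tree, chunk)                  # Counter of label counts at the leaf reached
predicted = max(votes, key=lambda k: votes[k])
```

A forest is simply a list of such trees trained on random subsamples of the data, with the per-tree vote Counters summed before taking the majority label, as in the `__main__` section of `forests.py`.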
16 | -------------------------------------------------------------------------------- /code/equality.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter, defaultdict 3 | import numpy as np 4 | import copy 5 | import traceback 6 | 7 | from malware import Forest, traverse 8 | 9 | def split_data(data, length=50): 10 | 'Take a large text and divide it into chunks' 11 | strings = [data[i:i+length] for i in range(0, len(data) - length, length)] 12 | 13 | string_data = dict([(s, prefixes(s)) for s in strings]) 14 | all_features = defaultdict(int) 15 | for k,v in string_data.iteritems(): 16 | for vi in v.keys(): 17 | all_features[vi] += 1 18 | 19 | Sz = len(string_data) 20 | print "Samples length = ", Sz 21 | MINS = 200 22 | new_features = dict([(f,v) for f,v in all_features.iteritems() if MINS < v < Sz - MINS ]) 23 | #for f, v in new_features.iteritems(): 24 | # print v, f 25 | 26 | for s in string_data: 27 | for vi in string_data[s].keys(): 28 | if vi not in new_features: 29 | del string_data[s][vi] 30 | 31 | spairs = [] 32 | for i in range(int(len(strings) / 2)): 33 | spairs += [(strings[2*i], strings[2*i+1])] 34 | 35 | random.shuffle(spairs) 36 | train0 = spairs[:len(spairs) / 2] 37 | train1 = [(t0, random.choice(train0)[1]) for t0, _ in train0] 38 | train_labels = [0] * len(train0) + [1] * len(train1) 39 | train = (train0 + train1, train_labels) 40 | 41 | test0 = spairs[len(spairs) / 2:] 42 | test1 = [(t0, random.choice(test0)[1]) for t0, _ in test0] 43 | test_labels = [0] * len(test0) + [1] * len(test1) 44 | test = (test0 + test1, test_labels) 45 | 46 | return string_data, train, test 47 | 48 | 49 | def prefixes(string, min_len=3, max_len=10): 50 | pfx = defaultdict(int) 51 | for i in range(len(string)): 52 | for j in range(min_len, max_len): 53 | if i+j <= len(string): 54 | pfx[string[i:i+j]] += 1 55 | return pfx 56 | 57 | 58 | def process_data(data, pairs, labels): 59 | items = [] 60 | labs = [] 61 | for (p1, p2), l in zip(pairs, labels): 62 | assert p1 in data 63 | assert p2 in data 64 | items += [p1, p2] 65 | labs += [(p2, l), (p1, l)] 66 | return items, labs 67 | 68 | 69 | class EqRecords(): 70 | 71 | def __init__(self, data, items, labels=None, sID=1): 72 | self.data = data 73 | self.items = items 74 | self.item_set = set(items) 75 | if labels: 76 | assert len(items) == len(labels) 77 | self.labels = labels 78 | self.sID = sID 79 | 80 | def _filter(self, f, b): 81 | new_items = [(idx, i) for idx, i in enumerate(self.items) if (f in self.data[i]) == b ] 82 | new_labels = None 83 | if self.labels: 84 | new_labels = [self.labels[idx] for idx, _ in new_items] 85 | 86 | new_items = [i for _, i in new_items] 87 | return EqRecords(self.data, new_items, new_labels, 2*self.sID + [0, 1][b]) 88 | 89 | 90 | def size(self): 91 | return len(self.items) 92 | 93 | def indexes(self): 94 | return self.items 95 | 96 | def label_distribution(self): 97 | assert self.labels is not None 98 | d = {0:0, 1:0} 99 | for (s, l) in self.labels: 100 | res = 0 if s in self.item_set else 1 101 | d[int(res == l)] += 1 102 | return d, self.sID 103 | 104 | def H(self): 105 | d, _ = self.label_distribution() 106 | S = d[1] + d[0] 107 | if S == 0: 108 | return -1 109 | return (float((d[1] - d[0])) / S) - 1.0 110 | 111 | def get_random_feature(self): 112 | while True: 113 | try: 114 | i = random.choice(self.items) 115 | return random.choice(self.data[i].keys()) 116 | except: 117 | print "No feature!!" 
118 | 119 | def split_on_feature(self, feature): 120 | L = self._filter(feature, False) 121 | R = self._filter(feature, True) 122 | 123 | dH = self.H() 124 | S = float(self.size()) 125 | dNew = (L.size() / S) * L.H() + (R.size() / S) * R.H() 126 | return dNew - dH, L, R 127 | 128 | def test_init(): 129 | dataEN = file("../data/pg42671.txt").read() 130 | 131 | features, train, test = split_data(dataEN, length=200) 132 | train_data, train_labels = train 133 | 134 | items, labs = process_data(features, train_data, train_labels) 135 | rec = EqRecords(features, items, labs) 136 | assert rec.labels 137 | 138 | assert rec.size() == len(train_data) * 2 139 | 140 | d, _ = rec.label_distribution() 141 | 142 | #print d 143 | #print rec.H() 144 | 145 | #for _ in range(100): 146 | # f = rec.get_random_feature() 147 | # dh, L, R = rec.split_on_feature(f) 148 | # print "%f\t\"%s\"\t%s" % (dh, f, (L.size(), R.size())) 149 | 150 | F = Forest(trees = 14, numfeatures = 100, levels=10) 151 | # R = Record(training_labels, training_records) 152 | F.train(rec, multicore=False) 153 | 154 | for t in F.root: 155 | print "-" * 30 156 | for xxx in traverse(t): 157 | terms, (labels, sID) = xxx 158 | s = " ".join(["%s\"%s\"" % (["-", "+"][b], term) for term, b in terms]) 159 | s += " (-%s, +%s)" % (labels[0], labels[1]) 160 | print s 161 | 162 | if __name__ == "__main__": 163 | # dataEN = file("../data/pg23428.txt").read() 164 | # dataFR = file("../data/pg5711.txt").read() 165 | pass -------------------------------------------------------------------------------- /code/forests.py: -------------------------------------------------------------------------------- 1 | ## This is an educational random forest implementation 2 | 3 | ## References: 4 | ## * A. Criminisi, J. Shotton, and E. Konukoglu, Decision Forests: 5 | ## A Unified Framework for Classification, Regression, Density Estimation, 6 | ## Manifold Learning and Semi-Supervised Learning. Foundations and Trends in 7 | ## Computer Graphics and Computer Vision. NOW Publishers. Vol.7: No 2-3, pp 81-227. 2012. 8 | ## 9 | ## * Jamie Shotton, Toby Sharp, Pushmeet Kohli, Sebastian Nowozin, John Winn, 10 | ## and Antonio Criminisi, Decision Jungles: Compact and Rich Models for 11 | ## Classification, in Proc. 
NIPS, 2013 12 | 13 | import random 14 | from collections import Counter 15 | import numpy as np 16 | import copy 17 | 18 | def split_data(data, label=0, length=50): 19 | 'Take a large text and divide it into chunks' 20 | strings = [data[i:i+length] for i in range(0, len(data) - length, length)] 21 | random.shuffle(strings) 22 | strings = [(s, label) for s in strings] 23 | 24 | test = strings[:len(strings) * 10 / 100] 25 | training = strings[len(strings) * 10 / 100:] 26 | return test, training 27 | 28 | 29 | def entropy(data): 30 | 'Computes the binary entropy of labelled data' 31 | v = Counter([b for _, b in data]).values() 32 | d = np.array(v) / float(sum(v)) 33 | return - sum(d * np.log(d)) 34 | 35 | 36 | def split(train, feat): 37 | 'Split data according to an infromation gain criterium' 38 | ## first compute the entropy 39 | Hx = entropy(train) 40 | if Hx < 0.000001: 41 | raise Exception("Entropy very low") 42 | L1 = [] 43 | L2 = [] 44 | for t in train: 45 | if feat in t[0]: 46 | L1 += [t] 47 | else: 48 | L2 += [t] 49 | 50 | E1 = entropy(L1) 51 | E2 = entropy(L2) 52 | L = float(len(train)) 53 | 54 | H = Hx - E1 * len(L1)/L - E2 * len(L2)/L 55 | return H, L1, L2, feat 56 | 57 | ## -------------------------- 58 | ## - The random forest code - 59 | ## -------------------------- 60 | 61 | 62 | def build_tree(train, features, levels=5, numfeatures=100): 63 | 'Train a decision tree based on labeled data and features' 64 | if levels == 0: 65 | C1 = Counter([b for _, b in train]) 66 | Leaf = (None, C1) 67 | return Leaf 68 | else: 69 | try: 70 | X = (split(train, F) for F in random.sample(features, numfeatures)) 71 | H, L1, L2, F = max(X) 72 | M1 = build_tree(L1, features, levels - 1, numfeatures) 73 | M2 = build_tree(L2, features, levels - 1, numfeatures) 74 | Branch = (F, M1, M2) 75 | return Branch 76 | except: 77 | return build_tree(train, features, levels=0) 78 | 79 | 80 | def classify(tree, item): 81 | 'Get a decision for an item using a tree' 82 | if len(tree) == 2: 83 | assert tree[0] is None 84 | return tree[1] 85 | else: 86 | fet, L1, L2 = tree 87 | if fet in item: 88 | return classify(L1, item) 89 | else: 90 | return classify(L2, item) 91 | 92 | ## ---------------------------- 93 | ## - The decision jungle code - 94 | ## ---------------------------- 95 | 96 | 97 | def build_jungle(train, features, levels=20, numfeatures=100): 98 | DAG = {0: copy.copy(train)} 99 | Candidate_sets = [0] 100 | next_ID = 0 101 | M = 20 102 | 103 | for level in range(levels): 104 | result_sets = [] 105 | for tdata_idx in Candidate_sets: 106 | tdata = DAG[tdata_idx] 107 | 108 | if entropy(tdata) == 0.0: 109 | next_ID += 1 110 | idx1 = next_ID 111 | result_sets += [idx1] 112 | DAG[idx1] = tdata + [] 113 | del DAG[tdata_idx][:] 114 | DAG[tdata_idx] += [True, idx1, idx1] 115 | continue 116 | 117 | X = (split(tdata, F) for F in random.sample(features, numfeatures)) 118 | H, L1, L2, F = max(X) 119 | 120 | # Branch = (F, M1, M2) 121 | next_ID += 1 122 | idx1 = next_ID 123 | DAG[idx1] = L1 124 | next_ID += 1 125 | idx2 = next_ID 126 | DAG[idx2] = L2 127 | 128 | result_sets += [idx1, idx2] 129 | del DAG[tdata_idx][:] 130 | DAG[tdata_idx] += [F, idx1, idx2] 131 | 132 | ## Now optimize the result sets here 133 | random.shuffle(result_sets) 134 | 135 | basic = result_sets[:M] 136 | for r in result_sets[M:]: 137 | maxv = None 138 | maxi = None 139 | for b in basic: 140 | L = float(len(DAG[r] + DAG[b])) 141 | e1 = len(DAG[r]) * entropy(DAG[r]) 142 | e2 = len(DAG[b]) * entropy(DAG[b]) 143 | newe = L * entropy(DAG[r] + 
DAG[b]) 144 | score = abs(e1 + e2 - newe) 145 | if maxv is None: 146 | maxv = score 147 | maxi = b 148 | continue 149 | if score < maxv: 150 | maxv = score 151 | maxi = b 152 | DAG[maxi] += DAG[r] 153 | del DAG[r] 154 | DAG[r] = DAG[maxi] 155 | 156 | Candidate_sets = basic 157 | 158 | for tdata_idx in Candidate_sets: 159 | tdata = DAG[tdata_idx] 160 | C1 = Counter([b for _, b in tdata]) 161 | del DAG[tdata_idx][:] 162 | DAG[tdata_idx] += [None, C1] 163 | 164 | return DAG 165 | 166 | 167 | def classify_jungle(DAG, item): 168 | branch = DAG[0] 169 | while branch[0] is not None: 170 | try: 171 | fet, L1, L2 = branch 172 | if fet == True or fet in item: 173 | branch = DAG[L1] 174 | else: 175 | branch = DAG[L2] 176 | except: 177 | print len(branch) 178 | raise 179 | return branch[1] 180 | 181 | ## ------------------------- 182 | ## - Sample classification - 183 | ## ------------------------- 184 | 185 | if __name__ == "__main__": 186 | # dataEN = file("../data/pg23428.txt").read() 187 | # dataFR = file("../data/pg5711.txt").read() 188 | dataEN = file("../data/pg110.txt").read() 189 | dataFR = file("../data/pg42671.txt").read() 190 | 191 | length = 200 192 | 193 | testEN, trainEN = split_data(dataEN, label=0, length=length) 194 | testFR, trainFR = split_data(dataFR, label=1, length=length) 195 | 196 | print "training: EN=%s FR=%s" % (len(trainEN), len(trainFR)) 197 | 198 | train = trainEN + trainFR 199 | random.shuffle(train) 200 | test = testEN + testFR 201 | random.shuffle(test) 202 | 203 | ## Now make a bunch of features 204 | ## A feature is in at least 10% of strings 205 | ## but also at most in 90% of strings 206 | 207 | sometrain = random.sample(train, 1000) 208 | features = set() 209 | while len(features) < 700: 210 | fragment, _ = random.choice(sometrain) 211 | l = int(round(random.expovariate(0.20))) 212 | b = random.randint(0, max(0, length - l)) 213 | feat = fragment[b:b+l] 214 | 215 | ## Test 216 | C = 0 217 | for st, _ in sometrain: 218 | if feat in st: 219 | C += 1 220 | 221 | f = float(C) / 1000 222 | if f > 0.01 and f < 0.99 and feat not in features: 223 | features.add(feat) 224 | 225 | features = list(features) 226 | 227 | manytrees = [] 228 | jungle = [] 229 | for i in range(10): 230 | print "Build tree %s" % i 231 | size = len(train) / 3 232 | training_sample = random.sample(train, size) 233 | 234 | tree = build_jungle(training_sample, features, numfeatures=100) 235 | jungle += [tree] 236 | 237 | tree = build_tree(training_sample, features, numfeatures=100) 238 | manytrees += [tree] 239 | 240 | testdata = test 241 | results_tree = Counter() 242 | results_jungle = Counter() 243 | for item, cat in testdata: 244 | # Trees 245 | c = Counter() 246 | for tree in manytrees: 247 | c += classify(tree, item) 248 | res = (max(c, key=lambda x: c[x]), cat) 249 | results_tree.update([res]) 250 | 251 | # Jungle 252 | c = Counter() 253 | for tree in jungle: 254 | c += classify_jungle(tree, item) 255 | res = (max(c, key=lambda x: c[x]), cat) 256 | results_jungle.update([res]) 257 | 258 | print 259 | print "Results Tree Jungle" 260 | print "True positives: %4d %4d" \ 261 | % (results_tree[(1, 1)], results_jungle[(1, 1)]) 262 | print "True negatives: %4d %4d" \ 263 | % (results_tree[(0, 0)], results_jungle[(0, 0)]) 264 | print "False positives: %4d %4d" \ 265 | % (results_tree[(1, 0)], results_jungle[(1, 0)]) 266 | print "False negatives: %4d %4d" \ 267 | % (results_tree[(0, 1)], results_jungle[(0, 1)]) 268 | -------------------------------------------------------------------------------- 
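As a sanity check on the information-gain criterion that `split()` in `forests.py` implements, an illustrative example (toy chunks, not repository data; Python 2, run from inside `code/`; the numbers follow directly from the `entropy()` definition):

from forests import entropy, split

# Four toy labelled chunks: label 0 for English-like text, 1 for French-like text.
train = [("the cat sat", 0), ("the dog ran", 0), ("le chat dort", 1), ("le chien court", 1)]

print entropy(train)                  # ln(2) ~ 0.693: the two labels are perfectly mixed
gain, left, right, feat = split(train, "the ")
print gain, len(left), len(right)     # ~0.693 2 2: "the " yields two pure halves, the maximum gain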
/code/malware.py: -------------------------------------------------------------------------------- 1 | # Standard library 2 | import unittest 3 | from collections import Counter, defaultdict 4 | from multiprocessing import Pool 5 | 6 | # External libraries 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | class Record: 12 | 'The aim of this class is to hide the representation of labels and' 13 | 'records from the random forest building and classification algorithm' 14 | 15 | def __init__(self, labels, features, names=None): 16 | 'Initialize with labels and records' 17 | self.labels = labels 18 | self.features = features 19 | if names is None: 20 | self.names = np.arange(len(labels), dtype=int) 21 | else: 22 | self.names = names 23 | 24 | assert len(self.labels) == self.features.shape[0] 25 | 26 | def size(self): 27 | 'The number of records' 28 | return self.features.shape[0] 29 | 30 | def indexes(self): 31 | return self.names 32 | 33 | def label_distribution(self): 34 | 'The count of labels' 35 | return Counter(self.labels) 36 | 37 | def H(self): 38 | 'Computes the binary entropy of labelled data' 39 | v = np.array(self.label_distribution().values()) 40 | d = np.array(v) / float(sum(v)) 41 | return - sum(d * np.log(d)) 42 | 43 | def get_random_feature(self): 44 | 'Select a random feature and a random threshold' 45 | rec_num, f_num = self.features.shape 46 | fID = np.random.random_integers(0, f_num-1) 47 | record = np.random.random_integers(0, rec_num-1) 48 | value = self.features[record, fID] 49 | return (fID, value) 50 | 51 | def split_on_feature(self, feature): 52 | 'Split the records according to a feature' 53 | 54 | # Define the indicator 55 | fID, value = feature 56 | indicator = (self.features[:, fID] <= value) 57 | 58 | ## Split into two recrod sets 59 | labelsLeft = self.labels[indicator] 60 | featuresLeft = self.features[indicator, :] 61 | namesLeft = None 62 | if self.names is not None: 63 | namesLeft = self.names[indicator] 64 | 65 | labelsRight = self.labels[~indicator] 66 | featuresRight = self.features[~indicator, :] 67 | namesRight = None 68 | if self.names is not None: 69 | namesRight = self.names[~indicator] 70 | 71 | L = Record(labelsLeft, featuresLeft, namesLeft) 72 | R = Record(labelsRight, featuresRight, namesRight) 73 | 74 | ## What is the info gain? 
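        # The gain computed below is the drop in label entropy from this split:
        #   dH = H(parent) - (S_L / S) * H(L) - (S_R / S) * H(R)
        # For example, a perfectly mixed parent (H = ln 2) split into two pure
        # children (each H = 0) gives the maximum possible gain dH = ln 2.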
75 | HL, SL = L.H(), L.size() 76 | HR, SR = R.H(), R.size() 77 | Ha, Sa = self.H(), float(self.size()) 78 | 79 | dH = Ha - HL * (SL/Sa) - HR * (SR/Sa) 80 | return dH, L, R 81 | 82 | ## -------------------------- 83 | ## - The random forest code - 84 | ## -------------------------- 85 | 86 | 87 | def build_tree(records, levels=5, numfeatures=100, cutoff=0.001): 88 | 'Train a decision tree based on labeled data and features' 89 | 90 | tree = [None] 91 | i = 0 92 | candidates = [(i, records, levels)] 93 | 94 | while candidates != []: 95 | idx, records, levels = candidates.pop() 96 | 97 | if levels == 0 or records.H() == 0.0: 98 | C1 = records.label_distribution() 99 | Leaf = (None, C1) 100 | tree[idx] = Leaf 101 | else: 102 | gain = 0.0 103 | data = None 104 | for _ in xrange(numfeatures): 105 | F = records.get_random_feature() 106 | dH, L, R = records.split_on_feature(F) 107 | 108 | if gain < dH and cutoff < dH: 109 | gain = dH 110 | data = (F, L, R) 111 | 112 | if data is None: 113 | candidates.append((idx, records, 0)) 114 | continue 115 | 116 | (F, L, R) = data 117 | # print F, gain 118 | 119 | candidates.append((i+1, L, levels - 1)) 120 | candidates.append((i+2, R, levels - 1)) 121 | 122 | tree[idx] = (F, i+1, i+2) 123 | i += 2 124 | tree += [None, None] 125 | 126 | return tree 127 | 128 | def parallel_build_tree(params): 129 | (records, levels, numfeatures, cutoff, num_trees) = params 130 | R = records # Record(labels, features) 131 | root = [] 132 | 133 | for t in xrange(num_trees): 134 | print "Make tree %s" % t 135 | T = build_tree(R, levels, numfeatures, cutoff) 136 | #print T 137 | root += [T] 138 | return root 139 | 140 | 141 | class Forest: 142 | def __init__(self, trees=10, levels=5, numfeatures=100, cutoff=0.001): 143 | self.root = None 144 | 145 | self.trees = trees 146 | self.levels = levels 147 | self.numfeatures = numfeatures 148 | self.cutoff = cutoff 149 | 150 | def train(self, records, workers=7, multicore=True): 151 | p = Pool(workers) 152 | try: 153 | per_worker = int(1 + (float(self.trees) / workers)) 154 | params = [(records, self.levels, self.numfeatures, self.cutoff, per_worker)] * workers 155 | if multicore: 156 | x = p.map(parallel_build_tree, params) 157 | else: 158 | x = map(parallel_build_tree, params) 159 | forest = sum(x,[])[:self.trees] 160 | self.root = forest 161 | finally: 162 | p.close() 163 | p.join() 164 | 165 | 166 | def classify(self, features, show=False): 167 | recs, _ = features.shape 168 | result_shape = (features.shape[0], len(self.root)) 169 | scores = np.zeros(result_shape) 170 | print scores.shape 171 | R = Record(np.arange(recs, dtype=int), features) 172 | 173 | for i, T in enumerate(self.root): 174 | for idxs, result in classify(T, R): 175 | for idx in idxs.indexes(): 176 | scores[idx, i] = float(result[0]) / sum(result.values()) 177 | 178 | 179 | if show: 180 | plt.cla() 181 | plt.clf() 182 | plt.close() 183 | 184 | plt.imshow(scores, cmap=plt.cm.gray) 185 | plt.title('Scores matrix') 186 | plt.savefig(r"../scratch/tree_scores.png", bbox_inches='tight') 187 | 188 | return scores 189 | 190 | 191 | def classify(tree, records): 192 | 'Get a decision for an item using a tree' 193 | 194 | candidates = [(0, records)] 195 | 196 | while candidates != []: 197 | idx, irecords = candidates.pop() 198 | node = tree[idx] 199 | 200 | if len(node) == 2: 201 | assert node[0] is None 202 | yield (irecords, node[1]) 203 | else: 204 | F, LT, RT = tree[idx] 205 | _, L, R = irecords.split_on_feature(F) 206 | 207 | ## Check we did not drop any records 208 | 
assert irecords.size() == (L.size() + R.size()) 209 | 210 | if L.size() > 0: 211 | candidates.append((LT, L)) 212 | if R.size() > 0: 213 | candidates.append((RT, R)) 214 | 215 | def traverse(tree): 216 | candidates = [(0, [])] 217 | 218 | while candidates != []: 219 | idx, path = candidates.pop() 220 | node = tree[idx] 221 | 222 | if len(node) == 2: 223 | yield path, node[1] 224 | else: 225 | F, LT, RT = tree[idx] 226 | 227 | candidates.append((LT, path + [(F, False)])) 228 | candidates.append((RT, path + [(F, True)])) 229 | 230 | 231 | def get_features(tree): 232 | for x in tree: 233 | if len(x) == 3: 234 | yield x[0] 235 | 236 | 237 | def ROC_data(scores, labels, names, name="STD"): 238 | P = len(labels[labels==1]) 239 | N = len(labels[labels==0]) 240 | 241 | ## Make an ROC curve 242 | final_scores = np.mean(scores, axis=1) 243 | TP = [0.0] 244 | FP = [0.0] 245 | # TP = [] 246 | # FP = [] 247 | ACC = [] 248 | for sc in sorted(set(final_scores)): 249 | L = labels[final_scores <= sc] 250 | Pi = len(L[L==1]) 251 | Ni = len(L[L==0]) 252 | tp = float(Pi) / P 253 | fp = float(Ni) / N 254 | 255 | if tp == 0.0 or tp == 1.0: 256 | continue 257 | 258 | while len(TP) > 0 and (tp == TP[-1]): 259 | del TP[-1] 260 | del FP[-1] 261 | 262 | TP += [tp] 263 | FP += [fp] 264 | ACC += [(float(Pi) + (N - float(Ni)))/(P + N)] 265 | 266 | TP += [1.0] 267 | FP += [1.0] 268 | 269 | fp_regular = np.arange(0.0, 1.0, 0.001) 270 | tp_regular = np.zeros(len(fp_regular)) 271 | 272 | j = 0 273 | for i, fp_i in enumerate(fp_regular): 274 | 275 | while not (FP[j] <= fp_i < FP[j+1]): 276 | j += 1 277 | 278 | #print i, fp_i, FP[j], TP[j], FP[j+1], TP[j+1] 279 | tp_i = (1 - ((FP[j+1] - fp_i) / (FP[j+1] - FP[j]))) * (TP[j+1] - TP[j]) + TP[j] 280 | tp_regular[i] = tp_i 281 | #print tp_i 282 | 283 | return max(ACC), tp_regular, fp_regular 284 | 285 | def graph_ROC(max_ACC, TP, FP, name="STD"): 286 | aTP = np.vstack(TP) 287 | n = len(TP) 288 | mean_TP = np.mean(aTP, axis=0) 289 | stderr_TP = np.std(aTP, axis=0) / (n ** 0.5) 290 | var_TP = np.var(aTP, axis=0) 291 | max_TP = mean_TP + 3 * stderr_TP 292 | min_TP = mean_TP - 3 * stderr_TP 293 | 294 | # sTP = sum(TP) / len(TP) 295 | sFP = FP[0] 296 | print len(sFP), len(mean_TP), len(TP[0]) 297 | smax_ACC = np.mean(max_ACC) 298 | 299 | plt.cla() 300 | plt.clf() 301 | plt.close() 302 | 303 | plt.plot(sFP, mean_TP) 304 | plt.fill_between(sFP, min_TP, max_TP, color='black', alpha=0.2) 305 | plt.xlim((0,0.1)) 306 | plt.ylim((0,1)) 307 | plt.title('ROC Curve (accuracy=%.3f)' % smax_ACC) 308 | plt.xlabel('False Positive Rate') 309 | plt.ylabel('True Positive Rate') 310 | plt.savefig(r"../scratch/"+name+"_ROC_curve.pdf", bbox_inches='tight') 311 | 312 | # Write the data to the file 313 | f = file(r"../scratch/"+name+"_ROC_curve.csv", "w") 314 | f.write("FalsePositive,TruePositive,std_err, var, n\n") 315 | for fp, tp, err, var in zip(sFP, mean_TP, stderr_TP, var_TP): 316 | f.write("%s, %s, %s, %s, %s\n" % (fp, tp, err, var, n)) 317 | f.close() 318 | 319 | 320 | def ROC(scores, labels, names, name="STD"): 321 | 322 | max_ACC, TP, FP = ROC_data(scores, labels, names, name) 323 | graph_ROC([max_ACC], [TP], [FP], name) 324 | 325 | #P = len(labels[labels==1]) 326 | #N = len(labels[labels==0]) 327 | 328 | ## Save raw results in a file: 329 | #fr = file(r"../scratch/"+name+"_results.txt","w") 330 | #for s, l, n in sorted(zip(scores,labels, names), key=lambda x: np.mean(x[0])): 331 | # fr.write("%.4f\t%s\t%s\n" % (np.mean(s), int(l), n)) 332 | #fr.close() 333 | 334 | ## Make an ROC curve 335 | 336 
| # acc_max = "%.2f" % max(ACC) 337 | 338 | #plt.cla() 339 | #plt.clf() 340 | #plt.close() 341 | 342 | #plt.plot(FP, TP) 343 | #plt.xlim((0,0.1)) 344 | #plt.ylim((0,1)) 345 | #plt.title('ROC Curve (accuracy=%.2f)' % max_ACC) 346 | #plt.xlabel('False Positive Rate') 347 | #plt.ylabel('True Positive Rate') 348 | #plt.savefig(r"../scratch/"+name+"_ROC_curve.png", bbox_inches='tight') 349 | 350 | #f = file(r"../scratch/"+name+"_ROC_curve.csv", "w") 351 | #f.write("FalsePositive,TruePositive,Accuracy\n") 352 | #for fp, tp, acc in zip(FP,TP, ACC): 353 | # f.write("%s,%s,%s\n" % (fp, tp, acc)) 354 | #f.close() 355 | 356 | 357 | 358 | ## Read the csv files 359 | def read_data(labelsname, distancename): 360 | ## Extract labels 361 | rawlabels = np.genfromtxt(labelsname, delimiter=',', dtype=None) 362 | labelmap = {} 363 | row_len = 0 364 | for row in rawlabels: 365 | row_len = max(row_len, len(row)-1) 366 | name = row[0] 367 | labelmap[name] = list(row)[1:] 368 | 369 | ## Extract distances 370 | rawdistances = np.genfromtxt(distancename, delimiter=',', dtype=None) 371 | names = rawdistances[0][1:] 372 | distances = np.array(rawdistances[1:, 1:], dtype=float) 373 | labels = np.zeros((len(names), row_len)) 374 | 375 | for i, name in enumerate(names): 376 | labels[i, 0:(len(row))] = labelmap[name] 377 | 378 | del labelmap 379 | return distances, labels, names 380 | 381 | 382 | def visualizedistances(data, figname=None): 383 | D, L, N = data 384 | sorted_indexes = np.argsort(L[:,0]) 385 | 386 | D2 = D[sorted_indexes, :] 387 | D2 = D2[:, sorted_indexes] 388 | 389 | plt.cla() 390 | plt.clf() 391 | plt.close() 392 | 393 | plt.imshow(D2, cmap=plt.cm.gray) 394 | plt.title('Distance matrix') 395 | plt.savefig(figname, bbox_inches='tight') 396 | 397 | 398 | def selectsubsets(data, features=200, training=400, testing=100, fraction_negative = 0.5): 399 | D, L, N = data 400 | 401 | ## First identify the indexes of positives and negatives 402 | negatives = np.where(L[:,0] == 0)[0] 403 | positives = np.where(L[:,0] == 1)[0] 404 | 405 | Neg_training = 2 * int(training * fraction_negative) + features 406 | Pos_training = 2 * int(training * (1-fraction_negative)) + features 407 | 408 | np.random.shuffle(negatives) 409 | np.random.shuffle(positives) 410 | 411 | feature_labels = ["N%s" % n for n in negatives[:features]] + ["P%s" % p for p in positives[:features]] + ["C"] 412 | feature_set = np.hstack((negatives[:features], positives[:features])) 413 | training_set = np.hstack((negatives[features:Neg_training], positives[features:Neg_training])) 414 | 415 | test_size = testing # min(len(negatives[Neg_training:]), len(positives[Pos_training:])) 416 | 417 | test_set = np.hstack((negatives[Neg_training:Neg_training+test_size], positives[Pos_training:Pos_training+test_size])) 418 | 419 | assert len(test_set) == 2 * test_size 420 | print "Feature size: %s Training size: %s Test size: %s" % (len(feature_set), len(training_set), len(test_set)) 421 | 422 | training_records = D[training_set, :] 423 | training_records = training_records[::, feature_set] 424 | training_records = np.hstack((training_records, L[training_set, 1:])) 425 | training_labels = L[training_set, 0] 426 | 427 | #training_labels = np.zeros(len(training_set), dtype=int) 428 | #training_labels[len(training_set)/2:] = np.ones(len(training_set)/2, dtype=int) 429 | 430 | print training_records.shape, training_labels.shape 431 | 432 | test_records = D[test_set, :] 433 | test_records = test_records[::, feature_set] 434 | test_records = np.hstack((test_records, 
L[test_set, 1:])) 435 | test_labels = L[test_set, 0] 436 | 437 | #test_labels = np.zeros(len(test_set), dtype=int) 438 | #test_labels[len(test_set)/2:] = np.ones(len(test_set)/2, dtype=int) 439 | 440 | training_data = (training_set, training_records, training_labels) 441 | test_data = (test_set, test_records, test_labels) 442 | 443 | return feature_labels, feature_set, training_data, test_data 444 | 445 | 446 | class TestSeq(unittest.TestCase): 447 | 448 | def setUp(self): 449 | self.repeats = 3 # 30 450 | self.trees = 10 # 400 451 | self.features = 100 452 | self.training = 300 453 | self.testing = 300 454 | self.bias = 0.5 455 | self.proposed_features = 30 456 | 457 | def test_visualize(self): 458 | data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') 459 | 460 | D, L, N = data 461 | assert D.shape == (2000, 2000) 462 | assert len(L) == 2000 463 | assert len(L[0]) == 1 464 | assert len(N) == 2000 465 | 466 | visualizedistances(data, '../scratch/distances.png') 467 | 468 | def feature_importance_test(self, data, D, L, N, name): 469 | max_ACC, TP, FP = [], [], [] 470 | 471 | L = defaultdict(list) 472 | 473 | for _ in range(30): 474 | feature_labels, feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias) 475 | (training_set, training_records, training_labels) = training_data 476 | (test_set, test_records, test_labels) = test_data 477 | 478 | F = Forest(trees = self.trees, numfeatures = self.proposed_features) 479 | R = Record(training_labels, training_records) 480 | F.train(R) 481 | 482 | features = [] 483 | for t in F.root: 484 | features += [X for X, _ in list(get_features(t))] 485 | 486 | 487 | c = Counter(features) 488 | items = sorted(c.items(), key=lambda x: x[1], reverse=True) 489 | for l, v in items: 490 | L[feature_labels[l]].append(v) 491 | #V = [v for _, v in items] 492 | #L = [l for l, _ in items] 493 | 494 | return L 495 | 496 | # return max_ACC, TP, FP 497 | 498 | def test_features(self): 499 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 500 | 501 | D, L, N = data 502 | assert len(L[0]) == 2 503 | 504 | self.bias = 0.5 505 | self.trees = 100 506 | # self.proposed_features = 75 507 | L = self.feature_importance_test(data, D, L, N, name="FEATURES") 508 | 509 | freq = [(l, np.mean(vs)) for l, vs in L.items()] 510 | freq = sorted(freq, key=lambda x:-x[1]) 511 | 512 | labels = [l for l, _ in freq] 513 | middle = [v for _, v in freq] 514 | top = [max(L[l]) for l, _ in freq] 515 | bottom = [min(L[l]) for l, _ in freq] 516 | 517 | #print V,L 518 | 519 | plt.cla() 520 | plt.clf() 521 | plt.close() 522 | 523 | zero_elem = labels.index("C") 524 | 525 | ticks = range(len(middle)) 526 | plt.plot(ticks, middle, "b.") 527 | plt.fill_between(ticks, bottom, top, color='black', alpha=0.2) 528 | #plt.fill_between(sFP, min_TP, max_TP, color='black', alpha=0.2) 529 | plt.xlim((-25,len(middle) + 25)) 530 | plt.ylim((0,max(top)+5)) 531 | #plt.title('ROC Curve (accuracy=%.3f)' % smax_ACC) 532 | 533 | plt.plot([zero_elem], [middle[zero_elem]], 'ro') 534 | 535 | plt.xlabel('Feature') 536 | plt.ylabel('Prevalence') 537 | print "Saving ..." 
538 | #plt.show() 539 | plt.savefig(r"../scratch/Features_BAR.pdf", bbox_inches='tight') 540 | 541 | 542 | 543 | def classifier_test(self, data, D, L, N, name): 544 | repeats = self.repeats 545 | max_ACC, TP, FP = [], [], [] 546 | for _ in xrange(repeats): 547 | _, feature_set, training_data, test_data = selectsubsets(data, features=self.features, training=self.training, testing=self.testing, fraction_negative=self.bias) 548 | (training_set, training_records, training_labels) = training_data 549 | (test_set, test_records, test_labels) = test_data 550 | 551 | R = Record(training_labels, training_records, training_set) 552 | T = build_tree(R) 553 | 554 | for (r, d) in classify(T, R): 555 | assert r.label_distribution() == d 556 | 557 | F = Forest(trees = self.trees, numfeatures = self.proposed_features) 558 | R = Record(training_labels, training_records) 559 | F.train(R) 560 | scores = F.classify(test_records) 561 | 562 | max_ACCi, TPi, FPi = ROC_data(scores, test_labels, N[test_set], name) 563 | 564 | max_ACC += [ max_ACCi ] 565 | TP += [ TPi ] 566 | FP += [ FPi ] 567 | 568 | graph_ROC(max_ACC, TP, FP, name) 569 | return max_ACC, TP, FP 570 | 571 | 572 | def test_malwareanalysis(self): 573 | data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') 574 | D, L, N = data 575 | self.classifier_test(data, D, L, N, name="MATRIX_ONLY") 576 | 577 | def test_malwareanalysis_unbalanced(self): 578 | data = read_data('../data/filelabels.csv', '../data/ncdvals.csv') 579 | D, L, N = data 580 | 581 | self.bias = 0.9 582 | self.classifier_test(data, D, L, N, name="MATRIX_ONLY_BIASED") 583 | 584 | def test_malwareanalysis_compress(self): 585 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 586 | 587 | D, L, N = data 588 | assert len(L[0]) == 2 589 | 590 | self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS") 591 | 592 | def test_malwareanalysis_compress_unbalanced(self): 593 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 594 | 595 | D, L, N = data 596 | assert len(L[0]) == 2 597 | 598 | self.bias = 0.9 599 | self.classifier_test(data, D, L, N, name="MATRIX_COMPRESS_BIASED") 600 | 601 | 602 | def test_compress_only(self): 603 | data = read_data('../data/filelabels2.csv', '../data/ncdvals.csv') 604 | 605 | D, L, N = data 606 | 607 | self.features = 0 608 | self.classifier_test(data, D, L, N, name="COMPRESS_ONLY") 609 | 610 | import sys 611 | if __name__ == '__main__': 612 | # print sys.argv 613 | if len(sys.argv) > 1: 614 | print "Executing test for " + sys.argv[1] 615 | TestSeq("test_"+sys.argv[1]).run() 616 | else: 617 | unittest.main() -------------------------------------------------------------------------------- /code/symbols.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.cluster import SpectralClustering, KMeans 7 | 8 | # Always make a fuss in case of numerical error 9 | np.seterr(all="raise") 10 | 11 | 12 | ## Helper functions 13 | ## Estimate params 14 | def estimate_norm(datas): 15 | if datas.shape[0] < 2: 16 | return None, None, 0.0 17 | 18 | mp = np.mean(datas, axis=0) 19 | sp = np.cov(datas.transpose()) 20 | 21 | sign, logdet = np.linalg.slogdet(sp) 22 | if np.isnan(logdet) or np.isinf(logdet): 23 | return mp, sp, 0.0 24 | 25 | ent = sign * logdet 26 | return mp, sp, ent 27 | 28 | 29 | def split(data, feature, old_ent): 30 | len_data = float(data.shape[0]) 31 | axis, value = feature 32 | L1 = 
data[data[:, axis] <= value] 33 | L2 = data[data[:, axis] > value] 34 | _, _, entL1 = estimate_norm(L1) 35 | _, _, entL2 = estimate_norm(L2) 36 | 37 | ent = (L1.shape[0] / len_data) * entL1 + \ 38 | (L2.shape[0] / len_data) * entL2 39 | D_ent = old_ent - ent 40 | return D_ent, L1, L2, feature 41 | 42 | 43 | ## Constants 44 | PROC, BRANCH, LEAF = 0, 1, 2 45 | 46 | ## Train a clustering tree 47 | def make_tree(datas, feature): 48 | tree = [(PROC, datas)] 49 | process = [0] 50 | 51 | while len(process) > 0: 52 | next_item = process.pop(0) 53 | xtype, dat = tree[next_item] 54 | if dat.shape[0] < 50: 55 | tree[next_item] = (LEAF, dat) 56 | continue 57 | 58 | assert xtype == PROC 59 | _, _, old_ent = estimate_norm(dat) 60 | 61 | sample_features = random.sample(feature, 100) 62 | lot = [split(dat, f, old_ent) for f in sample_features] 63 | ret = max(lot, key=lambda x: x[0]) 64 | D_ent, L1, L2, feat = ret 65 | 66 | if D_ent < 1.0: 67 | tree[next_item] = (LEAF, dat) 68 | continue 69 | 70 | newID_L1 = len(tree) 71 | tree += [(PROC, L1)] 72 | newID_L2 = newID_L1 + 1 73 | tree += [(PROC, L2)] 74 | process += [newID_L1, newID_L2] 75 | 76 | tree[next_item] = (BRANCH, feat, L1, L2) 77 | return tree 78 | 79 | 80 | # Set some undelying structure here 81 | points = [(0, 2), 82 | (1, 1), 83 | (2, 0), 84 | (2, 2), 85 | (3, 1)] 86 | 87 | ## The spread of the samples around the points 88 | var = 0.03 89 | cov = np.array([[var, 0], [0, var]]) 90 | 91 | # Generate some synthetic data around the points 92 | datas = None 93 | for p in points: 94 | samples = np.random.multivariate_normal(p, cov, 100) 95 | if datas is None: 96 | datas = samples 97 | else: 98 | datas = np.concatenate([datas, samples]) 99 | 100 | # Add a splat across all data points to simulate noise 101 | mu, sig, _ = estimate_norm(datas) 102 | splat = np.random.multivariate_normal(mu, sig, 500) 103 | datas = np.concatenate([datas, splat]) 104 | 105 | # Make up some features we could split on 106 | feature = [] 107 | len_datas = datas.shape[0] 108 | for _ in range(1000): 109 | i = random.randint(0, len_datas-1) 110 | j = random.choice([0, 1]) 111 | feature += [(j, datas[i, j])] 112 | 113 | 114 | def tokey(x): 115 | return tuple(x) 116 | 117 | ## Make a profile for each data point 118 | profiles = {} 119 | keys = [] 120 | for i in range(datas.shape[0]): 121 | k = tokey(datas[i, :]) 122 | keys += [k] 123 | profiles[k] = [] 124 | 125 | ## Train a number of clustering trees 126 | NUM_TREES = 200 127 | for j in range(NUM_TREES): 128 | print "Training tree %s" % j 129 | t = make_tree(datas, feature) 130 | cluster_id = 0 131 | for item in t: 132 | if item[0] == LEAF: 133 | dat = item[1] 134 | for i in range(dat.shape[0]): 135 | k = tokey(dat[i, :]) 136 | profiles[k] += [cluster_id] 137 | cluster_id += 1 138 | 139 | ## Build a affinity matrix from co-occupancy of clusters 140 | for p in profiles: 141 | profiles[p] = np.array(profiles[p], dtype=int) 142 | 143 | covar = np.zeros((len(keys), len(keys))) 144 | for ik1, k1 in enumerate(keys): 145 | for ik2, k2 in enumerate(keys): 146 | D = float(np.sum(profiles[k1] == profiles[k2])) / NUM_TREES 147 | covar[ik1, ik2] = D 148 | 149 | ## Perform clustering on the affinity matrix 150 | clustering = SpectralClustering(affinity="precomputed", n_clusters=5) 151 | X = clustering.fit(covar) 152 | 153 | # Example of using K-means 154 | # clustering = KMeans(n_clusters=5) 155 | # X = clustering.fit(covar) 156 | 157 | ## Make a picture 158 | colors = ["r", "g", "b", "k", "c", "y", "m"] 159 | cols = [colors[i % 7] for i in 
X.labels_] 160 | plt.scatter(datas[:, 0], datas[:, 1], c=cols) 161 | plt.show() 162 | --------------------------------------------------------------------------------
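For intuition, a small illustration (made-up leaf assignments, not output of the script above) of the co-occupancy affinity that `code/symbols.py` builds before spectral clustering:

import numpy as np

# One leaf id per tree for each of three points, across four clustering trees.
profiles = {
    "a": np.array([0, 2, 1, 3]),
    "b": np.array([0, 2, 1, 0]),
    "c": np.array([5, 4, 7, 3]),
}
keys = list(profiles)
affinity = np.zeros((len(keys), len(keys)))
for i, k1 in enumerate(keys):
    for j, k2 in enumerate(keys):
        # Fraction of trees in which the two points land in the same leaf.
        affinity[i, j] = np.mean(profiles[k1] == profiles[k2])
# a and b share a leaf in 3 of 4 trees (0.75), a and c in 1 of 4 (0.25),
# b and c never (0.0); the diagonal is 1.0. SpectralClustering with
# affinity="precomputed" then clusters the points from this matrix, as above.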