├── README.md ├── md5s.zip ├── pom.xml └── src └── main ├── java-templates └── edu │ └── lps │ └── acs │ └── ml │ └── autoyara │ └── Version.java └── java └── edu └── lps └── acs └── ml └── autoyara ├── AutoYaraCluster.java ├── Bytes2Bloom.java ├── CountingBloom.java ├── CountingBloomInfo.java ├── SigCandidate.java ├── SpectralCoClusteringVBMM.java └── YaraRuleContainerConjunctive.java /README.md: -------------------------------------------------------------------------------- 1 | # AutoYara 2 | 3 | This is the java code implementing the AutoYara algorithm, from out paper [_Automatic Yara Rule Generation Using Biclustering_](https://arxiv.org/abs/2009.03779). Given a set up input files that belong to a given malware family, AutoYara can create [Yara](https://yara.readthedocs.io/en/stable/) rules from the input samples. Our testing indicates it can be successful with as few as 2 samples files, and can achieve very low false positive rates. The goal is to help analysts that need to create rules to weed out the easy families first, so that they can work on the samples that do not yield to automation. 4 | 5 | This is research code, and comes with no warranty or support. 6 | 7 | 8 | ## Quick Start 9 | 10 | You can download a pre-built binary of Autoyara from the release tab. If you have Java 11 (or greater) installed, you can get started by using the `-i` flag and providing a path to a file. If you give a folder, files will be selected from that folder recursively. Multiple files/paths can be specified using multiple `-i` arguments. 11 | 12 | ``` 13 | java -jar AutoYara.jar -i ~/family_dataset/test/azero/ 14 | ``` 15 | 16 | The final output will be written to the current directory. If you want to change the output directory or output file name, you can use `--out /path/to/name.yara` to change that. 17 | 18 | Unless you run on a few hundred files or more, the results should be done in a minute or two. The output is a standard Yara rule, like the below truncated example. 19 | ``` 20 | rule test 21 | { 22 | //Input TP Rate: 23 | //170/184 24 | strings: 25 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 2.375 Found in 24 files 26 | $x705 = { 6C 00 65 00 20 00 6E 00 6F 00 74 00 20 00 66 00 } //This might be a string? Looks like:le not f 27 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 4.0 Found in 14 files 28 | $x706 = { 44 24 04 59 5A 5E 5B C3 8D 40 00 55 8B EC 51 53 } //This might be a string? Looks like:D$YZ^[@UQS 29 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 1.9237949406953985 Found in 20 files 30 | $x1 = { 83 C4 10 C2 08 00 CC CC CC CC CC CC CC CC CC CC } 31 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 2.5 Found in 28 files 32 | $x708 = "`3d3h3l3p3t3x3|3" ascii 33 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 3.875 Found in 13 files 34 | $x709 = { 5B 8B E5 5D C3 90 55 8B EC 83 C4 F0 53 56 57 89 } //This might be a string? Looks like:[]USVW 35 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 3.202819531114783 Found in 14 files 36 | $x711 = { 00 00 00 46 69 6C 65 54 69 6D 65 54 6F 4C 6F 63 } //This might be a string? Looks like:FileTimeToLoc 37 | //Benign FP est: -0.0 Malicious FP est: -0.0 Entropy: 3.75 Found in 39 files 38 | $x5 = { 6D 3A 61 73 6D 2E 76 33 22 3E 3C 73 65 63 75 72 } //This might be a string? Looks like:m:asm.v3"> 73 | Edward Raff 74 | Richard Zak 75 | -------------------------------------------------------------------------------- /md5s.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FutureComputing4AI/AutoYara/16eb13024c833f7f498034297b19f417da737f12/md5s.zip -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | edu.lps.acs.ml 5 | AutoYara 6 | 1.0-SNAPSHOT 7 | jar 8 | 9 | 10 | jitpack.io 11 | https://jitpack.io 12 | 13 | 14 | 15 | 16 | me.tongfei 17 | progressbar 18 | 0.7.3 19 | 20 | 21 | com.github.NeuromorphicComputationResearchProgram 22 | KiloGrams 23 | -78300c4072-1 24 | 25 | 26 | dk.brics 27 | automaton 28 | 1.12-1 29 | 30 | 31 | 32 | UTF-8 33 | 11 34 | 11 35 | ${maven.build.timestamp} 36 | yyyy-MM-dd HH:mm:ss 37 | 38 | 39 | 40 | 41 | org.apache.maven.plugins 42 | maven-shade-plugin 43 | 3.2.2 44 | 45 | 46 | package 47 | 48 | shade 49 | 50 | 51 | true 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | edu.lps.acs.ml.autoyara.AutoYaraCluster 60 | ${maven.compile.source} 61 | ${maven.compile.target} 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.codehaus.mojo 69 | templating-maven-plugin 70 | 1.0-alpha-3 71 | 72 | 73 | filter-src 74 | 75 | filter-sources 76 | 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/main/java-templates/edu/lps/acs/ml/autoyara/Version.java: -------------------------------------------------------------------------------- 1 | package edu.lps.acs.ml.autoyara; 2 | 3 | public class Version { 4 | public static final String buildTime = "${timestamp}"; 5 | public static final String pomVersion = "${project.version}"; 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/AutoYaraCluster.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | import com.beust.jcommander.JCommander; 9 | import com.beust.jcommander.Parameter; 10 | import com.beust.jcommander.ParameterException; 11 | import com.beust.jcommander.converters.EnumConverter; 12 | import edu.lps.acs.ml.ngram3.NGramGeneric; 13 | import edu.lps.acs.ml.ngram3.alphabet.AlphabetGram; 14 | import edu.lps.acs.ml.ngram3.utils.FileConverter; 15 | import edu.lps.acs.ml.ngram3.utils.GZIPHelper; 16 | import java.io.BufferedInputStream; 17 | import java.io.BufferedWriter; 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.FileWriter; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.io.ObjectInputStream; 24 | import java.nio.file.FileVisitOption; 25 | import java.nio.file.Files; 26 | import java.nio.file.LinkOption; 27 | import java.nio.file.OpenOption; 28 | import java.nio.file.Path; 29 | import java.nio.file.StandardOpenOption; 30 | import java.util.ArrayList; 31 | import java.util.Arrays; 32 | import java.util.Collection; 33 | import java.util.Collections; 34 | import java.util.Comparator; 35 | import java.util.HashMap; 36 | import java.util.HashSet; 37 | import java.util.List; 38 | import java.util.Map; 39 | import java.util.Set; 40 | import java.util.SortedSet; 41 | import java.util.concurrent.ConcurrentHashMap; 42 | import java.util.concurrent.ConcurrentSkipListSet; 43 | import java.util.concurrent.atomic.AtomicBoolean; 44 | import java.util.concurrent.atomic.AtomicInteger; 45 | import java.util.function.Function; 46 | import java.util.logging.Level; 47 | import java.util.logging.Logger; 48 | import java.util.stream.BaseStream; 49 | import java.util.stream.Collectors; 50 | import java.util.stream.IntStream; 51 | import java.util.stream.Stream; 52 | import jsat.SimpleDataSet; 53 | import jsat.classifiers.DataPoint; 54 | import jsat.clustering.HDBSCAN; 55 | import jsat.clustering.VBGMM; 56 | import jsat.clustering.biclustering.Bicluster; 57 | import jsat.clustering.biclustering.SpectralCoClustering; 58 | import jsat.linear.IndexValue; 59 | import jsat.linear.SparseVector; 60 | import jsat.linear.Vec; 61 | import jsat.math.OnLineStatistics; 62 | import jsat.utils.IntList; 63 | import jsat.utils.concurrent.AtomicDouble; 64 | import me.tongfei.progressbar.ProgressBar; 65 | 66 | /** 67 | * 68 | * @author edraff 69 | */ 70 | public class AutoYaraCluster 71 | { 72 | 73 | @Parameter(names={"--false-pos-benign", "-fpb"}, description = "The maximum false-positive rate among other benign files to consider using a given signature") 74 | double false_pos_b = 0.001; 75 | 76 | @Parameter(names={"--false-pos-malicious", "-fpm"}, description = "The maximum false-positive rate among other malicious files to consider using a given signature") 77 | double false_pos_m = 0.001; 78 | 79 | @Parameter(names={"--min-support-ratio", "-msr"}, description = "The minimum fraction of input files that must be covered by an n-gram for the n-gram to be considered as a potential signature") 80 | double support_ratio = 0.5; 81 | 82 | @Parameter(names={"--min-entropy", "-me"}, description = "The minimum entropy level required of an n-gram to be considered") 83 | double min_entropy = 1.0; 84 | 85 | @Parameter(names={"--max-filter-size", "-mfs"}, description = "Maximum filter size to use in signature creation. Larger values may improve rule quality, but increase RAM usage") 86 | int max_filter_size = 214748383;//default value is a prime that will use ~1 GB of RAM 87 | 88 | @Parameter(names={"--min-support-count", "-msc"}, description = "The minimum number of files that a potential signature must catch to be considered for inclusion in the larger rule.") 89 | int support_count = 2; 90 | 91 | /** 92 | * How many different rules do we want that cover the same input files? 93 | */ 94 | @Parameter(names="--target-coverage", description = "During rule construction, how many sub-rules do you want to hit on each example? Larger values lead to larger rules.") 95 | int ways_covered = 1; 96 | 97 | 98 | @Parameter(names={"--to-keep", "-k"}, description="The number of n-gram candidates to create at every step of the process") 99 | int toKeep = 100000; 100 | 101 | @Parameter(names={"--benign", "-b"}, converter = FileConverter.class, description="Directory of bloom filters for benign files") 102 | File benign_bloom_dir = new File("benign-bytes"); 103 | 104 | @Parameter(names={"--malicious", "-m"}, converter = FileConverter.class, description="Directory of bloom filters for Malicious Files") 105 | File malicious_bloom_dir = new File("malicious-bytes"); 106 | 107 | @Parameter(names={"--fp-dirs", "-fpds"}, converter = FileConverter.class, required=false, 108 | variableArity = true, 109 | description="Directories of files to check against for false positivesas part of evaluation. These will not be used to alter the rule generated.") 110 | List fpEvalDirs = new ArrayList<>(); 111 | 112 | @Parameter(names={"--tp-dirs", "-tpds"}, converter = FileConverter.class, required=false, 113 | variableArity = true, 114 | description="Directories of files to check against for true positives as part of evaluation. These will not be used to alter the rule generated.") 115 | List tpEvalDirs = new ArrayList<>(); 116 | 117 | @Parameter(names={"--input-dir", "-i"}, converter = FileConverter.class, required=true, 118 | variableArity = true, 119 | description="Directory of files to n-gram") 120 | List inDir; 121 | 122 | @Parameter(names = "--save-all-rules", 123 | description = "If true, all yara rules created will be saved, rather " 124 | + "than just the best-found rule. This may be useful if the " 125 | + "selection heuristics do not actually select the best rule, or" 126 | + " you wish to do more testing / investigation. ") 127 | boolean save_all_rules = false; 128 | 129 | @Parameter(names = "--help", help = true) 130 | boolean help = false; 131 | 132 | @Parameter(names = "--silent") 133 | boolean silent = false; 134 | 135 | @Parameter(names = "--print-rules", description = "If true, print out the yara-rules onto the command line.") 136 | boolean print_rules = false; 137 | 138 | @Parameter(names={"--out", "-o"}, converter = FileConverter.class, 139 | description="Output file/directory. If only one rule is to be created, " 140 | + "and output is a directory, the name will be infered from the " 141 | + "first input file. If multiple rule options are to be saved, " 142 | + "the first directory in the given path will be used. Multiple " 143 | + "rules will be saved with a pre-fix of the rule size type. By " 144 | + "default, rules are writen out to the current directory. ") 145 | File out_file = null; 146 | 147 | public static void main(String... args) throws IOException 148 | { 149 | System.out.println("AutoYara version " + Version.pomVersion + ", compile date: " + Version.buildTime); 150 | AutoYaraCluster main = new AutoYaraCluster(); 151 | 152 | JCommander optionParser = JCommander.newBuilder() 153 | .addObject(main) 154 | .build(); 155 | try { 156 | optionParser.parse(args); 157 | } catch(ParameterException ex) { 158 | optionParser.usage(); 159 | return; 160 | } 161 | 162 | if(main.help) 163 | { 164 | optionParser.usage(); 165 | return; 166 | } 167 | 168 | main.run(); 169 | } 170 | 171 | public static int log2( int bits ) 172 | { 173 | if( bits == 0 ) 174 | return 0; 175 | return 31 - Integer.numberOfLeadingZeros(bits); 176 | } 177 | 178 | public void run() throws IOException 179 | { 180 | if(out_file == null) 181 | out_file = new File(inDir.get(0).getName() + ".yara"); 182 | final String name = out_file.getName().replace(".yara", ""); 183 | final File out_dir; 184 | if(out_file.isDirectory()) 185 | { 186 | out_dir = out_file; 187 | out_file = new File(out_dir, name); 188 | } 189 | else 190 | out_dir = out_file.getParentFile(); 191 | 192 | //sort from high to low 193 | SortedSet bloomSizes = new ConcurrentSkipListSet<>((a, b) -> a.compareTo(b)); 194 | collectBloomSizes(bloomSizes, benign_bloom_dir, malicious_bloom_dir); 195 | 196 | Map ben_blooms = collectBloomFilters(benign_bloom_dir); 197 | Map mal_blooms = collectBloomFilters(malicious_bloom_dir); 198 | 199 | ///////////////////////// 200 | //we now have our filters 201 | //////////////////////// 202 | 203 | List targets = getAllChildrenFiles(inDir); 204 | 205 | 206 | /** 207 | * A n-gram must occur in at least this many files to be a candidate for selection 208 | */ 209 | 210 | ///////////// 211 | //, lets find some potential yara rules! 212 | ///////////// 213 | 214 | final Collection best_rule = new ArrayList(); 215 | final AtomicDouble best_rule_coverage = new AtomicDouble(0); 216 | /** 217 | * Whether or not we meet the goal of having at least 5 terms/features 218 | * in conjunctions 219 | */ 220 | final AtomicBoolean meets_min_desired_coverage = new AtomicBoolean(false); 221 | final AtomicInteger best_rule_gram_size = new AtomicInteger(0); 222 | 223 | final Map, SigCandidate> multi_gram_working_set = new HashMap<>(); 224 | 225 | bloomSizes.stream().forEach(gram_size-> 226 | { 227 | 228 | if(best_rule_coverage.get() >= 1.0 && meets_min_desired_coverage.get()) 229 | return;//STOP, you can't get any better 230 | 231 | List finalCandidates = buildCandidateSet(targets, gram_size, ben_blooms, mal_blooms, 232 | max_filter_size, toKeep, silent, Math.max(false_pos_b, false_pos_m)); 233 | 234 | Set alreadyFrailedOn = new HashSet<>(); 235 | for(SpectralCoClustering.InputNormalization norm : SpectralCoClustering.InputNormalization.values()) 236 | { 237 | Set rows_covered = new HashSet<>(); 238 | 239 | YaraRuleContainerConjunctive yara= buildRule(finalCandidates, targets, rows_covered, name, norm, 240 | gram_size, alreadyFrailedOn); 241 | 242 | double fp_rate = fpEvalDirs.isEmpty() ? 0 : addMatchEval("False Positives:", fpEvalDirs, yara); 243 | double tp_rate = tpEvalDirs.isEmpty() ? 0 : addMatchEval("True Positives:", tpEvalDirs, yara); 244 | double input_tp_rate = addMatchEval("Input TP Rate:", inDir, yara); 245 | 246 | if(print_rules) 247 | { 248 | System.out.println(yara); 249 | // System.out.println("Selected " + toUse.size() + " grams to cover " + this_coverage); 250 | } 251 | 252 | if(save_all_rules) 253 | { 254 | try(BufferedWriter bw = new BufferedWriter(new FileWriter(new File(out_dir, name + "_" + gram_size + "_" + norm.name() + ".yara")))) 255 | { 256 | bw.write(yara.toString()); 257 | } 258 | catch (IOException ex) 259 | { 260 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 261 | } 262 | } 263 | 264 | int log_diff_gram_size = log2(gram_size)-log2(best_rule_gram_size.get()); 265 | boolean this_rule_strong = yara.minConjunctionSize() >= 5; 266 | double penalty = Math.min(yara.minConjunctionSize()/5.0, 1); 267 | 268 | if(input_tp_rate*penalty > best_rule_coverage.get() + log_diff_gram_size/100.0)//give a slight favor to smaller rules! 269 | { 270 | best_rule.clear(); 271 | best_rule.add(yara); 272 | 273 | best_rule_coverage.set(input_tp_rate*penalty); 274 | best_rule_gram_size.set(gram_size); 275 | meets_min_desired_coverage.set(this_rule_strong); 276 | } 277 | } 278 | }); 279 | 280 | if(best_rule.isEmpty()) 281 | { 282 | System.out.println("Could not create yara-rule that matched constraints :("); 283 | return; 284 | } 285 | 286 | if(!silent) 287 | System.out.println("Saving rule to " + out_file.getAbsolutePath()); 288 | try(BufferedWriter bw = new BufferedWriter(new FileWriter(out_file))) 289 | { 290 | YaraRuleContainerConjunctive yara = best_rule.stream().findFirst().get(); 291 | bw.write(yara.toString()); 292 | } 293 | 294 | 295 | } 296 | 297 | static public List getAllChildrenFiles(Path... sourceDirs) 298 | { 299 | return getAllChildrenFiles(Arrays.asList(sourceDirs).stream().map(f->f.toFile()).collect(Collectors.toList())); 300 | } 301 | 302 | static public List getAllChildrenFiles(File... sourceDirs) 303 | { 304 | return getAllChildrenFiles(Arrays.asList(sourceDirs)); 305 | } 306 | 307 | static public List getAllChildrenFiles(List sourceDirs) 308 | { 309 | 310 | 311 | List targets = sourceDirs.stream().flatMap(f-> 312 | { 313 | try 314 | { 315 | return Files.walk(f.toPath(), FileVisitOption.FOLLOW_LINKS).filter(Files::isRegularFile); 316 | } 317 | catch (IOException ex) 318 | { 319 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 320 | return new ArrayList().stream(); 321 | } 322 | }).collect(Collectors.toList()); 323 | return targets; 324 | } 325 | 326 | /** 327 | * This method does the work to create a final list of n-gram candidates to be used in the rule creation process 328 | * @param targets the list of files to create a rule that matches 329 | * @param gram_size the n-gram size to use 330 | * @param ben_blooms the set of known benign bloom filters 331 | * @param mal_blooms the set of known malicious bloom filters 332 | * @return a set of signature candidate objects 333 | */ 334 | public static List buildCandidateSet(List targets, int gram_size, 335 | Map ben_blooms, Map mal_blooms, 336 | long max_filter_size, int toKeep, boolean silent, double fp_rate) 337 | { 338 | long totalbytes = targets.stream().mapToLong(p-> 339 | { 340 | try 341 | { 342 | return Files.size(p); 343 | } 344 | catch (IOException ex) 345 | { 346 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 347 | return 0L; 348 | } 349 | }).sum(); 350 | 351 | NGramGeneric ngram = new NGramGeneric(); 352 | ngram.setAlphabetSize(256); 353 | long filter_size = Math.min(totalbytes/4, max_filter_size); 354 | ngram.setFilterSize((int) filter_size); 355 | ngram.setGramSize(gram_size); 356 | ngram.setTooKeep(toKeep); 357 | 358 | ngram.init(); 359 | 360 | wrap(targets.parallelStream(), "Finding candidate " + gram_size +"-byte sequences", silent) 361 | .forEach(p-> 362 | { 363 | try(InputStream in = new BufferedInputStream(GZIPHelper.getStream(Files.newInputStream(p)))) 364 | { 365 | ngram.hashCount(in); 366 | } 367 | catch (IOException | InterruptedException ex) 368 | { 369 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 370 | } 371 | }); 372 | 373 | 374 | ngram.finishHashCount(); 375 | 376 | wrap(targets.parallelStream(), "Finding final " + gram_size +"-byte sequences", silent) 377 | .forEach(p-> 378 | { 379 | try(InputStream in = new BufferedInputStream(GZIPHelper.getStream(Files.newInputStream(p)))) 380 | { 381 | ngram.exactCount(in); 382 | } 383 | catch (IOException ex) 384 | { 385 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 386 | } 387 | }); 388 | 389 | Map final_candidates = ngram.finishExactCount(); 390 | //We now have a set of potential n-grams to use as yara rules. 391 | //Lets go through and remove non-vaiable candidates 392 | 393 | CountingBloom ben_bloom = ben_blooms.get(gram_size); 394 | CountingBloom mal_bloom = mal_blooms.get(gram_size); 395 | //your FP rates are too high! Remove them! 396 | final_candidates.entrySet().removeIf(e-> 397 | { 398 | AlphabetGram candidate = e.getKey(); 399 | 400 | int num_00_ff = 0; 401 | for(int i = 0; i < candidate.size(); i++) 402 | if(candidate.get(i) == 0x00 || candidate.get(i) == 0xFF) 403 | num_00_ff++; 404 | if(num_00_ff > candidate.size()/2) 405 | return true; 406 | 407 | double ben_fp = ben_bloom.get(candidate) / (double) ben_bloom.divisor; 408 | double mal_fp = mal_bloom.get(candidate) / (double) mal_bloom.divisor; 409 | return ben_fp > fp_rate || mal_fp > fp_rate; 410 | }); 411 | 412 | //Now we need to scan the data again. We have rough hit rates 413 | //but some of our input rules may be very corelated with eachother 414 | //so lets figure that out 415 | 416 | Map> files_occred_in = new HashMap<>(); 417 | final_candidates.keySet().forEach(k->files_occred_in.put(k, new ConcurrentSkipListSet())); 418 | 419 | AtomicInteger simpleID = new AtomicInteger(); 420 | wrap(targets.parallelStream(), "Determining co-occurance of " + gram_size +"-byte sequences", silent) 421 | .forEach(p-> 422 | { 423 | try(InputStream in = new BufferedInputStream(GZIPHelper.getStream(Files.newInputStream(p)))) 424 | { 425 | ngram.incrementConuts(in, simpleID.getAndIncrement(), files_occred_in); 426 | } 427 | catch (IOException ex) 428 | { 429 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 430 | } 431 | }); 432 | 433 | 434 | Map> cur_working_set = new ConcurrentHashMap<>(files_occred_in.size()); 435 | 436 | //Populate current working set to try and filter down. 437 | files_occred_in.entrySet().parallelStream().forEach(e-> 438 | { 439 | AlphabetGram candidate = e.getKey(); 440 | double ben_fp = ben_bloom.get(candidate) / (double) ben_bloom.divisor; 441 | double mal_fp = mal_bloom.get(candidate) / (double) mal_bloom.divisor; 442 | cur_working_set.put(new SigCandidate(candidate, ben_fp, mal_fp, e.getValue()), e.getValue()); 443 | }); 444 | 445 | 446 | List sigCandidates = Collections.EMPTY_LIST; 447 | sigCandidates = cur_working_set.keySet().parallelStream() 448 | .filter(s->s.getEntropy()>1.0) 449 | .collect(Collectors.toList()); 450 | 451 | if(sigCandidates.isEmpty())//We need to try a different n-gram size 452 | return Collections.EMPTY_LIST; 453 | 454 | //Create a final candidate list, remove signatures that have 100% correlation with others 455 | List finalCandidates = new ArrayList<>(); 456 | Map, List> coverageGrouped = new ConcurrentHashMap<>(); 457 | //done i parallel b/c hashing cost on the Set can be a bit pricy 458 | sigCandidates.parallelStream().forEach(s-> 459 | { 460 | //slightly odd call structure ensures we don't fall victim to any race condition 461 | coverageGrouped.putIfAbsent(s.coverage, new ArrayList<>()); 462 | List storage = coverageGrouped.get(s.coverage); 463 | 464 | //copute entropy now in parallel for use later 465 | synchronized(storage) 466 | { 467 | storage.add(s); 468 | } 469 | }); 470 | 471 | for(List group : coverageGrouped.values()) 472 | { 473 | //Pick gram with maximum entropy 474 | if(!group.isEmpty()) 475 | finalCandidates.add(Collections.max(group, (SigCandidate arg0, SigCandidate arg1) -> Double.compare(arg0.getEntropy(), arg1.getEntropy()))); 476 | } 477 | 478 | return finalCandidates; 479 | } 480 | 481 | /** 482 | * 483 | * @param header A string to add to the begining of the comment for the results 484 | * @param evalDirs the list of directories to perform evaluations on 485 | * @param yara the yara rule to evaluate, for which we will add comments to the Yara rule with the rules on each directory 486 | * @return the match rate against all files in the given directories 487 | */ 488 | public static double addMatchEval(String header, List evalDirs, YaraRuleContainerConjunctive yara) 489 | { 490 | //Lets check against false positive directories to make sure all is kosher in the world 491 | if(!evalDirs.isEmpty()) 492 | { 493 | double numer = 0; 494 | double denom = 0; 495 | StringBuilder comment = new StringBuilder(); 496 | comment.append(header).append("\n"); 497 | 498 | /** 499 | * If there are sub folders, we will add comments to delineate by 500 | * folder what the hits where. If this is just a list of files, we 501 | * will change naming style of the comment. 502 | */ 503 | boolean added_based_on_folders = false; 504 | List looseFiles = new ArrayList<>(); 505 | for(File dir : evalDirs) 506 | { 507 | if(dir.isFile()) 508 | { 509 | looseFiles.add(dir); 510 | continue; 511 | } 512 | 513 | try 514 | { 515 | List toTest = Files.walk(dir.toPath(), FileVisitOption.FOLLOW_LINKS) 516 | .filter(Files::isRegularFile).collect(Collectors.toList()); 517 | for(Path p : toTest) 518 | looseFiles.add(p.toFile()); 519 | if(!toTest.isEmpty()) 520 | continue; 521 | comment.append(dir.getAbsoluteFile() + ":"); 522 | List fps = toTest.parallelStream().filter(p-> 523 | { 524 | try(BufferedInputStream bis = new BufferedInputStream(GZIPHelper.getStream(Files.newInputStream(p)))) 525 | { 526 | return yara.match(bis); 527 | } 528 | catch (IOException ex) 529 | { 530 | return false; 531 | } 532 | }).collect(Collectors.toList()); 533 | 534 | denom += toTest.size(); 535 | numer += fps.size(); 536 | comment.append(fps.size() + "/" + toTest.size() + "\n"); 537 | added_based_on_folders = true; 538 | if(!fps.isEmpty()) 539 | { 540 | //TODO, write out the files that we FPd on 541 | } 542 | 543 | } 544 | catch (IOException ex) 545 | { 546 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 547 | } 548 | 549 | yara.addComment(comment.toString()); 550 | } 551 | 552 | //The loose files now get done in one go 553 | List fps = looseFiles.parallelStream().filter(p-> 554 | { 555 | try(BufferedInputStream inputStream = new BufferedInputStream(GZIPHelper.getStream(Files.newInputStream(p.toPath())))) 556 | { 557 | return yara.match(inputStream); 558 | } 559 | catch (IOException ex) 560 | { 561 | return false; 562 | } 563 | }).collect(Collectors.toList()); 564 | 565 | denom += looseFiles.size(); 566 | numer += fps.size(); 567 | if(added_based_on_folders) 568 | comment.append("Other Files:"); 569 | //else, its not "other", but all 570 | comment.append(fps.size() + "/" + looseFiles.size() + "\n"); 571 | if(!fps.isEmpty()) 572 | { 573 | //TODO, write out the files that we FPd on 574 | } 575 | yara.addComment(comment.toString()); 576 | 577 | return numer/denom; 578 | } 579 | else 580 | return 1.0; 581 | } 582 | 583 | static public YaraRuleContainerConjunctive buildRule(List finalCandidates, List targets, Set rows_covered, final String name, SpectralCoClustering.InputNormalization normalization, int gram_size, Set alreadyFailedOn) 584 | { 585 | int D = finalCandidates.size(); 586 | int N = targets.size(); 587 | YaraRuleContainerConjunctive yara = new YaraRuleContainerConjunctive(N, name); 588 | if(D == 0)//Nothing to do :( 589 | return yara; 590 | // System.out.println("We have " + D + " potential features"); 591 | //Lets build a dataset object representing the files and which signatures (features) occured in each 592 | List dataRep = new ArrayList<>(); 593 | for(int i = 0; i < N; i++) 594 | dataRep.add(new SparseVector(D, 4)); 595 | for(int d = 0; d < D; d++) 596 | { 597 | for(int i : finalCandidates.get(d).coverage) 598 | dataRep.get(i).set(d, 1.0); 599 | } 600 | SimpleDataSet sigDataset = new SimpleDataSet(dataRep.stream().map(v->new DataPoint(v)).collect(Collectors.toList())); 601 | List> conjuction_sets = new ArrayList<>(); 602 | // 603 | int min_rows = 5; 604 | int min_features = 5; 605 | List> row_clusters = new ArrayList<>(); 606 | List> col_clusters = new ArrayList<>(); 607 | 608 | if(D == 1)//not much to do... 609 | { 610 | col_clusters.add(IntList.range(D)); 611 | row_clusters.add(IntList.range(N)); 612 | } 613 | else 614 | { 615 | try 616 | { 617 | getCoClustering(sigDataset, row_clusters, col_clusters, normalization); 618 | } 619 | catch(Exception ex) 620 | { 621 | } 622 | 623 | if(!alreadyFailedOn.contains(gram_size) && (row_clusters.isEmpty() || col_clusters.isEmpty())) 624 | { 625 | alreadyFailedOn.add(gram_size); 626 | row_clusters.clear(); 627 | col_clusters.clear(); 628 | try 629 | { 630 | getCoClusteringH(sigDataset, row_clusters, col_clusters); 631 | } 632 | catch(Exception ex2) 633 | { 634 | //we give up 635 | row_clusters.clear(); 636 | col_clusters.clear(); 637 | } 638 | } 639 | } 640 | 641 | int max_row_size_seen = row_clusters.stream().mapToInt(r->r.size()).max().orElse(1); 642 | if(max_row_size_seen < min_rows) 643 | min_rows = max_row_size_seen; 644 | List feature_counts_all = new ArrayList<>(); 645 | //in ordr to know the true minimum count, we need to first perform a 646 | //filtering based on the counts, becasue we will perform a filtering 647 | //later. So lets collect this information now 648 | for(List row_c : row_clusters) 649 | { 650 | int[] feature_counts = new int[D]; 651 | for(int i: row_c) 652 | for(IndexValue iv : sigDataset.getDataPoint(i).getNumericalValues()) 653 | feature_counts[iv.getIndex()]++; 654 | feature_counts_all.add(feature_counts); 655 | } 656 | int max_features_seen = IntStream.range(0, row_clusters.size()).map(c-> 657 | { 658 | int C_size = row_clusters.get(c).size(); 659 | int[] feature_counts = feature_counts_all.get(c); 660 | return (int)col_clusters.get(c).stream().filter(j->feature_counts[j] >= 0.5*C_size).count(); 661 | }).max().orElse(1); 662 | if(max_features_seen < min_features)//if we didn't mean the min, subtract an extra b/c otherwise we need the min count to hit the max rows, which is not likely 663 | min_features = Math.max(max_features_seen-1, 1); 664 | 665 | for (int c = 0; c < row_clusters.size(); c++) 666 | { 667 | int C_size = row_clusters.get(c).size(); 668 | if(C_size < min_rows) 669 | continue; 670 | int[] feature_counts = feature_counts_all.get(c); 671 | 672 | //First, lets remove obvious non-starters. You need to appear in at least half the files in your cluster 673 | Set selected_features = new HashSet<>(col_clusters.get(c)); 674 | 675 | //We are only going to consider features that occur in >= 50% of this cluster 676 | selected_features.removeIf(j-> feature_counts[j] < 0.5*C_size); 677 | 678 | if(selected_features.size() < min_features) 679 | continue; 680 | 681 | conjuction_sets.add(selected_features); 682 | 683 | 684 | //how many files have at least X of these features? 685 | List file_occurance_counts = new ArrayList<>(); 686 | rows_covered.addAll(row_clusters.get(c)); 687 | for(int i : row_clusters.get(c)) 688 | { 689 | int nnz_in_bic = 0; 690 | for(IndexValue iv : dataRep.get(i)) 691 | if(selected_features.contains(iv.getIndex()) && iv.getValue() > 0) 692 | nnz_in_bic++; 693 | file_occurance_counts.add(nnz_in_bic); 694 | } 695 | 696 | Collections.sort(file_occurance_counts); 697 | OnLineStatistics right_portion = new OnLineStatistics(); 698 | for(int count : file_occurance_counts) 699 | right_portion.add(count); 700 | 701 | OnLineStatistics left_portion = new OnLineStatistics(); 702 | double min_score = Double.POSITIVE_INFINITY; 703 | int indx = 0; 704 | for(int i = 0; i < file_occurance_counts.size()-1; i++) 705 | { 706 | int count_i = file_occurance_counts.get(i); 707 | left_portion.add(count_i, 1.0); 708 | right_portion.remove(count_i, 1.0); 709 | //same value check, keep shifting while the value stays the same 710 | while(i < file_occurance_counts.size()-1 && count_i == file_occurance_counts.get(i+1)) 711 | { 712 | i++; 713 | left_portion.add(count_i, 1.0); 714 | right_portion.remove(count_i, 1.0); 715 | } 716 | 717 | 718 | double cur_score = left_portion.getVarance()*left_portion.getSumOfWeights() 719 | + right_portion.getVarance() * right_portion.getSumOfWeights(); 720 | 721 | if(cur_score < min_score) 722 | { 723 | indx = i; 724 | min_score = cur_score; 725 | } 726 | } 727 | 728 | 729 | int count_min = file_occurance_counts.get(Math.min(indx+1, file_occurance_counts.size()-1)); 730 | 731 | yara.addSignature(count_min, selected_features.stream().map(i->finalCandidates.get(i)).collect(Collectors.toSet())); 732 | 733 | } 734 | return yara; 735 | } 736 | 737 | private static void getCoClusteringH(SimpleDataSet sigDataset, List> rows, List> cols) 738 | { 739 | int D = sigDataset.getNumFeatures(); 740 | int[] cluster_assingments = getClustering(sigDataset); 741 | int num_clusters = IntStream.of(cluster_assingments).max().getAsInt()+1; 742 | for (int c = 0; c < num_clusters; c++) 743 | { 744 | IntList row = new IntList(); 745 | 746 | int[] feature_counts = new int[D]; 747 | for(int i = 0; i < cluster_assingments.length; i++) 748 | if(cluster_assingments[i] == c) 749 | { 750 | row.add(i); 751 | for(IndexValue iv : sigDataset.getDataPoint(i).getNumericalValues()) 752 | feature_counts[iv.getIndex()]++; 753 | } 754 | final int n_c =row.size(); 755 | 756 | //First, lets remove obvious non-starters. You need to appear in at least half the files in your cluster 757 | Set selected_features = IntStream.range(0, D) 758 | .filter(j->feature_counts[j] >= n_c*0.5) 759 | .boxed().collect(Collectors.toSet()); 760 | 761 | if(selected_features.isEmpty()) 762 | continue; 763 | //TODO, what if no-one satisfied the above? 764 | 765 | //Lets reduce to a set of features that occur at least as frequently as the median count 766 | int[] counts = selected_features.stream().mapToInt(i->feature_counts[i]).toArray(); 767 | Arrays.sort(counts); 768 | int median_feature_count = counts[counts.length/2]; 769 | selected_features.removeIf(j->feature_counts[j] < median_feature_count); 770 | 771 | 772 | if(!selected_features.isEmpty()) 773 | { 774 | 775 | } 776 | 777 | rows.add(row); 778 | cols.add(new IntList(selected_features)); 779 | 780 | } 781 | } 782 | 783 | static private void getCoClustering(SimpleDataSet sigDataset, List> rows, List> cols, SpectralCoClustering.InputNormalization norm) 784 | { 785 | SpectralCoClusteringVBMM bc = new SpectralCoClusteringVBMM(); 786 | bc.inputNormalization = norm; 787 | bc.bicluster(sigDataset, true, rows, cols); 788 | // SpectralCoClustering bc = new SpectralCoClustering(); 789 | // bc.setBaseClusterAlgo(new VBGMM()); 790 | // bc.bicluster(sigDataset, true, rows, cols); 791 | // int min_pts = 15; 792 | // while(rows.isEmpty() && min_pts > 3) 793 | // { 794 | // System.out.println("trying " + min_pts); 795 | // bc.setBaseClusterAlgo(new HDBSCAN(min_pts--)); 796 | // bc.bicluster(sigDataset, true, rows, cols); 797 | // } 798 | } 799 | 800 | private static int[] getClustering(SimpleDataSet sigDataset) 801 | { 802 | int[] designations = new int[sigDataset.size()]; 803 | Arrays.fill(designations, -1); 804 | //Ok, lets do some clustering and try to find good feature set intersections 805 | for (int min_pts : new int[]{15, 10, 5}) 806 | { 807 | HDBSCAN cluster_algo = new HDBSCAN(min_pts); 808 | cluster_algo.cluster(sigDataset, true, designations); 809 | int clusters = IntStream.of(designations).max().getAsInt()+1; 810 | if(clusters > 0) 811 | return designations; 812 | } 813 | return designations; 814 | } 815 | 816 | static public > Stream wrap(Stream stream, String task, boolean silent) 817 | { 818 | if(silent) 819 | return stream; 820 | else 821 | return ProgressBar.wrap(stream, task); 822 | } 823 | 824 | public > Stream wrap(Stream stream, String task) 825 | { 826 | return wrap(stream, task, silent); 827 | } 828 | 829 | /** 830 | * 831 | * @param bloom_dir the directory that contains bloom filters 832 | * @return a map where the key is the n-gram size, and the value is the corresponding bloom filter. 833 | * @throws IOException 834 | */ 835 | public static Map collectBloomFilters(File bloom_dir) throws IOException 836 | { 837 | Map blooms = new HashMap<>(); 838 | Files.walk(bloom_dir.toPath(), FileVisitOption.FOLLOW_LINKS) 839 | //just bloom filters 840 | .filter(p->p.getFileName().toString().endsWith(".bloom")) 841 | .forEach(p-> 842 | { 843 | try(ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(Files.newInputStream(p)))) 844 | { 845 | CountingBloom bloom = (CountingBloom) ois.readObject(); 846 | int gram_size = bloomNameToGramSize(p.getFileName().toString()); 847 | blooms.put(gram_size, bloom); 848 | } 849 | catch (IOException | ClassNotFoundException ex) 850 | { 851 | Logger.getLogger(AutoYaraCluster.class.getName()).log(Level.SEVERE, null, ex); 852 | } 853 | }); 854 | return blooms; 855 | } 856 | 857 | public static void collectBloomSizes(SortedSet bloomSizes, File... dirs) throws IOException 858 | { 859 | for(File dir : dirs) 860 | Files.walk(dir.toPath(), FileVisitOption.FOLLOW_LINKS) 861 | //just bloom filters 862 | .map(p->p.getFileName().toString()) 863 | .filter(s->s.endsWith(".bloom")) 864 | .filter(s->s.matches(".+_\\d+\\.bloom"))//name is formated as "name_size.bloom" 865 | .mapToInt(s->bloomNameToGramSize(s)) 866 | .forEach(bloomSizes::add); 867 | } 868 | 869 | public static int bloomNameToGramSize(String s) throws NumberFormatException 870 | { 871 | String[] tmp = s.replace(".bloom", "").split("_"); 872 | return Integer.parseInt(tmp[tmp.length-1]); 873 | } 874 | } 875 | -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/Bytes2Bloom.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | import com.beust.jcommander.JCommander; 9 | import com.beust.jcommander.Parameter; 10 | import com.beust.jcommander.ParameterException; 11 | import edu.lps.acs.ml.ngram3.NGramGeneric; 12 | import edu.lps.acs.ml.ngram3.alphabet.AlphabetGram; 13 | import edu.lps.acs.ml.ngram3.utils.FileConverter; 14 | import edu.lps.acs.ml.ngram3.utils.GZIPHelper; 15 | import java.io.BufferedInputStream; 16 | import java.io.BufferedOutputStream; 17 | import java.io.File; 18 | import java.io.FileInputStream; 19 | import java.io.FileOutputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.io.ObjectOutputStream; 23 | import java.nio.file.FileVisitOption; 24 | import java.nio.file.Files; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.concurrent.atomic.AtomicInteger; 28 | import java.util.logging.Level; 29 | import java.util.logging.Logger; 30 | import java.util.stream.Collectors; 31 | import static java.lang.Math.*; 32 | import java.util.stream.Stream; 33 | import me.tongfei.progressbar.ProgressBar; 34 | import me.tongfei.progressbar.ProgressBarStyle; 35 | 36 | /** 37 | * 38 | * @author edraff 39 | */ 40 | public class Bytes2Bloom 41 | { 42 | @Parameter(names="--filter-size") 43 | int filterSize = Integer.MAX_VALUE - 18; 44 | 45 | @Parameter(names={"--false-pos-rate", "-fp"}) 46 | double false_pos = 0.01; 47 | 48 | @Parameter(names={"--progress-bar", "-pb"}) 49 | boolean pb_bars = false; 50 | 51 | @Parameter(names="--name", description="NGrams to keep") 52 | String out_name; 53 | 54 | @Parameter(names={"--too-keep", "-k"}, required=true, description="NGrams to keep") 55 | int tooKeep; 56 | @Parameter(names={"--ngram-size", "-n"}, required=true, description="Sizes of ngrams") 57 | List gramSizes; 58 | 59 | @Parameter(names={"--input-dir", "-i"}, converter = FileConverter.class, required=true, description="Directory of files to n-gram") 60 | File inDir; 61 | 62 | @Parameter(names={"--out", "-o"}, converter = FileConverter.class, required=true, description="Output file") 63 | File outDir; 64 | 65 | public static void main(String... args) throws IOException 66 | { 67 | System.out.println("AutoYara version " + Version.pomVersion + ", compile date: " + Version.buildTime); 68 | Bytes2Bloom main = new Bytes2Bloom(); 69 | 70 | JCommander optionParser = JCommander.newBuilder() 71 | .addObject(main) 72 | .build(); 73 | try { 74 | optionParser.parse(args); 75 | } catch(ParameterException ex) { 76 | optionParser.usage(); 77 | return; 78 | } 79 | 80 | 81 | main.run(); 82 | } 83 | 84 | public void run() throws IOException 85 | { 86 | if(out_name == null) 87 | out_name = inDir.getName(); 88 | 89 | final int filter_slots = (int) ceil((tooKeep * log(false_pos)) / log(1 / pow(2, log(2)))); 90 | final int filter_hashes = (int) round((filter_slots / (double)tooKeep) * log(2)); 91 | 92 | /** 93 | * All the files we will be running n-grams over 94 | */ 95 | List allFiles = Files.walk(inDir.toPath(), FileVisitOption.FOLLOW_LINKS) 96 | .parallel() 97 | .filter(p->!Files.isDirectory(p)) 98 | .map(p->p.toFile()) 99 | .collect(Collectors.toList()); 100 | 101 | gramSizes.forEach(gram_size-> 102 | { 103 | NGramGeneric ngram = new NGramGeneric(); 104 | ngram.setAlphabetSize(256); 105 | ngram.setFilterSize(filterSize); 106 | ngram.setGramSize(gram_size); 107 | ngram.setTooKeep(tooKeep); 108 | 109 | System.out.println("Starting " + gram_size + "-grams of " + allFiles.size() + " files..."); 110 | 111 | ngram.init(); 112 | 113 | Stream stream = allFiles.parallelStream(); 114 | 115 | if(pb_bars) 116 | stream = ProgressBar.wrap(stream, "Hash-Pass"); 117 | 118 | stream.forEach(f-> 119 | { 120 | try(InputStream is = new BufferedInputStream(GZIPHelper.getStream(new FileInputStream(f)))) 121 | { 122 | ngram.hashCount(is); 123 | } 124 | catch (IOException | InterruptedException ex) 125 | { 126 | Logger.getLogger(Bytes2Bloom.class.getName()).log(Level.SEVERE, null, ex); 127 | } 128 | 129 | }); 130 | 131 | System.out.println("Finding top-k hashes"); 132 | ngram.finishHashCount(); 133 | 134 | stream = allFiles.parallelStream(); 135 | if(pb_bars) 136 | stream = ProgressBar.wrap(stream, "Exact-Pass"); 137 | stream.forEach(f-> 138 | { 139 | try(InputStream is = new BufferedInputStream(GZIPHelper.getStream(new FileInputStream(f)))) 140 | { 141 | ngram.exactCount(is); 142 | } 143 | catch (IOException ex) 144 | { 145 | Logger.getLogger(Bytes2Bloom.class.getName()).log(Level.SEVERE, null, ex); 146 | } 147 | 148 | }); 149 | 150 | Map found_grams = ngram.finishExactCount(); 151 | CountingBloom bloom = new CountingBloom(filter_slots, filter_hashes); 152 | bloom.divisor = allFiles.size(); 153 | 154 | for(Map.Entry entry : found_grams.entrySet()) 155 | bloom.put(entry.getKey(), entry.getValue().get()); 156 | 157 | try(ObjectOutputStream out = new ObjectOutputStream( 158 | new BufferedOutputStream(new FileOutputStream( 159 | new File(outDir, out_name + "_" + gram_size + ".bloom"))))) 160 | { 161 | out.writeObject(bloom); 162 | } 163 | catch (IOException ex) 164 | { 165 | Logger.getLogger(Bytes2Bloom.class.getName()).log(Level.SEVERE, null, ex); 166 | } 167 | 168 | }); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/CountingBloom.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | import java.io.IOException; 9 | import java.io.ObjectInputStream; 10 | import java.io.ObjectOutputStream; 11 | import java.io.Serializable; 12 | import java.util.Random; 13 | 14 | /** 15 | * 16 | * @author edraff 17 | */ 18 | public class CountingBloom implements Serializable 19 | { 20 | static final long serialVersionUID = -5941164052778931036L; 21 | 22 | /** 23 | * default base value should be good to reach 2^32 24 | */ 25 | double base = 1.090878326190223750496427194244941608437246127397209913749; 26 | double log_base = 0.086983175599679411377848736810437843836925507056973208359; 27 | byte[] counts; 28 | 29 | int added = 0; 30 | int divisor = 0; 31 | 32 | int[] hashSeeds; 33 | 34 | public CountingBloom() 35 | { 36 | this(0, 0); 37 | } 38 | 39 | public int getNumEntries() 40 | { 41 | return added; 42 | } 43 | 44 | public CountingBloom(int slots, int hashFunctions) 45 | { 46 | counts = new byte[slots]; 47 | Random r = new Random(System.currentTimeMillis()); 48 | hashSeeds = new int[hashFunctions]; 49 | for (int i = 0; i < hashFunctions; ++i) 50 | hashSeeds[i] = r.nextInt(); 51 | } 52 | 53 | public void put(Object a, int raw_count) 54 | { 55 | int init_hash = a.hashCode(); 56 | 57 | //to make this more compact, we store only the exponent of the value, 58 | //to a specific base. 59 | byte expo_count = (byte) Math.min(Math.ceil(Math.log(raw_count)/log_base), 255); 60 | 61 | for(int i = 0; i < hashSeeds.length; i++) 62 | { 63 | int h = hash6432shift( (((long) hashSeeds[i]) << 32) | init_hash); 64 | h = Integer.remainderUnsigned(h, counts.length); 65 | 66 | counts[h] = (byte) Math.max(Byte.toUnsignedInt(counts[h]), expo_count); 67 | } 68 | 69 | added++; 70 | } 71 | 72 | public int get(Object a) 73 | { 74 | int init_hash = a.hashCode(); 75 | 76 | int min_expo = 257; 77 | for(int i = 0; i < hashSeeds.length; i++) 78 | { 79 | int h = hash6432shift( (((long) hashSeeds[i]) << 32) | init_hash); 80 | h = Integer.remainderUnsigned(h, counts.length); 81 | 82 | min_expo = Math.min(Byte.toUnsignedInt(counts[h]), min_expo); 83 | } 84 | 85 | if(min_expo == 0) 86 | return 0; 87 | else 88 | return (int) Math.pow(base, min_expo); 89 | } 90 | 91 | public double lowestNonZeroCount() 92 | { 93 | int min_expo = 257; 94 | 95 | for(byte i : counts) 96 | { 97 | int v = Byte.toUnsignedInt(i); 98 | if(v != 0) 99 | min_expo = Math.min(v, min_expo); 100 | } 101 | 102 | return (int) Math.pow(base, min_expo); 103 | } 104 | 105 | /** 106 | * 107 | * @param key 108 | * @return 109 | * @see https://gist.github.com/badboy/6267743 110 | */ 111 | static public int hash6432shift(long key) 112 | { 113 | key = (~key) + (key << 18); // key = (key << 18) - key - 1; 114 | key = key ^ (key >>> 31); 115 | key = key * 21; // key = (key + (key << 2)) + (key << 4); 116 | key = key ^ (key >>> 11); 117 | key = key + (key << 6); 118 | key = key ^ (key >>> 22); 119 | return (int) key; 120 | } 121 | 122 | private void writeObject(ObjectOutputStream oos) throws IOException 123 | { 124 | oos.defaultWriteObject(); 125 | oos.writeInt(divisor); 126 | oos.writeInt(added); 127 | oos.writeDouble(base); 128 | oos.writeDouble(log_base); 129 | 130 | 131 | oos.writeInt(counts.length); 132 | for(int i = 0; i < counts.length; i++) 133 | oos.writeByte(counts[i]); 134 | 135 | oos.writeInt(hashSeeds.length); 136 | for(int i = 0; i < hashSeeds.length; i++) 137 | oos.writeInt(hashSeeds[i]); 138 | } 139 | 140 | private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException 141 | { 142 | ois.defaultReadObject(); 143 | 144 | divisor = ois.readInt(); 145 | added = ois.readInt(); 146 | base = ois.readDouble(); 147 | log_base = ois.readDouble(); 148 | 149 | counts = new byte[ois.readInt()]; 150 | for(int i = 0; i < counts.length; i++) 151 | counts[i] = ois.readByte(); 152 | 153 | hashSeeds = new int[ois.readInt()]; 154 | for(int i = 0; i < hashSeeds.length; i++) 155 | hashSeeds[i] = ois.readInt(); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/CountingBloomInfo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | import static edu.lps.acs.ml.autoyara.AutoYaraCluster.bloomNameToGramSize; 9 | import java.io.BufferedInputStream; 10 | import java.io.File; 11 | import java.io.IOException; 12 | import java.io.ObjectInputStream; 13 | import java.nio.file.Files; 14 | import java.nio.file.Path; 15 | 16 | /** 17 | * 18 | * @author rjzak 19 | */ 20 | public class CountingBloomInfo { 21 | public static void main(String[] args) { 22 | if (args.length != 1) { 23 | System.err.println("Missing path to bloom filter file."); 24 | System.exit(1); 25 | } 26 | 27 | Path bloomPath = new File(args[0]).toPath(); 28 | CountingBloom bloom = null; 29 | int gram_size = -1; 30 | try(ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(Files.newInputStream(bloomPath)))) { 31 | bloom = (CountingBloom) ois.readObject(); 32 | gram_size = bloomNameToGramSize(bloomPath.getFileName().toString()); 33 | } 34 | catch (IOException | ClassNotFoundException ex) { 35 | System.err.println("Failed to read " + bloomPath + ": " + ex.getMessage()); 36 | System.exit(1); 37 | } 38 | 39 | if (bloom == null) { 40 | System.err.println("Failed to load bloom filter."); 41 | System.exit(1); 42 | } 43 | 44 | System.out.println("N-Gram size: " + gram_size); 45 | System.out.println("Number of entries: " + bloom.getNumEntries()); 46 | System.out.println("Divisor: " + bloom.divisor); 47 | System.out.println("Hash functions: " + bloom.hashSeeds.length); 48 | System.out.println("Number of slots: " + bloom.counts.length); 49 | System.out.println("Lowest non-zero count: " + bloom.lowestNonZeroCount()); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/SigCandidate.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | import edu.lps.acs.ml.ngram3.alphabet.AlphabetGram; 9 | import java.util.Collections; 10 | import java.util.Comparator; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.Set; 14 | 15 | /** 16 | * 17 | * @author edraff 18 | */ 19 | public class SigCandidate 20 | { 21 | public enum Priority 22 | { 23 | ENTROPY 24 | { 25 | @Override 26 | int cmp(SigCandidate a, SigCandidate b, int[] curCoverage, int targetCover, double max_b_fp, double max_m_fp) 27 | { 28 | //negative b/c we are assuming higher entropy is better 29 | return -Double.compare(sigEntropy(a), sigEntropy(b)); 30 | } 31 | 32 | }, 33 | TOTAL_FP 34 | { 35 | @Override 36 | int cmp(SigCandidate a, SigCandidate b, int[] curCoverage, int targetCover, double max_b_fp, double max_m_fp) 37 | { 38 | max_b_fp = Math.max(max_b_fp, 1e-14); 39 | max_m_fp = Math.max(max_m_fp, 1e-14); 40 | double a_val = a.b_fp/max_b_fp + a.m_fp/max_m_fp; 41 | double b_val = b.b_fp/max_b_fp + b.m_fp/max_m_fp; 42 | return Double.compare(a_val, b_val); 43 | } 44 | 45 | }, 46 | NEW_COVERAGE 47 | { 48 | @Override 49 | int cmp(SigCandidate a, SigCandidate b, int[] curCoverage, int targetCover, double max_b_fp, double max_m_fp) 50 | { 51 | long a_val = 0; 52 | long b_val = 0; 53 | 54 | a_val = a.coverage.stream().filter(v -> (curCoverage[v] < targetCover)).count(); 55 | b_val = b.coverage.stream().filter(v -> (curCoverage[v] < targetCover)).count(); 56 | 57 | //- b/c we want larger=better 58 | return -Long.compare(a_val, b_val); 59 | } 60 | }, 61 | TOTAL_COVERAGE 62 | { 63 | @Override 64 | int cmp(SigCandidate a, SigCandidate b, int[] curCoverage, int targetCover, double max_b_fp, double max_m_fp) 65 | { 66 | //- b/c we want larger=better 67 | return -Integer.compare(a.coverage.size(), b.coverage.size()); 68 | } 69 | } 70 | ; 71 | abstract int cmp(SigCandidate a, SigCandidate b, int[] curCoverage, int targetCover, double max_b_fp, double max_m_fp); 72 | } 73 | 74 | AlphabetGram signature; 75 | double b_fp; 76 | double m_fp; 77 | Set coverage; 78 | double entropy = -1; 79 | 80 | @Override 81 | public boolean equals(Object obj) 82 | { 83 | return signature.equals(obj); 84 | } 85 | 86 | @Override 87 | public int hashCode() 88 | { 89 | return signature.hashCode(); 90 | } 91 | 92 | public SigCandidate(AlphabetGram signature, double b_fp, double m_fp, Set coverage) 93 | { 94 | this.signature = signature; 95 | this.b_fp = b_fp; 96 | this.m_fp = m_fp; 97 | this.coverage = coverage; 98 | } 99 | 100 | public static Set select(List candidates, int[] coverage, int targetCover, double max_b_fp, double max_m_fp, List sortPriority) 101 | { 102 | Set selected = new HashSet<>(); 103 | 104 | Set remainingOptions = new HashSet<>(); 105 | remainingOptions.addAll(candidates); 106 | 107 | do 108 | { 109 | //First, lets go through and remove anyone that has NO increase in coverage 110 | remainingOptions.removeIf(s-> 111 | { 112 | return s.coverage.stream().filter(i->coverage[i] < targetCover).count() == 0; 113 | }); 114 | if(remainingOptions.isEmpty()) 115 | break; 116 | 117 | SigCandidate best = Collections.min(remainingOptions, (SigCandidate a, SigCandidate b) -> 118 | { 119 | for(Priority p : sortPriority) 120 | { 121 | int cmp = p.cmp(a, b, coverage, targetCover, max_b_fp, max_m_fp); 122 | if(cmp != 0) 123 | return cmp; 124 | } 125 | 126 | return 0; 127 | }); 128 | 129 | remainingOptions.remove(best); 130 | selected.add(best); 131 | 132 | int coverageIncrease = 0; 133 | for(int indx : best.coverage) 134 | if(coverage[indx]++ < targetCover) 135 | coverageIncrease++; 136 | 137 | if(coverageIncrease == 0) 138 | break; 139 | 140 | int c = targetCover; 141 | for(int x : coverage) 142 | c = Math.min(c, x); 143 | if(c >= targetCover)//min coverage is meet, break 144 | break; 145 | } 146 | while(!remainingOptions.isEmpty()); 147 | 148 | return selected; 149 | } 150 | 151 | public double getEntropy() 152 | { 153 | if(entropy >= 0) 154 | return entropy; 155 | else 156 | return (entropy = sigEntropy(this)); 157 | } 158 | 159 | public static double sigEntropy(SigCandidate a) 160 | { 161 | double[] counts = new double[256]; 162 | int wildCards = 0; 163 | for(int i = 0; i < a.signature.size(); i++) 164 | { 165 | int indx = a.signature.getUnsigned(i); 166 | if(indx < counts.length) 167 | counts[indx]++; 168 | else//wild card, lets increment everyone by a partial to smooth it out 169 | wildCards++; 170 | } 171 | double entropy = 0; 172 | for(double count : counts) 173 | { 174 | //add smothing from wild card counts 175 | count += wildCards/counts.length; 176 | double p = count/a.signature.size(); 177 | 178 | if(p > 0) 179 | entropy += -p * Math.log(p)/Math.log(256); 180 | 181 | } 182 | return 8*entropy; 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/SpectralCoClusteringVBMM.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | 9 | import java.util.List; 10 | import jsat.DataSet; 11 | import jsat.SimpleDataSet; 12 | import jsat.classifiers.CategoricalData; 13 | import jsat.classifiers.DataPoint; 14 | import jsat.clustering.VBGMM; 15 | import jsat.clustering.biclustering.Bicluster; 16 | import jsat.clustering.biclustering.SpectralCoClustering; 17 | import jsat.linear.DenseVector; 18 | import jsat.linear.Matrix; 19 | import jsat.linear.SubMatrix; 20 | import jsat.linear.TruncatedSVD; 21 | import jsat.utils.IntList; 22 | 23 | public class SpectralCoClusteringVBMM implements Bicluster 24 | { 25 | public static SpectralCoClustering.InputNormalization DEFAULT = SpectralCoClustering.InputNormalization.BISTOCHASTIZATION; 26 | 27 | public SpectralCoClustering.InputNormalization inputNormalization = SpectralCoClustering.InputNormalization.BISTOCHASTIZATION; 28 | 29 | @Override 30 | public void bicluster(DataSet dataSet, int clusters, boolean parallel, List> row_assignments, List> col_assignments) 31 | { 32 | 33 | //1. Given A, form An = D_1^{−1/2} A D_2^{−1/2} 34 | Matrix A = dataSet.getDataMatrix(); 35 | 36 | DenseVector R = new DenseVector(A.rows()); 37 | DenseVector C = new DenseVector(A.cols()); 38 | 39 | Matrix A_n = inputNormalization.normalize(A, R, C); 40 | 41 | 42 | //2. Compute l = ceil(log2 k) singular vectors of A_n, u2, . . . u_l+1 and v2, . . . v_l+1, and form the matrix Z as in (12) 43 | int l = (int) Math.ceil(Math.log(clusters)/Math.log(2.0)); 44 | 45 | 46 | //A_n has r rows and c columns. We are going to make a new data matrix Z 47 | //Z will have (r+c) rows, and l columns. 48 | SimpleDataSet Z = create_Z_dataset(A_n, l, R, C, inputNormalization);//+1 b/c we are going to skip the first SV 49 | 50 | VBGMM vbgmm = new VBGMM(VBGMM.COV_FIT_TYPE.DIAG); 51 | 52 | int[] joint_designations = vbgmm.cluster(Z, parallel, null); 53 | 54 | createAssignments(Z, row_assignments, col_assignments, clusters, A, joint_designations, vbgmm); 55 | 56 | } 57 | 58 | public void bicluster(DataSet dataSet, boolean parallel, List> row_assignments, List> col_assignments) 59 | { 60 | SpectralCoClustering.InputNormalization inputNormalization ; 61 | inputNormalization = DEFAULT; 62 | 63 | //1. Given A, form An = D_1^{−1/2} A D_2^{−1/2} 64 | Matrix A = dataSet.getDataMatrix(); 65 | 66 | DenseVector R = new DenseVector(A.rows()); 67 | DenseVector C = new DenseVector(A.cols()); 68 | 69 | Matrix A_n = inputNormalization.normalize(A, R, C); 70 | 71 | //2. Compute l = ceil(log2 k) singular vectors of A_n, u2, . . . u_l+1 and v2, . . . v_l+1, and form the matrix Z as in (12) 72 | int k_max = Math.min(A.rows(), A.cols()); 73 | int l = (int) Math.ceil(Math.log(k_max)/Math.log(2.0)); 74 | 75 | 76 | SimpleDataSet Z = create_Z_dataset(A_n, l, R, C, inputNormalization); 77 | 78 | VBGMM vbgmm = new VBGMM(VBGMM.COV_FIT_TYPE.DIAG); 79 | int[] joint_designations = vbgmm.cluster(Z, parallel, null); 80 | int clusters = 0; 81 | for(int i : joint_designations) 82 | clusters = Math.max(clusters, i); 83 | clusters++; 84 | 85 | createAssignments(Z, row_assignments, col_assignments, clusters, A, joint_designations, vbgmm); 86 | } 87 | 88 | private SimpleDataSet create_Z_dataset(Matrix A_n, int l, DenseVector R, DenseVector C, SpectralCoClustering.InputNormalization inputNormalization) 89 | { 90 | //A_n has r rows and c columns. We are going to make a new data matrix Z 91 | //Z will have (r+c) rows, and l columns. 92 | TruncatedSVD svd = new TruncatedSVD(A_n, l+1);//+1 b/c we are going to skip the first SV 93 | Matrix U = svd.getU(); 94 | Matrix V = svd.getV().transpose(); 95 | //In some cases, Drop the first column, which corresponds to the first SV we don't want 96 | int to_skip = 1; 97 | U = new SubMatrix(U, 0, to_skip, U.rows(), l+to_skip); 98 | V = new SubMatrix(V, 0, to_skip, V.rows(), l+to_skip); 99 | /* Orig paper says to do this multiplication for re-scaling. Why not for 100 | * bistochastic? Its very similar! b/c in "Spectral Biclustering of 101 | * Microarray Data: Coclustering Genes and Conditions" where bistochastic 102 | * is introduced, on page 710: "Once D1 and D2 are found, we apply SVD to 103 | * B with no further normalization " 104 | * 105 | */ 106 | if(inputNormalization == SpectralCoClustering.InputNormalization.SCALE) 107 | { 108 | Matrix.diagMult(R, U); 109 | Matrix.diagMult(C, V); 110 | } 111 | 112 | SimpleDataSet Z = new SimpleDataSet(l, new CategoricalData[0]); 113 | for(int i = 0; i < U.rows(); i++) 114 | Z.add(new DataPoint(U.getRow(i))); 115 | for(int i = 0; i < V.rows(); i++) 116 | Z.add(new DataPoint(V.getRow(i))); 117 | return Z; 118 | } 119 | 120 | private void createAssignments(SimpleDataSet Z, List> row_assignments, List> col_assignments, int clusters, Matrix A, int[] joint_designations, VBGMM vbgmm) 121 | { 122 | clusters = vbgmm.mixtureAssignments(Z.getDataPoint(0).getNumericalValues()).length; 123 | //prep label outputs 124 | row_assignments.clear(); 125 | col_assignments.clear(); 126 | for(int c = 0; c < clusters; c++) 127 | { 128 | row_assignments.add(new IntList()); 129 | col_assignments.add(new IntList()); 130 | } 131 | 132 | int n = A.rows(); 133 | double thresh = 1.0/(row_assignments.size()+1); 134 | for(int z = 0; z < Z.size(); z++) 135 | { 136 | double[] assignments = vbgmm.mixtureAssignments(Z.getDataPoint(z).getNumericalValues()); 137 | 138 | int assigned = 0; 139 | for(int k = 0; k < assignments.length; k++) 140 | { 141 | if(assignments[k] < thresh) 142 | continue;//not happening 143 | assigned++; 144 | if(z < A.rows())//maybe add this row 145 | { 146 | row_assignments.get(k).add(z); 147 | } 148 | else//maybe add this column 149 | { 150 | col_assignments.get(k).add(z-A.rows()); 151 | } 152 | } 153 | 154 | } 155 | 156 | //Now we need to prune potential false bi-clusterings that have only features or only rows 157 | for(int j = row_assignments.size()-1; j >= 0; j--) 158 | { 159 | if(row_assignments.get(j).isEmpty() || col_assignments.get(j).isEmpty()) 160 | { 161 | row_assignments.remove(j); 162 | col_assignments.remove(j); 163 | } 164 | } 165 | } 166 | 167 | @Override 168 | public SpectralCoClusteringVBMM clone() 169 | { 170 | return this; 171 | } 172 | 173 | } -------------------------------------------------------------------------------- /src/main/java/edu/lps/acs/ml/autoyara/YaraRuleContainerConjunctive.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package edu.lps.acs.ml.autoyara; 7 | 8 | import com.google.common.base.CharMatcher; 9 | import edu.lps.acs.ml.ngram3.NGramGeneric; 10 | import edu.lps.acs.ml.ngram3.alphabet.AlphabetGram; 11 | import edu.lps.acs.ml.ngram3.alphabet.ByteGrams; 12 | import edu.lps.acs.ml.ngram3.alphabet.ShortGrams; 13 | import java.awt.event.KeyEvent; 14 | import java.io.InputStream; 15 | import java.nio.charset.Charset; 16 | import java.util.ArrayList; 17 | import java.util.Collection; 18 | import java.util.HashMap; 19 | import java.util.HashSet; 20 | import java.util.List; 21 | import java.util.Map; 22 | import java.util.Set; 23 | 24 | /** 25 | * 26 | * @author edraff 27 | */ 28 | public class YaraRuleContainerConjunctive 29 | { 30 | int total_count; 31 | String name; 32 | List extraComments; 33 | List> signature_sets = new ArrayList<>(); 34 | List min_counts = new ArrayList<>(); 35 | 36 | 37 | public YaraRuleContainerConjunctive(int total_count, String name) 38 | { 39 | this.total_count = total_count; 40 | this.name = name; 41 | this.extraComments = new ArrayList<>(); 42 | } 43 | 44 | public void addSignature(int count, Set signature) 45 | { 46 | signature_sets.add(signature); 47 | min_counts.add(count); 48 | } 49 | 50 | /** 51 | * Returns the minimum number of terms in any sub-rule of this larger yara rule. 52 | * For example, (a and b and c and d) or (d and e and f) would return 3. 53 | * @return the minimum number of terms in any sub-rule. 54 | */ 55 | public int minConjunctionSize() 56 | { 57 | return signature_sets.stream().mapToInt(s->s.size()).min().orElse(0); 58 | } 59 | 60 | public void addComment(String comment) 61 | { 62 | this.extraComments.add(comment); 63 | } 64 | 65 | /** 66 | * 67 | * @param input 68 | * @return true if this yara rule would fire as a match on the given input stream 69 | */ 70 | public boolean match(InputStream input) 71 | { 72 | if(signature_sets.isEmpty()) 73 | return false; 74 | NGramGeneric ngram = new NGramGeneric(); 75 | ngram.setAlphabetSize(256); 76 | ngram.setGramSize(signature_sets.get(0).stream().findAny().get().signature.size()); 77 | ngram.setFilterSize((int) 214748383 / 8); 78 | 79 | Map> observed = new HashMap<>(); 80 | for(Set set : signature_sets) 81 | { 82 | for(SigCandidate cand : set) 83 | observed.put(cand.signature, new HashSet<>()); 84 | } 85 | 86 | //get counts for what n-grams were seen in this data 87 | ngram.incrementConuts(input, 0, observed); 88 | 89 | //Do we have a match? 90 | for(int group = 0; group < signature_sets.size(); group++) 91 | { 92 | Set set = signature_sets.get(group); 93 | int matches_found = set.stream() 94 | .mapToInt(sig_component->observed.get(sig_component).size()) 95 | .sum(); 96 | 97 | if (matches_found >= min_counts.get(group)) 98 | return true; 99 | } 100 | 101 | return false; 102 | } 103 | 104 | @Override 105 | public String toString() 106 | { 107 | StringBuilder sb = new StringBuilder(); 108 | sb.append("rule ").append(name).append("\n"); 109 | sb.append("{\n"); 110 | 111 | 112 | for(String comment : extraComments) 113 | { 114 | while(comment.endsWith("\n")) 115 | comment = comment.substring(0, comment.length()-1); 116 | comment = comment.replaceAll("\n", "\n\t//"); 117 | 118 | sb.append("\t//").append(comment).append("\n"); 119 | } 120 | 121 | Map sigToName = new HashMap<>(); 122 | for(Set sig_set : signature_sets) 123 | for(SigCandidate s : sig_set) 124 | if(!sigToName.containsKey(s)) 125 | sigToName.putIfAbsent(s, "$x" + sigToName.size()); 126 | List signatures = new ArrayList<>(sigToName.keySet()); 127 | 128 | sb.append("\tstrings:\n"); 129 | for(int i = 0; i < signatures.size(); i++) 130 | { 131 | //first lets right out a comment for this rule 132 | SigCandidate sig = signatures.get(i); 133 | sb.append("\t\t//Benign FP est: "); 134 | if(sig.b_fp < 0) 135 | sb.append("<").append(-sig.b_fp); 136 | else 137 | sb.append(-sig.b_fp); 138 | sb.append(" Malicious FP est: "); 139 | if(sig.m_fp < 0) 140 | sb.append("<").append(-sig.m_fp); 141 | else 142 | sb.append(-sig.m_fp); 143 | sb.append(" Entropy: ").append(SigCandidate.sigEntropy(sig)) 144 | .append(" Found in ").append(sig.coverage.size()).append(" files") 145 | .append("\n"); 146 | sb.append("\t\t").append(sigToName.get(sig)); 147 | sb.append(" = "); 148 | sigToYaraString(sb, sig); 149 | sb.append("\n"); 150 | } 151 | sb.append("\n"); 152 | sb.append("\t\tcondition:\n"); 153 | for(int i = 0; i < signature_sets.size(); i++) 154 | { 155 | if(i != 0) 156 | sb.append(" or "); 157 | sb.append("(").append(min_counts.get(i)).append(" of ("); 158 | boolean first= true; 159 | for(SigCandidate s : signature_sets.get(i)) 160 | { 161 | if(first) 162 | first = false; 163 | else 164 | sb.append(","); 165 | sb.append(sigToName.get(s)); 166 | } 167 | sb.append(") )"); 168 | 169 | } 170 | sb.append("}"); 171 | return sb.toString(); 172 | } 173 | 174 | /** 175 | * Conver the given signature candidate into a sub-string for Yara to use within a rule. 176 | * 177 | * The default is to create a byte string that would look like:
178 | * {6A 40 68 00 30 00 00 6A 14 8D 91} 179 | * 180 | * 181 | * @param sb the string builder to insert the string into 182 | * @param sig the signature to convert into a string 183 | */ 184 | private void sigToYaraString(StringBuilder sb, SigCandidate sig) 185 | { 186 | AlphabetGram g = sig.signature; 187 | 188 | 189 | byte[] byte_values = new byte[g.size()]; 190 | boolean[] is_wild = new boolean[g.size()]; 191 | boolean any_wild = false; 192 | 193 | 194 | for(int j = 0; j < g.size(); j++) 195 | { 196 | int val; 197 | if(g instanceof ByteGrams) 198 | val = Byte.toUnsignedInt((byte) g.get(j)); 199 | else if(g instanceof ShortGrams) 200 | val = Short.toUnsignedInt((short) g.get(j)); 201 | else 202 | val = 1000;///WHAT? 203 | if (val > 255)//WILD CARD 204 | { 205 | any_wild = is_wild[j] = true; 206 | } 207 | else 208 | { 209 | byte_values[j] = (byte) val; 210 | } 211 | } 212 | 213 | if(!any_wild && addAsASCII(sb, byte_values)) 214 | { 215 | //work done in addAsASCII call, so no need for inner loop conent 216 | } 217 | else if(!any_wild && addAsASCII_wide(sb, byte_values)) 218 | { 219 | //work done in addAsASCII_wide call, so no need for inner loop conent 220 | } 221 | else//just write out byte values 222 | { 223 | sb.append("{ "); 224 | for(int j = 0; j < byte_values.length; j++) 225 | { 226 | if(is_wild[j]) 227 | sb.append("??"); 228 | else 229 | sb.append(String.format("%02X", Byte.toUnsignedInt(byte_values[j]))); 230 | sb.append(" ");//each hex byte needs to have a space after! 231 | } 232 | sb.append("} "); 233 | asStringComment(sb, byte_values); 234 | } 235 | 236 | 237 | 238 | } 239 | 240 | /** 241 | * If the byte values are actually an ASCII printable string, lets use that 242 | * instead of the raw byte values. 243 | * @param sb the string builder to insert the string into 244 | * @param sig the signature to convert into a string 245 | * @return true if the string was added, false if it does not appear to be 246 | * an ascii string 247 | */ 248 | private boolean addAsASCII(StringBuilder sb, byte[] byte_values) 249 | { 250 | try 251 | { 252 | String s = new String(byte_values, Charset.forName("US-ASCII")); 253 | if(s.length() != byte_values.length) 254 | return false; 255 | if(!CharMatcher.ascii().matchesAllOf(s)) 256 | return false; 257 | for(char c :s.toCharArray()) 258 | if(!isPrintableChar(c)) 259 | return false; 260 | //Not sure how to format these two cases in yara, so skip 261 | if(s.contains("\"") || s.contains("\\") || s.contains("\r")) 262 | return false; 263 | //escape ones that I do understand how to put in yara 264 | s = s.replace("\t", "\\t"); 265 | s = s.replace("\n", "\\n"); 266 | sb.append("\"").append(s).append("\" ascii"); 267 | return true; 268 | } 269 | catch(Exception e) 270 | { 271 | return false; 272 | } 273 | } 274 | 275 | /** 276 | * A ASCII string may be encoded in Unicode format, which this will try and 277 | * detect. Yara dosn't actually support Unicode, so we only want to do this 278 | * if it is an ascii string in uicode encoding. 279 | * 280 | * @param sb the string builder to insert the string into 281 | * @param sig the signature to convert into a string 282 | * @return true if the string was added, false if it does not appear to be 283 | * an ascii string in unicode 284 | */ 285 | private boolean addAsASCII_wide(StringBuilder sb, byte[] byte_values) 286 | { 287 | try 288 | { 289 | String s = new String(byte_values, Charset.forName("UTF-8")); 290 | if(s.length() != byte_values.length/2) 291 | return false; 292 | if(!CharMatcher.ascii().matchesAllOf(s)) 293 | return false; 294 | for(char c :s.toCharArray()) 295 | if(!isPrintableChar(c)) 296 | return false; 297 | //Not sure how to format these two cases in yara, so skip 298 | if(s.contains("\"") || s.contains("\\") || s.contains("\r")) 299 | return false; 300 | //escape ones that I do understand how to put in yara 301 | s = s.replace("\t", "\\t"); 302 | s = s.replace("\n", "\\n"); 303 | 304 | sb.append("\"").append(s).append("\" wide"); 305 | return true; 306 | } 307 | catch(Exception e) 308 | { 309 | return false; 310 | } 311 | } 312 | 313 | /** 314 | * Add string 315 | * @param sb 316 | * @param byte_values 317 | * @return 318 | */ 319 | private boolean asStringComment(StringBuilder sb, byte[] byte_values) 320 | { 321 | try 322 | { 323 | String s_ascii = new String(byte_values, Charset.forName("US-ASCII")); 324 | String s_uni = new String(byte_values, Charset.forName("UTF-8")); 325 | 326 | double uni_printable = 0; 327 | for(char c :s_uni.toCharArray()) 328 | if(isPrintableChar(c)) 329 | uni_printable++; 330 | uni_printable /= s_uni.length(); 331 | 332 | double ascii_printable = 0; 333 | for(char c :s_ascii.toCharArray()) 334 | if(isPrintableChar(c)) 335 | ascii_printable++; 336 | ascii_printable /= s_ascii.length(); 337 | 338 | if(Math.max(ascii_printable, uni_printable) < 0.5) 339 | return false;//Not that much printable content, skip it 340 | 341 | String s; 342 | if(ascii_printable > uni_printable) 343 | s = s_ascii; 344 | else 345 | s = s_uni; 346 | 347 | //escape ones that I do understand how to put in yara 348 | s = s.replace("\t", "\\t"); 349 | s = s.replace("\n", "\\n"); 350 | s = s.replace("\r", "\\r"); 351 | 352 | // strips off all non-ASCII characters 353 | s = s.replaceAll("[^\\x00-\\x7F]", ""); 354 | 355 | // erases all the ASCII control characters 356 | s = s.replaceAll("[\\p{Cntrl}&&[^\r\n\t]]", ""); 357 | 358 | // removes non-printable characters from Unicode 359 | s = s.replaceAll("\\p{C}", ""); 360 | 361 | 362 | sb.append("//This might be a string? Looks like:").append(s); 363 | return true; 364 | } 365 | catch(Exception e) 366 | { 367 | return false; 368 | } 369 | } 370 | 371 | public boolean isPrintableChar(char c) 372 | { 373 | Character.UnicodeBlock block = Character.UnicodeBlock.of(c); 374 | return (!Character.isISOControl(c)) 375 | && c != KeyEvent.CHAR_UNDEFINED 376 | && block != null 377 | && block != Character.UnicodeBlock.SPECIALS; 378 | } 379 | 380 | 381 | } 382 | --------------------------------------------------------------------------------