├── doc ├── latex-post.inc ├── malheur.pdf ├── 2011-jcs.pdf ├── modules ├── latex-pre.inc.in ├── Makefile.am └── example.cfg ├── data └── README.md ├── pedantic ├── mrproper ├── bootstrap ├── src ├── murmur.h ├── proto.h ├── Makefile.am ├── mist.h ├── export.h ├── md5.h ├── class.h ├── mconfig.h ├── ftable.h ├── fmath.h ├── cluster.h ├── malheur.h ├── quality.h ├── common.h ├── fvec.h ├── farray.h ├── util.h ├── murmur.c ├── mist.c ├── class.c ├── proto.c ├── quality.c ├── ftable.c ├── mconfig.c ├── export.c ├── cluster.c ├── util.c ├── fmath.c └── md5.c ├── TODO ├── .gitignore ├── Makefile.am ├── tests ├── tests.h ├── Makefile.am ├── tests.c ├── test_quality.c ├── test_proto.c ├── test_class.c ├── test_ftable.c ├── test_farray.c ├── test_fmath.c ├── test_fvec.c └── test_cluster.c ├── configure.ac ├── git2changes.py ├── m4 ├── openmp.m4 └── pkg.m4 ├── README.md └── INSTALL /doc/latex-post.inc: -------------------------------------------------------------------------------- 1 | \end{document} 2 | -------------------------------------------------------------------------------- /doc/malheur.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/product/malheur/master/doc/malheur.pdf -------------------------------------------------------------------------------- /doc/2011-jcs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/product/malheur/master/doc/2011-jcs.pdf -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Malheur Dataset 2 | 3 | The dataset is available here: 4 | 5 | https://www.sec.cs.tu-bs.de/data/malheur 6 | 7 | -------------------------------------------------------------------------------- /pedantic: -------------------------------------------------------------------------------- 1 | 
#!/bin/sh 2 | # Shell script to see all compiler warnings 3 | 4 | make clean > /dev/null 5 | make -j4 all > /dev/null 6 | -------------------------------------------------------------------------------- /doc/modules: -------------------------------------------------------------------------------- 1 | 2 | /** @defgroup fvec */ 3 | /** @defgroup farray */ 4 | /** @defgroup fmath */ 5 | /** @defgroup proto */ 6 | /** @defgroup cluster */ 7 | /** @defgroup class */ 8 | /** @defgroup quality */ 9 | /** @defgroup export */ 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /mrproper: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Do the Mr. Proper! 4 | rm -f configure depcomp install-sh missing ltmain.sh 5 | rm -f aclocal.m4 m4/lib* m4/lt* 6 | rm -f config.* 7 | rm -rf a.out.dSYM autom4te.cache 8 | 9 | find . -name Makefile.in -delete 10 | 11 | echo "All clean now." -------------------------------------------------------------------------------- /bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # OpenBSD setup 4 | export AUTOCONF_VERSION=2.69 AUTOMAKE_VERSION=1.13 5 | 6 | # Remove auto-generated m4 files 7 | rm -f aclocal.m4 8 | rm -f m4/lib* m4/lt* 9 | 10 | # Setup autotools elegantly 11 | autoreconf --install --force --symlink 12 | 13 | -------------------------------------------------------------------------------- /doc/latex-pre.inc.in: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,a4paper]{article} 2 | \usepackage[width=13cm]{geometry} 3 | \usepackage{palatino} 4 | 5 | \setlength{\parskip}{6pt} 6 | \setlength{\parindent}{0pt} 7 | 8 | \begin{document} 9 | 10 | \title{\textbf{Malheur Version __VERSION__}\\ 11 | --- User Manual ---} 12 | \author{Konrad Rieck} 13 | \maketitle 14 | 15 | \tableofcontents 16 | \pagebreak 17 | 
-------------------------------------------------------------------------------- /src/murmur.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MurmurHash2, 64-bit versions, by Austin Appleby 3 | * -- 4 | * The same caveats as 32-bit MurmurHash2 apply here - beware of alignment 5 | * and endian-ness issues if used across multiple platforms. 6 | */ 7 | 8 | #ifndef MURMUR_H 9 | #define MURMUR_H 10 | 11 | uint32_t MurmurHash2(const void *key, int32_t len, uint32_t seed); 12 | uint64_t MurmurHash64B(const void *key, int32_t len, uint32_t seed); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | 2 | MALHEUR - Automatic Analysis of Malware Behavior 3 | Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | University of Goettingen, Berlin Institute of Technology 5 | -- 6 | 7 | TODO (major stuff) 8 | * Functionality for aging of prototypes (and auto-removal) 9 | * Functionality to split or merge existing clusters 10 | 11 | TODO (minor stuff) 12 | * Debian/Ubuntu package generation? 
13 | * Fix for strange gzclose issue (valgrind) 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.mex* 3 | *.o 4 | .deps 5 | /a.out.dSYM 6 | /aclocal.m4 7 | /autom4te.cache 8 | /build 9 | /compile 10 | /config.* 11 | /configure 12 | /depcomp 13 | /doc/latex-pre.inc 14 | /doc/malheur.pod 15 | /doc/malheur.txt 16 | /doc/test.cfg 17 | /install-sh 18 | /libtool 19 | /ltmain.sh 20 | /m4 21 | /malheur-*.tar.gz 22 | /missing 23 | /src/malheur 24 | /stamp-h1 25 | /tests/test_* 26 | CHANGES 27 | Makefile 28 | Makefile.in 29 | src/malheur.h 30 | test-driver 31 | tests/*.log 32 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # MALHEUR - Automatic Analysis of Malware Behavior 2 | # Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 3 | # University of Goettingen, Berlin Institute of Technology 4 | # -- 5 | 6 | AUTOMAKE_OPTIONS = foreign 7 | SUBDIRS = src doc tests . 
8 | ACLOCAL_AMFLAGS = -I m4 9 | EXTRA_DIST = README.md INSTALL COPYING CHANGES 10 | 11 | clean-local: 12 | rm -f *~ 13 | rm -f CHANGES 14 | 15 | CHANGES: 16 | $(srcdir)/git2changes.py 17 | 18 | install-data-local: 19 | $(mkinstalldirs) $(DESTDIR)$(localstatedir)/malheur 20 | -------------------------------------------------------------------------------- /src/proto.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #ifndef PROTO_H 15 | #define PROTO_H 16 | 17 | #include "farray.h" 18 | #include "class.h" 19 | 20 | /* Function declarations */ 21 | farray_t *proto_extract(farray_t *, assign_t **); 22 | assign_t *proto_assign(farray_t *, farray_t *); 23 | 24 | #endif /* PROTO_H */ 25 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | # MALHEUR - Automatic Analysis of Malware Behavior 2 | # Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 3 | # University of Goettingen, Berlin Institute of Technology 4 | 5 | noinst_LIBRARIES = libmalheur.a 6 | libmalheur_a_SOURCES = md5.c md5.h common.h util.c util.h fvec.c \ 7 | fvec.h ftable.c ftable.h mist.c \ 8 | mist.h fmath.c fmath.h export.c export.h \ 9 | farray.c farray.h proto.c proto.h mconfig.c \ 10 | mconfig.h cluster.c cluster.h quality.c \ 11 | quality.h class.c class.h uthash.h \ 12 | murmur.c murmur.h 13 | 14 | bin_PROGRAMS = malheur 15 | malheur_SOURCES = $(libmalheur_a_SOURCES) malheur.c malheur.h 16 | malheur_LDFLAGS = -static 17 | 18 | beautify: $(libmalheur_a_SOURCES) $(malheur_SOURCES) 19 | gindent -i4 -kr -l77 -lc77 --no-tabs -cs \ 20 | -T FILE -T fvec_t -T farray_t -T cluster_t \ 21 | -T assign_t -T config_t -T config_setting_T \ 22 | -T DIR -T DIRP $^ 23 | -------------------------------------------------------------------------------- /src/mist.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * 
option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef MIST_H 15 | #define MIST_H 16 | 17 | /* 18 | * Definitions for MIST parsing 19 | */ 20 | #define MIST_LEVEL '|' /* Delimiter for MIST levels */ 21 | #define MIST_INSTR '\n' /* Delimiter for MIST instructions */ 22 | #define MIST_COMMENT '#' /* Prefix for comments */ 23 | #define MIST_THREAD "thread" /* Comment string indicating new thread */ 24 | 25 | /* Functions */ 26 | char *mist_preproc(char *report); 27 | char *mist_trunc_report(char *, int); 28 | char *mist_trunc_thread(char *, int); 29 | char *mist_trunc_level(char *, int); 30 | char *mist_trim(char *); 31 | 32 | #endif /* MIST_H */ 33 | -------------------------------------------------------------------------------- /tests/tests.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #ifndef TESTS_H 15 | #define TESTS_H 16 | 17 | #include "config.h" 18 | #include "common.h" 19 | #include "util.h" 20 | 21 | /* Width of text line */ 22 | #define LINE_WIDTH 60 23 | 24 | /* Macros for faking a configuration */ 25 | #define config_set_string(c,x,s) \ 26 | config_setting_set_string(config_lookup(c,x),s) 27 | #define config_set_int(c,x,s) \ 28 | config_setting_set_int(config_lookup(c,x),s) 29 | #define config_set_float(c,x,s) \ 30 | config_setting_set_float(config_lookup(c,x),s) 31 | 32 | /* Functions */ 33 | void test_printf(char *fmt, ...); 34 | void test_return(int, int); 35 | void test_error(char *fmt, ...); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/export.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #ifndef EXPORT_H 15 | #define EXPORT_H 16 | 17 | #include "farray.h" 18 | #include "proto.h" 19 | #include "cluster.h" 20 | 21 | /* I/O functions */ 22 | void export_proto(farray_t *, farray_t *, assign_t *, const char *); 23 | void export_cluster(cluster_t *, farray_t *, farray_t *, assign_t *, 24 | const char *); 25 | void export_shared_ngrams(cluster_t *, farray_t *, const char *); 26 | void export_dist(double *, farray_t *, const char *); 27 | void export_class(farray_t *, farray_t *, assign_t *, const char *); 28 | void export_increment1(farray_t *, farray_t *, assign_t *, const char *); 29 | void export_increment2(cluster_t *, farray_t *, farray_t *, assign_t *, 30 | const char *); 31 | 32 | #endif /* EXPORT_H */ 33 | -------------------------------------------------------------------------------- /doc/Makefile.am: -------------------------------------------------------------------------------- 1 | # MALHEUR - Automatic Analysis of Malware Behavior 2 | # Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 3 | # University of Goettingen, Berlin Institute of Technology 4 | # 5 | 6 | EXTRA_DIST = example.cfg malheur.pod malheur.man malheur.txt malheur.pdf \ 7 | doxygen.cfg modules latex-pre.inc.in latex-post.inc 8 | 9 | doc_DATA = example.cfg 10 | man1_MANS = malheur.man 11 | 12 | malheur.man: malheur.pod 13 | pod2man -c "User Manual" -s 1 -r "Malheur $(VERSION)" malheur.pod > $@ 14 | 15 | malheur.txt: malheur.pod 16 | pod2text -w 70 malheur.pod > $@ 17 | 18 | malheur.pdf: malheur.pod latex-pre.inc latex-post.inc 19 | pod2latex -prefile $(top_builddir)/doc/latex-pre.inc \ 20 | -postfile $(top_srcdir)/doc/latex-post.inc \ 21 | -full -out $(top_builddir)/doc/malheur.tex \ 22 | $(top_srcdir)/doc/malheur.pod 23 | pdflatex $(top_builddir)/doc/malheur.tex 24 | pdflatex $(top_builddir)/doc/malheur.tex 25 | pdflatex $(top_builddir)/doc/malheur.tex 26 | rm -f *.tex *.log *.aux *.toc 27 | 28 | latex-pre.inc: latex-pre.inc.in 29 | sed -e 
's|__VERSION__|$(VERSION)|' \ 30 | $(top_srcdir)/doc/latex-pre.inc.in \ 31 | > $(top_builddir)/doc/latex-pre.inc 32 | 33 | distclean-local: 34 | rm -f malheur.pdf malheur.man malheur.txt latex-pre.inc 35 | 36 | dist-hook: distclean-local malheur.pdf malheur.man malheur.txt 37 | -------------------------------------------------------------------------------- /src/md5.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This code implements the MD5 message-digest algorithm. 3 | * The algorithm is due to Ron Rivest. This code was 4 | * written by Colin Plumb in 1993, no copyright is claimed. 5 | * This code is in the public domain; do with it what you wish. 6 | * 7 | * Equivalent code is available from RSA Data Security, Inc. 8 | * This code has been tested against that, and is equivalent, 9 | * except that you don't need to include two pages of legalese 10 | * with every copy. 11 | * 12 | * To compute the message digest of a chunk of bytes, declare an 13 | * MD5Context structure, pass it to MD5Init, call MD5Update as 14 | * needed on buffers full of bytes, and then call MD5Final, which 15 | * will fill a supplied 16-byte array with the digest. 
16 | */ 17 | 18 | #ifndef MD5_H 19 | #define MD5_H 20 | 21 | #include 22 | 23 | #define MD5_DIGEST_LENGTH 16 24 | #define MD5_SEED_NONE -1 25 | 26 | struct MD5Context { 27 | uint32_t buf[4]; 28 | uint32_t bits[2]; 29 | unsigned char in[64]; 30 | }; 31 | 32 | /* Functions */ 33 | void MD5(unsigned char *x, unsigned l, unsigned char *buf); 34 | void MD5Init(struct MD5Context *context); 35 | void MD5Update(struct MD5Context *context, unsigned char const *buf, 36 | unsigned len); 37 | void MD5Final(unsigned char digest[16], struct MD5Context *context); 38 | void MD5Transform(uint32_t buf[4], uint32_t const in[16]); 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/class.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef CLASSIFY_H 15 | #define CLASSIFY_H 16 | 17 | #include "farray.h" 18 | 19 | /** 20 | * Assignment structure. Assignments are used to either associate 21 | * feature vectors to prototypes or to assign labels to feature 22 | * vectors. 
23 | */ 24 | typedef struct { 25 | unsigned int *label; /**< Predicted labels */ 26 | unsigned int *proto; /**< Nearest prototypes */ 27 | double *dist; /**< Distance to prototypes */ 28 | unsigned long len; /**< Length of assign arrays */ 29 | } assign_t; 30 | 31 | /* Functions */ 32 | assign_t *class_assign(farray_t *, farray_t *); 33 | farray_t *class_get_rejected(assign_t *, farray_t *f); 34 | assign_t *assign_create(farray_t *); 35 | void assign_destroy(assign_t *); 36 | 37 | #endif /* CLASSIFY_H */ 38 | -------------------------------------------------------------------------------- /src/mconfig.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef MCONFIG_H 15 | #define MCONFIG_H 16 | 17 | #include "config.h" 18 | 19 | #ifdef HAVE_LIBCONFIG_H 20 | #include 21 | #endif 22 | 23 | /** 24 | * Default configuration. This structure is used to define a default 25 | * configuration. The type can be determined by first testing for a 26 | * string and then for a float argument. 
27 | */ 28 | typedef struct { 29 | char *group; /**< Configuration group */ 30 | char *name; /**< Configuration name */ 31 | int type; /**< Type of configuration */ 32 | 33 | union { 34 | long num; /**< Integer value */ 35 | double flt; /**< Float value */ 36 | char *str; /**< String */ 37 | } val; 38 | } config_default_t; 39 | 40 | /* Functions */ 41 | void config_print(config_t *); 42 | int config_check(config_t *); 43 | void config_fprint(FILE *, config_t *); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/ftable.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef FTABLE_H 15 | #define FTABLE_H 16 | 17 | #include "zlib.h" 18 | #include "fvec.h" 19 | 20 | #ifdef HAVE_UTHASH_UTHASH_H 21 | #include 22 | #else 23 | #ifdef HAVE_UTHASH_H 24 | #include 25 | #else 26 | #include "uthash.h" 27 | #endif 28 | #endif 29 | 30 | /** 31 | * Entry of lookup table. 
32 | */ 33 | typedef struct { 34 | feat_t key; /**< Feature key */ 35 | char *data; /**< Feature data */ 36 | int len; /**< Length of data */ 37 | UT_hash_handle hh; /**< Uthash handle */ 38 | } fentry_t; 39 | 40 | void ftable_put(feat_t, char *, int); 41 | fentry_t *ftable_get(feat_t); 42 | void ftable_init(); 43 | void ftable_destroy(); 44 | unsigned long ftable_size(); 45 | void ftable_print(); 46 | void ftable_remove(feat_t); 47 | void ftable_save(gzFile); 48 | void ftable_load(gzFile); 49 | int ftable_enabled(); 50 | 51 | #endif /* FTABLE_H */ 52 | -------------------------------------------------------------------------------- /src/fmath.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #ifndef FMATH_H 15 | #define FMATH_H 16 | 17 | #include "fvec.h" 18 | #include "farray.h" 19 | 20 | /** Normalization types for feature vectors */ 21 | typedef enum { 22 | NORM_L1, NORM_L2 23 | } norm_t; 24 | 25 | /* Functions */ 26 | void fvec_mul(fvec_t *, double); 27 | void fvec_div(fvec_t *, double); 28 | fvec_t *fvec_adds(fvec_t *, fvec_t *, double); 29 | fvec_t *fvec_add(fvec_t *, fvec_t *); 30 | fvec_t *fvec_sub(fvec_t *, fvec_t *); 31 | double fvec_dist(fvec_t *fa, fvec_t *fb); 32 | fvec_t *farray_sums(farray_t *, double *); 33 | fvec_t *farray_sum(farray_t *); 34 | fvec_t *farray_mean(farray_t *); 35 | double fvec_dot(fvec_t *, fvec_t *); 36 | double fvec_norm1(fvec_t *); 37 | double fvec_norm2(fvec_t *); 38 | void fvec_normalize(fvec_t *, norm_t); 39 | void fvec_sparsify(fvec_t *); 40 | void farray_dist(farray_t *fa, farray_t *fb, double *d); 41 | void farray_dist_tria(farray_t *fa, double *d); 42 | void farray_normalize(farray_t *f, norm_t n); 43 | void fvec_bin(fvec_t *f); 44 | 45 | #endif /* FMATH_H */ 46 | -------------------------------------------------------------------------------- /src/cluster.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef CLUSTER_H 15 | #define CLUSTER_H 16 | 17 | #include "farray.h" 18 | #include "proto.h" 19 | 20 | /** 21 | * Clustering structure. 
The structure holds a clustering in form of 22 | * indices to clusters. Moreover, the number and the run of the clustering 23 | * are saved for incremental analysis. 24 | */ 25 | typedef struct { 26 | unsigned int *cluster; /**< Assignments of clustering */ 27 | unsigned long len; /**< Length of assignments */ 28 | unsigned long num; /**< Number of clusters */ 29 | unsigned int run; /**< Run number of clustering */ 30 | } cluster_t; 31 | 32 | /* Functions */ 33 | cluster_t *cluster_linkage(farray_t *, int); 34 | void cluster_destroy(cluster_t *); 35 | void cluster_extrapolate(cluster_t *c, assign_t *a); 36 | void cluster_trim(cluster_t *c); 37 | 38 | farray_t *cluster_get_prototypes(cluster_t *, assign_t *, farray_t *); 39 | farray_t *cluster_get_rejected(cluster_t *, farray_t *); 40 | char *cluster_get_name(cluster_t *c, int i); 41 | 42 | #endif /* CLUSTER_H */ 43 | -------------------------------------------------------------------------------- /src/malheur.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #ifndef MALHEUR_H 15 | #define MALHEUR_H 16 | 17 | /* Operation actions of Malheur */ 18 | typedef enum { 19 | DISTANCE, PROTOTYPE, CLUSTER, CLASSIFY, INCREMENT, PROTODIST, INFO 20 | } malheur_action_t; 21 | 22 | /* Output file */ 23 | #define OUTPUT_FILE "malheur.out" 24 | 25 | /* Local malheur files */ 26 | #define REJECT_FILE "rejected.zfa" 27 | #define PROTO_FILE "prototypes.zfa" 28 | #define STATE_FILE "malheur.state" 29 | 30 | typedef struct { 31 | unsigned int run; /* Current run */ 32 | unsigned int num_proto; /* Number of prototype reports */ 33 | unsigned int num_reject; /* Number of rejected reports */ 34 | } malheur_state_t; 35 | 36 | /* Libconfig macros */ 37 | #define config_set_string(c,x,s) \ 38 | config_setting_set_string(config_lookup(c,x),s) 39 | #define config_set_int(c,x,s) \ 40 | config_setting_set_int(config_lookup(c,x),s) 41 | #define config_set_float(c,x,s) \ 42 | config_setting_set_float(config_lookup(c,x),s) 43 | 44 | #endif /* MALHEUR_H */ 45 | 46 | -------------------------------------------------------------------------------- /doc/example.cfg: -------------------------------------------------------------------------------- 1 | # MALHEUR - Automatic Analysis of Malware Behavior 2 | # Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 3 | # University of Goettingen, Berlin Institute of Technology 4 | 5 | # Generic configuration 6 | generic = { 7 | # Format of input data ("text", "mist") 8 | input_format = "text"; 9 | # Delimiters for monitored events or "" for bytes 10 | event_delim = "%0a%0d"; 11 | # Directory to store internal state 12 | state_dir = "/var/tmp/malheur"; 13 | # Output file 14 | output_file = "malheur.out"; 15 | }; 16 | 17 | # Feature configuration 18 | features = { 19 | # Length of n-grams for malware instructions (1 to n) 20 | ngram_len = 2; 21 | # Embedding of n-grams in feature space ("cnt", "bin") 22 | vect_embed = "bin"; 23 | # Level of MIST (see input_format) 24 | mist_level = 2; 
25 | # Seed for MD5 hash. Change to random value for security. 26 | hash_seed1 = 0x1ea4501a; 27 | hash_seed2 = 0x75f3da43; 28 | }; 29 | 30 | # Prototype configuration 31 | prototypes = { 32 | # Maximum distance to prototypes (0 to 1.41) 33 | max_dist = 0.65; 34 | # Maximum number of prototypes (to disable 0) 35 | max_num = 0; 36 | }; 37 | 38 | # Classification configuration 39 | classify = { 40 | # Maximum distance to prototypes (0 to 1.41) 41 | max_dist = 0.68; 42 | }; 43 | 44 | # Clustering configuration 45 | cluster = { 46 | # Mode of linkage clustering ("single", "average", "complete"); 47 | link_mode = "complete"; 48 | # Minimum distance between clusters (0 to 1.41) 49 | min_dist = 0.95; 50 | # Rejection threshold for small clusters (0 to n) 51 | reject_num = 10; 52 | # Output shared n-grams above given ratio (to disable 0.0) 53 | shared_ngrams = 0.0; 54 | }; 55 | -------------------------------------------------------------------------------- /src/quality.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #ifndef QUALITY_H 15 | #define QUALITY_H 16 | 17 | #include "util.h" 18 | 19 | #ifdef HAVE_UTHASH_UTHASH_H 20 | #include 21 | #else 22 | #ifdef HAVE_UTHASH_H 23 | #include 24 | #else 25 | #include "uthash.h" 26 | #endif 27 | #endif 28 | 29 | /* Definitions of quality measures */ 30 | #define Q_PRECISION 0 /* Precision */ 31 | #define Q_RECALL 1 /* Recall */ 32 | #define Q_FMEASURE 2 /* F-measure */ 33 | #define Q_RAND 3 /* Rand index */ 34 | #define Q_ARAND 4 /* Adjusted rand */ 35 | 36 | /** 37 | * Histogram bin for labels. The histogram is used to compute 38 | * performance measures such as precision and recall over a set 39 | * of predicted labels. 40 | */ 41 | typedef struct { 42 | unsigned int label; /**< True label */ 43 | double total; /**< Number of elements with labels */ 44 | count_t *count; /**< Predicted labels */ 45 | UT_hash_handle hh; /**< Hash table entry */ 46 | } hist_t; 47 | 48 | /* Evaluation functions */ 49 | hist_t *hist_create(unsigned int *, unsigned int *, int); 50 | void hist_print(hist_t *); 51 | void hist_destroy(hist_t *); 52 | double *quality(unsigned int *, unsigned int *, int); 53 | 54 | #endif /* QUALITY_H */ 55 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
# MALHEUR - Automatic Analysis of Malware Behavior
# Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org)
# University of Goettingen, Berlin Institute of Technology
# --

AM_CPPFLAGS = @AM_CPPFLAGS@ -I$(top_srcdir)/src

# Every test program is both built and run by "make check"; keep a single
# list so that the two variables cannot drift apart.
TESTS = test_ftable test_fvec test_farray test_fmath \
        test_quality test_proto test_cluster \
        test_class
noinst_PROGRAMS = $(TESTS)

test_ftable_SOURCES = test_ftable.c tests.c tests.h
test_ftable_LDADD = $(top_builddir)/src/libmalheur.a

test_fmath_SOURCES = test_fmath.c tests.c tests.h
test_fmath_LDADD = $(top_builddir)/src/libmalheur.a

test_fvec_SOURCES = test_fvec.c tests.c tests.h
test_fvec_LDADD = $(top_builddir)/src/libmalheur.a

test_farray_SOURCES = test_farray.c tests.c tests.h
test_farray_LDADD = $(top_builddir)/src/libmalheur.a

test_quality_SOURCES = test_quality.c tests.c tests.h
test_quality_LDADD = $(top_builddir)/src/libmalheur.a

test_proto_SOURCES = test_proto.c tests.c tests.h
test_proto_LDADD = $(top_builddir)/src/libmalheur.a

test_cluster_SOURCES = test_cluster.c tests.c tests.h
test_cluster_LDADD = $(top_builddir)/src/libmalheur.a

test_class_SOURCES = test_class.c tests.c tests.h
test_class_LDADD = $(top_builddir)/src/libmalheur.a

# Re-indent all test sources. Each -T option names a typedef so that indent
# formats declarations using it correctly (libconfig's type is
# config_setting_t, lower-case "t").
beautify:
	gnuindent -i4 -kr -l77 -lc77 --no-tabs -cs \
	    -T FILE -T fvec_t -T farray_t -T cluster_t \
	    -T assign_t -T config_t -T config_setting_t \
	    -T DIR -T DIRP *.c *.h
11 | * -- 12 | */ 13 | 14 | #ifndef FVEC_H 15 | #define FVEC_H 16 | 17 | #include 18 | #include 19 | 20 | /** Data type for a feature */ 21 | typedef uint64_t feat_t; 22 | 23 | /** Placeholder for non-initialized delimiters */ 24 | #define DELIM_NOT_INIT 42 25 | 26 | /** 27 | * Sparse feature vector. The vector is stored as a sorted list 28 | * of non-zero dimensions containing real numbers. The dimensions 29 | * are specified as regular indices or alternatively as 64bit 30 | * hash values. 31 | */ 32 | typedef struct { 33 | feat_t *dim; /**< List of dimensions */ 34 | float *val; /**< List of values */ 35 | unsigned long len; /**< Length of list */ 36 | unsigned long total; /**< Total features in sequence */ 37 | unsigned long mem; /**< Allocated memory in bytes */ 38 | char *src; /**< Source of features, e.g. file */ 39 | } fvec_t; 40 | 41 | /* Functions */ 42 | char *fvec_preproc(char *); 43 | fvec_t *fvec_zero(); 44 | fvec_t *fvec_extract(char *, int l, char *); 45 | void fvec_destroy(fvec_t *); 46 | fvec_t *fvec_clone(fvec_t *); 47 | void fvec_print(fvec_t *); 48 | void fvec_save(fvec_t *, gzFile); 49 | void fvec_save_libsvm(fvec_t *, gzFile, int); 50 | fvec_t *fvec_load(gzFile); 51 | void fvec_reset_delim(); 52 | void fvec_realloc(fvec_t *); 53 | 54 | #endif /* FVEC_H */ 55 | -------------------------------------------------------------------------------- /tests/tests.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. 
See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #include "tests.h" 15 | 16 | /** Global time stamp */ 17 | static double start_time; 18 | 19 | /** 20 | * Print a message for a starting test 21 | * @param fmt Format string. 22 | */ 23 | void test_printf(char *fmt, ...) 24 | { 25 | va_list ap; 26 | char s[256] = { " " }; 27 | int i; 28 | 29 | /* Assemble format string */ 30 | va_start(ap, fmt); 31 | vsnprintf(s, 256, fmt, ap); 32 | va_end(ap); 33 | 34 | /* Print line */ 35 | printf("%s ", s); 36 | for (i = strlen(s) + 1; i < LINE_WIDTH; i++) 37 | printf("."); 38 | printf(" "); 39 | 40 | /* Save start time */ 41 | start_time = time_stamp(); 42 | 43 | fflush(stdout); 44 | } 45 | 46 | /** 47 | * Print a message for a failed test 48 | * @param fmt Format string. 49 | */ 50 | void test_error(char *fmt, ...) 51 | { 52 | va_list ap; 53 | char s[256] = { " " }; 54 | int i; 55 | 56 | /* Assemble format string */ 57 | va_start(ap, fmt); 58 | vsnprintf(s, 256, fmt, ap); 59 | va_end(ap); 60 | 61 | /* Print line */ 62 | printf("\nError: %s", s); 63 | for (i = strlen(s) + 6; i < LINE_WIDTH; i++) 64 | printf(" "); 65 | 66 | fflush(stdout); 67 | } 68 | 69 | 70 | /** 71 | * Print return message for a test 72 | * @param e number of failed tests 73 | * @param m number of all tests 74 | */ 75 | void test_return(int e, int m) 76 | { 77 | double time = time_stamp() - start_time; 78 | printf("%s [%3.0f%%] %.2fs\n", e == 0 ? 
" OK" : "FAIL", 79 | 100.0 * (m - e) / m, time); 80 | } 81 | -------------------------------------------------------------------------------- /tests/test_quality.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | */ 12 | 13 | #include "tests.h" 14 | #include "quality.h" 15 | 16 | /* Global variables */ 17 | int verbose = 0; 18 | config_t cfg; 19 | 20 | /* Test structure */ 21 | typedef struct { 22 | unsigned int y[4]; /* True labels */ 23 | unsigned int a[4]; /* Assigned labels */ 24 | double e[5]; /* Quality measures */ 25 | } test_t; 26 | 27 | /* Quality test cases */ 28 | test_t tests[] = { 29 | {{0, 0, 1, 1}, {1, 1, 2, 2}, {1.0000, 1.0000, 1.0000, 1.0000, 1.0000}}, 30 | {{0, 0, 0, 0}, {1, 1, 3, 3}, {1.0000, 0.5000, 0.6667, 0.5000, 0.0000}}, 31 | {{0, 0, 1, 1}, {1, 1, 1, 1}, {0.5000, 1.0000, 0.6667, 0.5000, 0.0000}}, 32 | {{1, 2, 3, 4}, {1, 1, 1, 1}, {0.2500, 1.0000, 0.4000, 0.2500, 0.0000}}, 33 | {{1, 1, 2, 2}, {1, 1, 1, 3}, {0.7500, 0.7500, 0.7500, 0.6250, 0.2500}}, 34 | {{1, 1, 1, 1}, {1, 1, 1, 1}, {-1, -1, -1, -1, -1}} 35 | }; 36 | 37 | /** 38 | * Test the generic quality function 39 | */ 40 | int test_quality() 41 | { 42 | int i, j, err = 0; 43 | double *e; 44 | 45 | test_printf("Computing quality measures"); 46 | for (i = 0; tests[i].e[0] > -1; i++) { 47 | e = quality(tests[i].y, tests[i].a, 4); 48 | for (j = 0; j < 5; j++) 49 | err += fabs(tests[i].e[j] - 
e[j]) > 1e-3; 50 | } 51 | 52 | test_return(err, i * 5); 53 | return err; 54 | } 55 | 56 | /** 57 | * Main function 58 | */ 59 | int main(int argc, char **argv) 60 | { 61 | int err = FALSE; 62 | 63 | err |= test_quality(); 64 | 65 | config_destroy(&cfg); 66 | return err; 67 | } 68 | -------------------------------------------------------------------------------- /src/farray.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef FARRAY_H 15 | #define FARRAY_H 16 | 17 | #include "zlib.h" 18 | #include "fvec.h" 19 | 20 | #ifdef HAVE_UTHASH_UTHASH_H 21 | #include 22 | #else 23 | #ifdef HAVE_UTHASH_H 24 | #include 25 | #else 26 | #include "uthash.h" 27 | #endif 28 | #endif 29 | 30 | /* Allocate memory in blocks of this size */ 31 | #define BLOCK_SIZE (4096 / sizeof(farray_t)) 32 | 33 | /** 34 | * Entry for label hash table. 35 | */ 36 | typedef struct { 37 | char name[64]; /**< Label name (key 1)*/ 38 | unsigned int index; /**< Label index (key 2)*/ 39 | UT_hash_handle hn; /**< Uthash handle 1 */ 40 | UT_hash_handle hi; /**< Uthash handle 2 */ 41 | } label_t; 42 | 43 | /** 44 | * Array of feature vectors. 
45 | */ 46 | typedef struct { 47 | fvec_t **x; /**< Array of feature vectors */ 48 | unsigned int *y; /**< Array of label indices */ 49 | unsigned long len; /**< Length of array */ 50 | unsigned long mem; /**< Allocated memory in bytes */ 51 | 52 | label_t *label_name; /**< Table of label names */ 53 | label_t *label_index; /**< Table of label indices */ 54 | char *src; /**< Source of array, e.g. dir */ 55 | } farray_t; 56 | 57 | /* Feature array functions */ 58 | farray_t *farray_create(char *); 59 | void farray_add(farray_t *, fvec_t *, char *); 60 | void farray_destroy(farray_t *); 61 | void farray_print(farray_t *); 62 | farray_t *farray_merge(farray_t *, farray_t *); 63 | char *farray_get_label(farray_t *fa, int i); 64 | int farray_get_fixed(farray_t *fa); 65 | 66 | /* Extract function */ 67 | farray_t *farray_extract(char *); 68 | farray_t *farray_extract_dir(char *); 69 | farray_t *farray_extract_archive(char *); 70 | 71 | /* I/O functions */ 72 | void farray_save(farray_t *, gzFile); 73 | void farray_save_file(farray_t *, char *); 74 | void farray_save_libsvm(farray_t *, gzFile); 75 | void farray_save_libsvm_file(farray_t *, char *); 76 | void farray_append_file(farray_t *, char *); 77 | farray_t *farray_load(gzFile); 78 | farray_t *farray_load_file(char *); 79 | 80 | #endif /* FARRAY_H */ 81 | -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. 
This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | #ifndef UTIL_H 15 | #define UTIL_H 16 | 17 | #include "config.h" 18 | 19 | #ifdef HAVE_UTHASH_UTHASH_H 20 | #include 21 | #else 22 | #ifdef HAVE_UTHASH_H 23 | #include 24 | #else 25 | #include "uthash.h" 26 | #endif 27 | #endif 28 | 29 | /* Progress bar stuff */ 30 | #define PROGBAR_LEN 52 31 | #define PROGBAR_EMPTY ':' 32 | #define PROGBAR_FULL '#' 33 | #define PROGBAR_DONE '#' 34 | #define PROGBAR_FRONT '|' 35 | 36 | /** 37 | * Counts for predicted labels 38 | */ 39 | typedef struct { 40 | unsigned int label; /**< Predicted label */ 41 | double count; /**< Number of elemtns with label */ 42 | UT_hash_handle hh; /**< Hash table entry */ 43 | } count_t; 44 | 45 | /* Fatal message */ 46 | #ifndef fatal 47 | #define fatal(...) {err_msg("Error", __func__, __VA_ARGS__); exit(-1);} 48 | #endif 49 | /* Error message */ 50 | #ifndef error 51 | #define error(...) {err_msg("Error", __func__, __VA_ARGS__);} 52 | #endif 53 | /* Warning message */ 54 | #ifndef warning 55 | #define warning(...) {err_msg("Warning", __func__, __VA_ARGS__);} 56 | #endif 57 | 58 | /** 59 | * Structure for indexed data. The structure enables comparing and sorting 60 | * data along with its indices, such that the order of indices can be later 61 | * retrieved. 
/*
 * Load a 32-bit word in host byte order from an address that need not be
 * 4-byte aligned. Copying through memcpy avoids the undefined behaviour of
 * dereferencing a casted, possibly misaligned pointer; compilers turn it
 * into a single load where the hardware allows that.
 */
static uint32_t murmur_load32(const unsigned char *p)
{
    uint32_t w;

    memcpy(&w, p, sizeof(w));
    return w;
}

/*
 * MurmurHash2, 32-bit version.
 * @param key  Data to hash (any alignment)
 * @param len  Number of bytes to hash; values <= 0 hash no data
 * @param seed Seed of the hash
 * @return 32-bit hash value. Whole words are read in host byte order, so
 *         the result differs between little- and big-endian machines.
 */
uint32_t MurmurHash2(const void *key, int32_t len, uint32_t seed)
{
    // 'm' and 'r' are mixing constants generated offline.
    // They're not really 'magic', they just happen to work well.
    const uint32_t m = 0x5bd1e995;
    const int32_t r = 24;

    // Initialize the hash to a 'random' value
    uint32_t h = seed ^ len;

    // Mix 4 bytes at a time into the hash
    const unsigned char *data = (const unsigned char *) key;

    while (len >= 4) {
        uint32_t k = murmur_load32(data);

        k *= m;
        k ^= k >> r;
        k *= m;

        h *= m;
        h ^= k;

        data += 4;
        len -= 4;
    }

    // Handle the last few bytes of the input array
    switch (len) {
    case 3:
        h ^= (uint32_t) data[2] << 16;
        /* fall through */
    case 2:
        h ^= (uint32_t) data[1] << 8;
        /* fall through */
    case 1:
        h ^= data[0];
        h *= m;
        break;
    default:
        break;
    }

    // Do a few final mixes of the hash to ensure the last few
    // bytes are well-incorporated.
    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;

    return h;
}


/*
 * 64-bit hash for 32-bit platforms. Two 32-bit states are mixed
 * alternately and concatenated into the result.
 * @param key  Data to hash (any alignment)
 * @param len  Number of bytes to hash; values <= 0 hash no data
 * @param seed Seed of the hash
 * @return 64-bit hash value (host byte order dependent, see MurmurHash2)
 */
uint64_t MurmurHash64B(const void *key, int32_t len, uint32_t seed)
{
    const uint32_t m = 0x5bd1e995;
    const int32_t r = 24;

    uint32_t h1 = seed ^ len;
    uint32_t h2 = 0;

    const unsigned char *data = (const unsigned char *) key;

    // Mix 8 bytes at a time, one word into each state
    while (len >= 8) {
        uint32_t k1 = murmur_load32(data);
        data += 4;
        k1 *= m;
        k1 ^= k1 >> r;
        k1 *= m;
        h1 *= m;
        h1 ^= k1;
        len -= 4;

        uint32_t k2 = murmur_load32(data);
        data += 4;
        k2 *= m;
        k2 ^= k2 >> r;
        k2 *= m;
        h2 *= m;
        h2 ^= k2;
        len -= 4;
    }

    // One remaining whole word goes into the first state
    if (len >= 4) {
        uint32_t k1 = murmur_load32(data);
        data += 4;
        k1 *= m;
        k1 ^= k1 >> r;
        k1 *= m;
        h1 *= m;
        h1 ^= k1;
        len -= 4;
    }

    // The last few bytes go into the second state
    switch (len) {
    case 3:
        h2 ^= (uint32_t) data[2] << 16;
        /* fall through */
    case 2:
        h2 ^= (uint32_t) data[1] << 8;
        /* fall through */
    case 1:
        h2 ^= data[0];
        h2 *= m;
        break;
    default:
        break;
    }

    // Final cross-mixing of both states
    h1 ^= h2 >> 18;
    h1 *= m;
    h2 ^= h1 >> 22;
    h2 *= m;
    h1 ^= h2 >> 17;
    h1 *= m;
    h2 ^= h1 >> 19;
    h2 *= m;

    uint64_t h = h1;

    h = (h << 32) | h2;

    return h;
}
18 | * @author Konrad Rieck 19 | * @{ 20 | */ 21 | #include "config.h" 22 | #include "common.h" 23 | #include "mist.h" 24 | #include "util.h" 25 | 26 | /* Size of read buffer */ 27 | #define BUFFER_SIZE 1024 28 | 29 | /* External variables */ 30 | extern int verbose; 31 | extern config_t cfg; 32 | 33 | /** 34 | * Copy a line into the given buffer and advance pointer 35 | * @param ptr Pointer to data 36 | * @param buffer Buffer to write data to 37 | * @return true on success, false otherwise 38 | */ 39 | static int mist_read_line(char **ptr, char *buffer) 40 | { 41 | int i = 0; 42 | 43 | /* Copy line */ 44 | for (i = 0; i < BUFFER_SIZE; i++) { 45 | if ((*ptr)[i] == 0) 46 | return FALSE; 47 | if ((*ptr)[i] == MIST_INSTR) 48 | break; 49 | buffer[i] = (*ptr)[i]; 50 | } 51 | buffer[i] = 0; 52 | 53 | /* Move line */ 54 | *ptr += strlen(buffer) + 1; 55 | return TRUE; 56 | } 57 | 58 | /** 59 | * Copy a MIST instruction to the given pointer 60 | * @param ptr pointer to destrination 61 | * @param line pointer to source (line of instruction) 62 | * @param level MIST level to keep 63 | * @return destination for next instruction 64 | */ 65 | static char *mist_copy_instr(char *ptr, char *line, int level) 66 | { 67 | int i, l = 0, m = strlen(line); 68 | for (i = 0; i < m; i++) { 69 | if (line[i] == MIST_LEVEL) 70 | l++; 71 | if (l >= level) 72 | break; 73 | 74 | ptr[i] = line[i]; 75 | } 76 | 77 | /* Add a carriage return */ 78 | ptr[i] = '\n'; 79 | 80 | /* Update pointer */ 81 | return ptr + i + 1; 82 | } 83 | 84 | /** 85 | * Preprocesses a MIST report 86 | * @param report Report as string 87 | * @return preprocessed report 88 | */ 89 | char *mist_preproc(char *report) 90 | { 91 | assert(report); 92 | 93 | int level, ti = 0, ri = 0; 94 | char *read_ptr = report, *write_ptr = report; 95 | char line[BUFFER_SIZE]; 96 | 97 | /* Get MIST configuration */ 98 | config_lookup_int(&cfg, "features.mist_level", (int *) &level); 99 | 100 | /* Process MIST file */ 101 | while 
(mist_read_line(&read_ptr, line)) { 102 | if (line[0] == MIST_COMMENT) { 103 | /* Reset thread counter on new thread */ 104 | if (strstr(line, MIST_THREAD)) 105 | ti = 0; 106 | } else if (isalnum(line[0])) { 107 | write_ptr = mist_copy_instr(write_ptr, line, level); 108 | ri++; 109 | ti++; 110 | } 111 | } 112 | 113 | /* Terminate string */ 114 | *write_ptr = 0; 115 | return report; 116 | } 117 | 118 | 119 | /** @} */ 120 | -------------------------------------------------------------------------------- /tests/test_proto.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | */ 12 | 13 | #include "tests.h" 14 | #include "mconfig.h" 15 | #include "farray.h" 16 | #include "ftable.h" 17 | #include "proto.h" 18 | #include "fmath.h" 19 | 20 | /* Global variables */ 21 | config_t cfg; 22 | int verbose = 0; 23 | 24 | 25 | /* Number of stress runs */ 26 | #define STRESS_RUNS 20 27 | /* String length */ 28 | #define STR_LENGTH 500 29 | /* Number of vector */ 30 | #define NUM_VECTORS 500 31 | /* Number of elements in test data */ 32 | #define DATA_LEN 15 33 | /* Number of correct prototypes */ 34 | #define DATA_PROTO 5 35 | 36 | /* Data set for prototype extraction */ 37 | static char *test_data[] = { 38 | "XX YY XX YY XX YY", "XX YY XX ZZ XX YY", "XX ZZ XX YY XX YY", 39 | "AA BB AA BB AA BB", "AA BB AA CC AA BB", "AA CC AA BB AA BB", 40 | "MM NN MM NN MM NN", "MM NN MM OO MM NN", "MM OO MM NN MM NN", 41 | "UU VV UU VV UU VV", "UU VV UU WW UU VV", "UU WW UU VV UU VV", 42 | "RR SS RR SS RR SS", "RR SS RR TT RR SS", "RR TT RR SS RR SS" 43 | }; 44 | 45 | /** 46 | * Test the extraction of prototypes 47 | */ 48 | int test_proto() 49 | { 50 | int i, err = 0; 51 | double dist[DATA_LEN * DATA_PROTO]; 52 | 53 | test_printf("Prototype extraction"); 54 | 55 | /* Prepare test data */ ; 56 | farray_t *fa = farray_create("test"); 57 | for (i = 0; i < DATA_LEN; i++) { 58 | fvec_t *f = fvec_extract(test_data[i], strlen(test_data[i]), NULL); 59 | farray_add(fa, f, "test"); 60 | } 61 | 62 | /* Extract prototypes */ 63 | assign_t *as; 64 | farray_t *pr = proto_extract(fa, &as); 65 | assign_destroy(as); 66 | 67 | /* Check number of prototypes */ 68 | err += (pr->len != DATA_PROTO); 69 | 70 | /* Check position of prototypes */ 71 | farray_dist(fa, pr, dist); 72 | for (i = 0; i < DATA_PROTO; i++) 73 | err += fabs(dist[i * DATA_LEN + i]) > 1e-3; 74 | 75 | /* Clean up */ 76 | farray_destroy(pr); 77 | farray_destroy(fa); 78 | 79 | test_return(err, 1 + DATA_PROTO); 80 | return err; 81 | } 82 | 83 | /* 84 | * A simple stress test for prototype extraction 85 | */ 86 | 
int test_stress() 87 | { 88 | int i, j, k, err = 0; 89 | fvec_t *f; 90 | farray_t *fa; 91 | char buf[STR_LENGTH + 1], label[32]; 92 | 93 | test_printf("Stress test for prototype extraction"); 94 | 95 | for (i = 0; i < STRESS_RUNS; i++) { 96 | /* Create array */ 97 | fa = farray_create("test"); 98 | 99 | for (j = 0; j < NUM_VECTORS; j++) { 100 | for (k = 0; k < STR_LENGTH; k++) 101 | buf[k] = rand() % 10 + '0'; 102 | buf[k] = 0; 103 | 104 | /* Extract features */ 105 | f = fvec_extract(buf, strlen(buf), "test"); 106 | snprintf(label, 32, "label%.2d", rand() % 10); 107 | 108 | /* Add to array */ 109 | farray_add(fa, f, label); 110 | } 111 | 112 | /* Extract prototypes */ 113 | assign_t *as; 114 | farray_t *pr = proto_extract(fa, &as); 115 | assign_destroy(as); 116 | 117 | /* Destroy features */ 118 | farray_destroy(fa); 119 | farray_destroy(pr); 120 | } 121 | 122 | test_return(err, STRESS_RUNS); 123 | return err; 124 | } 125 | 126 | /** 127 | * Main function 128 | */ 129 | int main(int argc, char **argv) 130 | { 131 | int err = FALSE; 132 | 133 | /* Create config */ 134 | config_init(&cfg); 135 | config_check(&cfg); 136 | config_set_string(&cfg, "generic.event_delim", " "); 137 | config_set_string(&cfg, "features.vect_embed", "cnt"); 138 | 139 | ftable_init(); 140 | 141 | err |= test_proto(); 142 | err |= test_stress(); 143 | 144 | ftable_destroy(); 145 | 146 | config_destroy(&cfg); 147 | return err; 148 | } 149 | -------------------------------------------------------------------------------- /tests/test_class.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software 
Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | */ 12 | 13 | #include "tests.h" 14 | #include "mconfig.h" 15 | #include "farray.h" 16 | #include "ftable.h" 17 | #include "cluster.h" 18 | #include "fmath.h" 19 | 20 | /* Global variables */ 21 | config_t cfg; 22 | int verbose = 0; 23 | 24 | /* Test structure */ 25 | typedef struct { 26 | char *str; /* Test report */ 27 | char *label; /* Test label */ 28 | } test_t; 29 | 30 | /* Number of stress runs */ 31 | #define STRESS_RUNS 20 32 | /* String length */ 33 | #define STR_LENGTH 500 34 | /* Number of vector */ 35 | #define NUM_VECTORS 500 36 | 37 | /* Training set for classification */ 38 | static test_t train_data[] = { 39 | {"A B B B X", "1"}, {"A A B B X", "1"}, {"B B A B X", "1"}, 40 | {"X Y B B Z", "2"}, {"A B Z X Y", "2"}, {"A X Y B Z", "2"}, 41 | {NULL, NULL} 42 | }; 43 | 44 | /* Test set for classification */ 45 | static test_t test_data[] = { 46 | {"A A B B X", "1"}, {"Z A B B X", "1"}, {"A B B X A", "1"}, 47 | {"A A B B X", "1"}, {"X Y B Z Z", "2"}, {"B Z X Y X", "2"}, 48 | {"A X Y B Z", "2"}, {"A X B B Z", "2"}, {"A Z X Y A", "2"}, 49 | {NULL, NULL}, 50 | }; 51 | 52 | /** 53 | * Simple test cases classification 54 | */ 55 | int test_classify() 56 | { 57 | int i, k, err = 0; 58 | fvec_t *f; 59 | 60 | test_printf("Classification using prototypes"); 61 | 62 | /* Prepare training data */ 63 | farray_t *fa1 = farray_create("train"); 64 | for (i = 0; train_data[i].str; i++) { 65 | f = fvec_extract(train_data[i].str, strlen(train_data[i].str), NULL); 66 | farray_add(fa1, f, train_data[i].label); 67 | } 68 | 69 | /* Prepare testing data */ 70 | farray_t *fa2 = farray_create("train"); 71 | for (i = 0; test_data[i].str; i++) { 72 | f = fvec_extract(test_data[i].str, strlen(test_data[i].str), NULL); 73 | farray_add(fa2, f, test_data[i].label); 74 | } 75 | 76 | /* 
Classification of test data */ 77 | config_set_float(&cfg, "classify.max_dist", 1.41); 78 | assign_t *a = class_assign(fa2, fa1); 79 | 80 | /* Check predicted labels */ 81 | for (k = 0; test_data[k].str; k++) { 82 | char *l = farray_get_label(fa1, a->proto[k]); 83 | err += strcmp(l, test_data[k].label) != 0; 84 | } 85 | 86 | /* Clean up */ 87 | assign_destroy(a); 88 | farray_destroy(fa1); 89 | farray_destroy(fa2); 90 | 91 | test_return(err, i); 92 | return err; 93 | } 94 | 95 | /* 96 | * A simple stress test for classification 97 | */ 98 | int test_stress() 99 | { 100 | int i, j, k, err = 0; 101 | fvec_t *f; 102 | farray_t *fa; 103 | char buf[STR_LENGTH + 1], label[32]; 104 | 105 | test_printf("Stress test for classification"); 106 | 107 | for (i = 0; i < STRESS_RUNS; i++) { 108 | /* Create array */ 109 | fa = farray_create("test"); 110 | 111 | for (j = 0; j < NUM_VECTORS; j++) { 112 | for (k = 0; k < STR_LENGTH; k++) 113 | buf[k] = rand() % 10 + '0'; 114 | buf[k] = 0; 115 | 116 | /* Extract features */ 117 | f = fvec_extract(buf, strlen(buf), "test"); 118 | snprintf(label, 32, "label%.2d", rand() % 10); 119 | 120 | /* Add to array */ 121 | farray_add(fa, f, label); 122 | } 123 | 124 | assign_t *a = class_assign(fa, fa); 125 | assign_destroy(a); 126 | farray_destroy(fa); 127 | } 128 | 129 | test_return(err, STRESS_RUNS); 130 | return err; 131 | } 132 | 133 | 134 | /** 135 | * Main function 136 | */ 137 | int main(int argc, char **argv) 138 | { 139 | int err = FALSE; 140 | 141 | /* Create config */ 142 | config_init(&cfg); 143 | config_check(&cfg); 144 | config_set_string(&cfg, "generic.event_delim", " "); 145 | 146 | ftable_init(); 147 | err |= test_classify(); 148 | err |= test_stress(); 149 | ftable_destroy(); 150 | 151 | config_destroy(&cfg); 152 | return err; 153 | } 154 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # MALHEUR 
# Malheur version
AC_INIT([malheur], [0.6.0], [konrad@mlsec.org])
AC_PREFIX_DEFAULT("/usr/local")

# Defines (suitable for loading and saving versioned files)
AC_DEFINE(MALHEUR_MAJOR, 0, Major version number)
AC_DEFINE(MALHEUR_MINOR, 6, Minor version number)
AC_DEFINE(MALHEUR_PATCH, 0, Patch version number)


echo
echo " > MALHEUR - Automatic Analysis of Malware Behavior"
echo " Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org)"
echo " University of Goettingen, Berlin Institute of Technology"
echo

# Init automake
AM_INIT_AUTOMAKE
# AC_CONFIG_HEADERS replaces the obsolete singular AC_CONFIG_HEADER
AC_CONFIG_HEADERS([config.h])

# Check for important programs
AC_PROG_CC
AC_PROG_LD
AC_PROG_INSTALL

# Libtool stuff
AC_CONFIG_MACRO_DIR(m4)
AC_PROG_LIBTOOL

# By default remove assert statements
CFLAGS="$CFLAGS -DNDEBUG"

# By default we include the math library
LIBS="$LIBS -lm"

# Set GCC and C99 flags if present
if test "$GCC" = "yes" ; then
   CFLAGS="$CFLAGS -std=c99 -fgnu89-inline -Wall -fPIC"
fi

# Optional packages
AC_ARG_WITH([libarchive], [AS_HELP_STRING([--with-libarchive],
   [support for reading archives @<:@default=check@:>@])],
   [], [with_libarchive=check])
AC_ARG_WITH([openmp], [AS_HELP_STRING([--with-openmp],
   [support for multi-processing @<:@default=check@:>@])],
   [], [with_openmp=check])


# Check for zlib (required)
AC_CHECK_HEADERS([zlib.h], HEADER_ZLIB="yes")
AC_CHECK_LIB([z], gzopen, LIBRARY_ZLIB="yes")
if test "x$LIBRARY_ZLIB" != "x" && test "x$HEADER_ZLIB" != "x" ; then
   LIBS="-lz $LIBS"
   AC_DEFINE([HAVE_ZLIB], [1], [Define if you have zlib])
   HAVE_ZLIB=yes
else
   HAVE_ZLIB=no
   AC_MSG_FAILURE([libz not found. see README.md])
fi

# Check for libarchive (optional)
# Note: string equality in test(1) is spelled "="; "==" is a bashism that
# fails in strict POSIX shells such as dash.
AC_CHECK_HEADERS([archive.h], HEADER_LIBARCHIVE="yes")
AC_CHECK_LIB([archive], archive_read_new, LIBRARY_LIBARCHIVE="yes")
if test "x$LIBRARY_LIBARCHIVE" != "x" && \
   test "x$HEADER_LIBARCHIVE" != "x" && \
   test "x$with_libarchive" != "xno" ; then
   AC_DEFINE([HAVE_LIBARCHIVE], [1], [Define if you have libarchive])
   LIBS="-larchive $LIBS"
   HAVE_LIBARCHIVE=yes
else
   HAVE_LIBARCHIVE=no
   if test "x$with_libarchive" = "xyes" ; then
      AC_MSG_FAILURE([libarchive not found. see README.md])
   fi
fi

# Check for libconfig (required)
AC_CHECK_HEADERS([libconfig.h], HEADER_LIBCONFIG="yes")
PKG_CHECK_MODULES([PKGCONFIG], [libconfig >= 1.3.2], LIBRARY_LIBCONFIG="yes")
if test "x$LIBRARY_LIBCONFIG" != "x" && \
   test "x$HEADER_LIBCONFIG" != "x" ; then
   CFLAGS="$CFLAGS $PKGCONFIG_CFLAGS"
   LIBS="$LIBS $PKGCONFIG_LIBS"
   # Use the pkg-config binary selected by PKG_CHECK_MODULES so that
   # cross-compilation and user overrides are honoured.
   AM_CPPFLAGS="$AM_CPPFLAGS `$PKG_CONFIG --cflags-only-I libconfig`"
   HAVE_LIBCONFIG=yes
   AC_DEFINE([HAVE_LIBCONFIG], [1], [Define if you have libconfig])
else
   HAVE_LIBCONFIG=no
   AC_MSG_FAILURE([libconfig not found. see README.md])
fi

# Check for OpenMP (optional)
AC_CHECK_HEADERS([omp.h], HEADER_OPENMP="yes")
AX_OPENMP(LIBRARY_OPENMP="yes")
if test "x$LIBRARY_OPENMP" != "x" && \
   test "x$HEADER_OPENMP" != "x" && \
   test "x$with_openmp" != "xno" ; then
   CFLAGS="$CFLAGS $OPENMP_CFLAGS"
   HAVE_OPENMP=yes
   AC_DEFINE([HAVE_OPENMP], [1], [Define if you have OpenMP])
else
   HAVE_OPENMP=no
   if test "x$with_openmp" = "xyes" ; then
      AC_MSG_FAILURE([no openmp support. see README.md])
   fi
fi

# Check headers
AC_CHECK_HEADERS([getopt.h string.h strings.h])
AC_CHECK_HEADERS([uthash.h uthash/uthash.h], HEADER_UTHASH="yes")

# Check functions
AC_CHECK_FUNC(round, AC_DEFINE(HAVE_FUNC_ROUND, 1,
   [Define to 1 if you have the function round]))
AC_CHECK_FUNC(log2, AC_DEFINE(HAVE_FUNC_LOG2, 1,
   [Define to 1 if you have the function log2]))

# The file list is whitespace separated; no line continuations are needed.
AC_SUBST([AM_CPPFLAGS])
AC_CONFIG_FILES([
   Makefile
   src/Makefile
   doc/Makefile
   tests/Makefile
])
AC_OUTPUT

echo
echo " > Optional packages"
echo " Support reading archives (--with-libarchive): $HAVE_LIBARCHIVE"
echo " Support for multi-processing (--with-openmp): $HAVE_OPENMP"
27 | authorFound = False 28 | dateFound = False 29 | messageFound = False 30 | messageNL = False 31 | message = "" 32 | filesFound = False 33 | files = "" 34 | continue 35 | # Match the author line and extract the part we want 36 | elif re.match('Author:', line) >=0: 37 | authorList = re.split(': ', line, 1) 38 | author = authorList[1] 39 | author = author[0:len(author)-1] 40 | authorFound = True 41 | # Match the date line 42 | elif re.match('Date:', line) >= 0: 43 | dateList = re.split(': ', line, 1) 44 | date = dateList[1] 45 | date = date[0:len(date)-1] 46 | dateFound = True 47 | # The svn-id lines are ignored 48 | elif re.match(' git-svn-id:', line) >= 0: 49 | continue 50 | # The sign off line is ignored too 51 | elif re.search('Signed-off-by', line) >= 0: 52 | continue 53 | # Extract the actual commit message for this commit 54 | elif authorFound & dateFound & messageFound == False: 55 | # Find the commit message if we can 56 | if len(line) == 1: 57 | if messageNL: 58 | messageFound = True 59 | else: 60 | messageNL = True 61 | elif len(line) == 4: 62 | messageFound = True 63 | else: 64 | if len(message) == 0: 65 | message = message + line.strip() 66 | else: 67 | message = message + " " + line.strip() 68 | # If this line is hit all of the files have been stored for this commit 69 | elif re.search('files? changed', line) >= 0: 70 | filesFound = True 71 | continue 72 | # Collect the files for this commit. 
FIXME: Still need to add +/- to files 73 | elif authorFound & dateFound & messageFound: 74 | fileList = re.split(' \| ', line, 2) 75 | if len(fileList) > 1: 76 | if len(files) > 0: 77 | files = files + ", " + fileList[0].strip() 78 | else: 79 | files = fileList[0].strip() 80 | # All of the parts of the commit have been found - write out the entry 81 | if authorFound & dateFound & messageFound & filesFound: 82 | # First the author line, only outputted if it is the first for that 83 | # author on this day 84 | authorLine = date + " " + author 85 | if len(prevAuthorLine) == 0: 86 | fout.write(authorLine + "\n") 87 | elif authorLine == prevAuthorLine: 88 | pass 89 | else: 90 | fout.write("\n" + authorLine + "\n") 91 | 92 | # Assemble the actual commit message line(s) and limit the line length 93 | # to 80 characters. 94 | commitLine = "- " + message 95 | i = 0 96 | commit = "" 97 | while i < len(commitLine): 98 | if len(commitLine) < i + 78: 99 | commit = commit + "\n " + commitLine[i:len(commitLine)] 100 | break 101 | index = commitLine.rfind(' ', i, i+78) 102 | if index > i: 103 | commit = commit + "\n " + commitLine[i:index] 104 | i = index+1 105 | else: 106 | commit = commit + "\n " + commitLine[i:78] 107 | i = i+79 108 | 109 | # Write out the commit line 110 | fout.write(commit + '\n') 111 | 112 | #Now reset all the variables ready for a new commit block. 113 | authorFound = False 114 | dateFound = False 115 | messageFound = False 116 | messageNL = False 117 | message = "" 118 | filesFound = False 119 | files = "" 120 | prevAuthorLine = authorLine 121 | 122 | # Close the input and output lines now that we are finished. 
123 | fin.close() 124 | fout.close() 125 | 126 | -------------------------------------------------------------------------------- /src/class.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | /** 15 | * @defgroup class Classification using prototypes 16 | * The module contains functions for assigning and classifying feature 17 | * vectors to known clusters. Clusters and classes are both represented 18 | * by appropriate prototype vectors. 19 | * @author Konrad Rieck 20 | * @{ 21 | */ 22 | #include "config.h" 23 | #include "common.h" 24 | #include "util.h" 25 | #include "class.h" 26 | #include "fmath.h" 27 | 28 | /* External variables */ 29 | extern int verbose; 30 | extern config_t cfg; 31 | 32 | /** 33 | * Creates an empty structure of assignments. The assignments can be 34 | * either computed for matching prototypes against feature vectors or 35 | * for classification of feature vectors. 
36 | * @param fa Array of feature vectors 37 | * @return assignment structure 38 | */ 39 | assign_t *assign_create(farray_t *fa) 40 | { 41 | assert(fa); 42 | 43 | /* Allocate assignment structure */ 44 | assign_t *c = malloc(sizeof(assign_t)); 45 | if (!c) { 46 | error("Could not allocate assignment structure"); 47 | return NULL; 48 | } 49 | 50 | /* Allocate structure fields */ 51 | c->label = calloc(fa->len, sizeof(unsigned int)); 52 | c->proto = calloc(fa->len, sizeof(unsigned int)); 53 | c->dist = calloc(fa->len, sizeof(double)); 54 | c->len = fa->len; 55 | 56 | if (!c->label || !c->proto || !c->dist) { 57 | error("Could not allocate assignment structure"); 58 | assign_destroy(c); 59 | return NULL; 60 | } 61 | 62 | return c; 63 | } 64 | 65 | 66 | /** 67 | * Destroys an assignment structure. 68 | * @param c Assignment structure 69 | */ 70 | void assign_destroy(assign_t *c) 71 | { 72 | if (!c) 73 | return; 74 | if (c->label) 75 | free(c->label); 76 | if (c->proto) 77 | free(c->proto); 78 | if (c->dist) 79 | free(c->dist); 80 | free(c); 81 | } 82 | 83 | 84 | /** 85 | * Classify feature vectors using labeled prototypes. The function assigns 86 | * the given feature vectors to the given prototypes and predicts labels. 87 | * Feature vectors with a too large distance are rejected from the 88 | * classification by setting their label to 0. 
89 | * @param fa Array of feature vectors 90 | * @param p Array of prototypes 91 | * @return Assignment structure 92 | */ 93 | assign_t *class_assign(farray_t *fa, farray_t *p) 94 | { 95 | assert(fa && p); 96 | int i, k, j, cnt = 0; 97 | double d = 0; 98 | double maxdist; 99 | 100 | config_lookup_float(&cfg, "classify.max_dist", &maxdist); 101 | 102 | assign_t *c = assign_create(fa); 103 | 104 | if (verbose > 0) 105 | printf("Classifying feature vectors to %lu prototypes.\n", p->len); 106 | 107 | #pragma omp parallel for shared(fa,c,p) private(k,j) 108 | for (i = 0; i < fa->len; i++) { 109 | double min = DBL_MAX; 110 | for (k = 0, j = 0; k < p->len; k++) { 111 | d = fvec_dist(fa->x[i], p->x[k]); 112 | if (d < min) { 113 | min = d; 114 | j = k; 115 | } 116 | } 117 | 118 | if (p->len) { 119 | /* Compute assignments */ 120 | c->proto[i] = j; 121 | c->dist[i] = min; 122 | c->label[i] = p->y[j]; 123 | } 124 | 125 | if (c->dist[i] > maxdist) 126 | c->label[i] = 0; 127 | 128 | #pragma omp critical (cnt) 129 | if (verbose) 130 | prog_bar(0, fa->len, ++cnt); 131 | } 132 | 133 | if (verbose > 0) 134 | printf(" Done. Classified %lu feature vectors to %lu prototypes.\n", 135 | fa->len, p->len); 136 | 137 | return c; 138 | } 139 | 140 | /** 141 | * Return an array of rejected feature vectors. 
142 | * @param as Assignment structure 143 | * @param fa Array of feature vectors 144 | * @return Rejected feature vectors 145 | */ 146 | farray_t *class_get_rejected(assign_t *as, farray_t *fa) 147 | { 148 | int i; 149 | farray_t *r = farray_create("rejected"); 150 | 151 | for (i = 0; i < fa->len; i++) { 152 | if (as->label[i]) 153 | continue; 154 | farray_add(r, fvec_clone(fa->x[i]), farray_get_label(fa, i)); 155 | } 156 | 157 | return r; 158 | } 159 | 160 | /** @} */ 161 | -------------------------------------------------------------------------------- /tests/test_ftable.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | * -- 12 | */ 13 | 14 | #include "config.h" 15 | #include "common.h" 16 | #include "ftable.h" 17 | #include "tests.h" 18 | 19 | /* Test file */ 20 | #define TEST_FILE "test.ft" 21 | /* Number of stress runs */ 22 | #define STRESS_RUNS 10000 23 | /* String length */ 24 | #define STR_LENGTH 1024 25 | 26 | /* Global variables */ 27 | int verbose = 5; 28 | 29 | /* Test structure */ 30 | typedef struct { 31 | feat_t f; 32 | char *s; 33 | } test_t; 34 | 35 | /* Test features */ 36 | test_t tests[] = { 37 | {0, "a b c d e f"}, 38 | {-1, "a b c d e"}, 39 | {1, "a b c d"}, 40 | {0x10, "a b"}, 41 | {0x100, "a"}, 42 | {0xFFF, ""}, 43 | {0, 0} 44 | }; 45 | 46 | /* 47 | * A simple static test for the feature table 48 | */ 49 | int test_static() 50 | { 51 | int i, j, k, err = 0; 52 | fentry_t *f; 53 | 54 | test_printf("Creation and maintenance of feature table"); 55 | 56 | /* Initialize table */ 57 | ftable_init(); 58 | for (i = 0; tests[i].s != 0; i++) 59 | ftable_put(tests[i].f, tests[i].s, strlen(tests[i].s) + 1); 60 | 61 | /* Randomly query elements */ 62 | for (j = 0; j < 100; j++) { 63 | k = rand() % i; 64 | f = ftable_get(tests[k].f); 65 | 66 | /* Check for correct feature string */ 67 | if (memcmp(f->data, tests[k].s, f->len)) { 68 | test_error("(%d) '%s' != '%s'", k, f->data, tests[k].s); 69 | /* ftable_print(); */ 70 | err++; 71 | } 72 | } 73 | 74 | /* Destroy table */ 75 | ftable_destroy(); 76 | 77 | test_return(err, 100); 78 | return err; 79 | } 80 | 81 | /* 82 | * A simple stress test for the feature table 83 | */ 84 | int test_stress() 85 | { 86 | int i, j, err = 0; 87 | feat_t key; 88 | char buf[STR_LENGTH + 1]; 89 | 90 | test_printf("Stress test for feature table"); 91 | 92 | /* Initialize table */ 93 | ftable_init(); 94 | 95 | for (i = 0; i < STRESS_RUNS; i++) { 96 | /* Create random key and string */ 97 | key = rand() % 100; 98 | for (j = 0; j < STR_LENGTH; j++) 99 | buf[j] = rand() % 10 + '0'; 100 | buf[j] = 0; 101 | 102 | switch (rand() % 2) { 103 | case 
0: 104 | /* Insert random string */ 105 | ftable_put(key, buf, strlen(buf)); 106 | break; 107 | case 1: 108 | /* Query for string */ 109 | ftable_get(key); 110 | break; 111 | } 112 | } 113 | 114 | /* Destroy table */ 115 | ftable_destroy(); 116 | 117 | test_return(err, STRESS_RUNS); 118 | return err; 119 | } 120 | 121 | /* 122 | * A test for loading and saving the feature table 123 | */ 124 | int test_load_save() 125 | { 126 | int i, j, err = 0; 127 | gzFile z; 128 | fentry_t *f; 129 | 130 | test_printf("Loading and saving of feature table"); 131 | 132 | /* Initialize table */ 133 | ftable_init(); 134 | for (i = 0; tests[i].s != 0; i++) 135 | ftable_put(tests[i].f, tests[i].s, strlen(tests[i].s) + 1); 136 | 137 | /* Create and save feature vectors */ 138 | if (!(z = gzopen(TEST_FILE, "wb9"))) { 139 | printf("Could not create file (ignoring)\n"); 140 | return FALSE; 141 | } 142 | ftable_save(z); 143 | gzclose(z); 144 | ftable_destroy(); 145 | 146 | /* Init and load */ 147 | ftable_init(); 148 | z = gzopen(TEST_FILE, "r"); 149 | ftable_load(z); 150 | gzclose(z); 151 | 152 | /* Check elements */ 153 | for (j = 0; j < i; j++) { 154 | f = ftable_get(tests[j].f); 155 | 156 | /* Check for correct feature string */ 157 | if (memcmp(f->data, tests[j].s, f->len)) { 158 | test_error("(%d) '%s' != '%s'", j, f->data, tests[j].s); 159 | err++; 160 | } 161 | } 162 | 163 | /* Destroy table */ 164 | ftable_destroy(); 165 | unlink(TEST_FILE); 166 | 167 | test_return(err, i); 168 | return (err > 0); 169 | } 170 | 171 | 172 | /** 173 | * Main function 174 | */ 175 | int main(int argc, char **argv) 176 | { 177 | int err = FALSE; 178 | 179 | err |= test_static(); 180 | err |= test_stress(); 181 | err |= test_load_save(); 182 | 183 | return err; 184 | } 185 | -------------------------------------------------------------------------------- /m4/openmp.m4: -------------------------------------------------------------------------------- 1 | 2 | # 
=========================================================================== 3 | # http://www.gnu.org/software/autoconf-archive/ax_openmp.html 4 | # =========================================================================== 5 | # 6 | # SYNOPSIS 7 | # 8 | # AX_OPENMP([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) 9 | # 10 | # DESCRIPTION 11 | # 12 | # This macro tries to find out how to compile programs that use OpenMP a 13 | # standard API and set of compiler directives for parallel programming 14 | # (see http://www-unix.mcs/) 15 | # 16 | # On success, it sets the OPENMP_CFLAGS/OPENMP_CXXFLAGS/OPENMP_F77FLAGS 17 | # output variable to the flag (e.g. -omp) used both to compile *and* link 18 | # OpenMP programs in the current language. 19 | # 20 | # NOTE: You are assumed to not only compile your program with these flags, 21 | # but also link it with them as well. 22 | # 23 | # If you want to compile everything with OpenMP, you should set: 24 | # 25 | # CFLAGS="$CFLAGS $OPENMP_CFLAGS" 26 | # #OR# CXXFLAGS="$CXXFLAGS $OPENMP_CXXFLAGS" 27 | # #OR# FFLAGS="$FFLAGS $OPENMP_FFLAGS" 28 | # 29 | # (depending on the selected language). 30 | # 31 | # The user can override the default choice by setting the corresponding 32 | # environment variable (e.g. OPENMP_CFLAGS). 33 | # 34 | # ACTION-IF-FOUND is a list of shell commands to run if an OpenMP flag is 35 | # found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it is 36 | # not found. If ACTION-IF-FOUND is not specified, the default action will 37 | # define HAVE_OPENMP. 38 | # 39 | # LICENSE 40 | # 41 | # Copyright (c) 2008 Steven G. Johnson 42 | # 43 | # This program is free software: you can redistribute it and/or modify it 44 | # under the terms of the GNU General Public License as published by the 45 | # Free Software Foundation, either version 3 of the License, or (at your 46 | # option) any later version. 
#
#   This program is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
#   Public License for more details.
#
#   You should have received a copy of the GNU General Public License along
#   with this program. If not, see <http://www.gnu.org/licenses/>.
#
#   As a special exception, the respective Autoconf Macro's copyright owner
#   gives unlimited permission to copy, distribute and modify the configure
#   scripts that are the output of Autoconf when processing the Macro. You
#   need not follow the terms of the GNU General Public License when using
#   or distributing such scripts, even though portions of the text of the
#   Macro appear in them. The GNU General Public License (GPL) does govern
#   all other use of the material that constitutes the Autoconf Macro.
#
#   This special exception to the GPL applies to versions of the Autoconf
#   Macro released by the Autoconf Archive. When you make and distribute a
#   modified version of the Autoconf Macro, you may extend this special
#   exception to the GPL to apply to your modified version as well.

#serial 10

dnl AX_OPENMP tries each candidate flag in turn by linking a small OpenMP
dnl program and caches the first flag that works.
AC_DEFUN([AX_OPENMP], [
AC_PREREQ([2.68]) dnl for _AC_LANG_PREFIX

AC_CACHE_CHECK([for OpenMP flag of _AC_LANG compiler], ax_cv_[]_AC_LANG_ABBREV[]_openmp, [save[]_AC_LANG_PREFIX[]FLAGS=$[]_AC_LANG_PREFIX[]FLAGS
ax_cv_[]_AC_LANG_ABBREV[]_openmp=unknown
# Flags to try:  -fopenmp (gcc), -openmp (icc), -mp (SGI & PGI),
#                -xopenmp (Sun), -omp (Tru64), -qsmp=omp (AIX), none
ax_openmp_flags="-fopenmp -openmp -mp -xopenmp -omp -qsmp=omp none"
# A flag preset by the user is tried before the built-in candidates
if test "x$OPENMP_[]_AC_LANG_PREFIX[]FLAGS" != x; then
  ax_openmp_flags="$OPENMP_[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flags"
fi
for ax_openmp_flag in $ax_openmp_flags; do
  case $ax_openmp_flag in
    # The none candidate tests the unmodified saved flags. The saved copy
    # lives in the variable whose name ends in FLAGS, the same one that is
    # restored after the loop.
    none) []_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS ;;
    *) []_AC_LANG_PREFIX[]FLAGS="$save[]_AC_LANG_PREFIX[]FLAGS $ax_openmp_flag" ;;
  esac
  AC_LINK_IFELSE([AC_LANG_SOURCE([[
#ifdef __cplusplus
extern "C"
#endif
void omp_set_num_threads(int);

static void
parallel_fill(int * data, int n)
{
  int i;
#pragma omp parallel for
  for (i = 0; i < n; ++i)
    data[i] = i;
}

int
main()
{
  int arr[100000];
  omp_set_num_threads(2);
  parallel_fill(arr, 100000);
  return 0;
}
]])],[ax_cv_[]_AC_LANG_ABBREV[]_openmp=$ax_openmp_flag; break],[])
done
[]_AC_LANG_PREFIX[]FLAGS=$save[]_AC_LANG_PREFIX[]FLAGS
])
if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" = "xunknown"; then
  m4_default([$2],:)
else
  if test "x$ax_cv_[]_AC_LANG_ABBREV[]_openmp" != "xnone"; then
    OPENMP_[]_AC_LANG_PREFIX[]FLAGS=$ax_cv_[]_AC_LANG_ABBREV[]_openmp
  fi
  m4_default([$1], [AC_DEFINE(HAVE_OPENMP,1,[Define if OpenMP is enabled])])
fi
])dnl AX_OPENMP
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

Malheur - Automatic Analysis of Malware Behavior
==

This software belongs to the publication

> Konrad Rieck, Philipp Trinius, Carsten Willems, and Thorsten Holz.
> Automatic Analysis of Malware Behavior using Machine Learning.
> Journal of Computer Security (JCS), 19 (4), 639–668, June 2011.
> [Preprint](doc/2011-jcs.pdf)

Introduction
--

Malheur is a tool for the automatic analysis of malware behavior (program
behavior recorded from malicious software in a sandbox environment). It
has been designed to support the regular analysis of malicious software and
the development of detection and defense measures. Malheur allows for
identifying novel classes of malware with similar behavior and assigning
unknown malware to discovered classes. It supports four basic actions for
analysis which can be applied to reports of recorded behavior:

1. *Extraction of prototypes:*
   From a given set of reports, malheur identifies a subset of
   prototypes representative for the full data set. The prototypes
   provide a quick overview of recorded behavior and can be used to
   guide manual inspection.

2. *Clustering of behavior:*
   Malheur automatically identifies groups (clusters) of reports
   containing similar behavior. Clustering allows for discovering novel
   classes of malware and provides the basis for crafting specific
   detection and defense mechanisms, such as anti-virus signatures.

3. *Classification of behavior:*
   Based on a set of previously clustered reports, malheur is able to
   assign unknown behavior to known groups of malware. Classification
   enables identifying novel and unknown variants of malware and can be
   used to filter program behavior prior to manual inspection.

4. 
*Incremental analysis:* 42 | Malheur can be applied incrementally for analysis of large data 43 | sets. By processing reports in chunks, the run-time as well as 44 | memory requirements can be significantly reduced. This renders 45 | long-term application of malheur feasible, for example for daily 46 | analysis of incoming malware programs. 47 | 48 | A detailed description of these techniques as well as technical 49 | background on analysis of malicious software is provided in the 50 | following articles: 51 | 52 | + "Automatic Analysis of Malware Behavior using Machine Learning." 53 | Konrad Rieck, Philipp Trinius, Carsten Willems, and Thorsten Holz 54 | Journal of Computer Security (JCS), 19 (4) 639-668, 2011. 55 | 56 | + "A Malware Instruction Set for Behavior-Based Analysis." 57 | Philipp Trinius, Carsten Willems, Thorsten Holz, and Konrad Rieck 58 | Technical report TR-2009-07, University of Mannheim, 2009 59 | 60 | Dependencies 61 | -- 62 | 63 | + libconfig >= 1.4, 64 | + libarchive >= 3.1.2, 65 | 66 | #### Debian & Ubuntu Linux 67 | 68 | The following packages need to be installed for compiling Malheur on Debian 69 | and Ubuntu Linux 70 | 71 | gcc 72 | libconfig9-dev 73 | libarchive-dev 74 | 75 | For bootstrapping Malheur from the GIT repository or manipulating the 76 | automake/autoconf configuration, the following additional packages are 77 | necessary. 78 | 79 | automake 80 | autoconf 81 | libtool 82 | 83 | #### Mac OS X 84 | 85 | For compiling Malheur on Mac OS X a working installation of Xcode is required 86 | including `gcc`. Additionally, the following packages need to be installed 87 | via Homebrew 88 | 89 | libconfig 90 | libarchive (from homebrew-alt) 91 | 92 | #### OpenBSD 93 | 94 | For compiling Malheur on OpenBSD the following packages are required. Note 95 | that you need to use `gmake` instead of `make` for building Malheur. 

    gmake
    libconfig
    libarchive

For bootstrapping Malheur from the GIT repository, the following packages
need to be additionally installed:

    autoconf
    automake
    libtool

Compilation & Installation
--

From GIT repository first run

    $ ./bootstrap

From tarball run

    $ ./configure [options]
    $ make
    $ make check
    $ make install

Options for configure

    --prefix=PATH Set directory prefix for installation

By default Malheur is installed into /usr/local. If you prefer
a different location, use this option to select an installation
directory.

License
--

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version. This program is distributed
without any warranty. See the GNU General Public License for more
details.

Copyright
--

Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org)
University of Goettingen, Berlin Institute of Technology
--------------------------------------------------------------------------------
/tests/test_farray.c:
--------------------------------------------------------------------------------
/*
 * MALHEUR - Automatic Analysis of Malware Behavior
 * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org)
 * University of Goettingen, Berlin Institute of Technology
 * --
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 3 of the License, or (at your
 * option) any later version. This program is distributed without any
 * warranty.
See the GNU General Public License for more details. 11 | */ 12 | 13 | #include "tests.h" 14 | #include "mconfig.h" 15 | #include "farray.h" 16 | #include "fmath.h" 17 | 18 | /* Global variables */ 19 | int verbose = 0; 20 | config_t cfg; 21 | 22 | /* Test file */ 23 | #define TEST_FILE "test.fa" 24 | /* Number of stress runs */ 25 | #define STRESS_RUNS 10 26 | /* String length */ 27 | #define STR_LENGTH 2000 28 | /* Number of vector */ 29 | #define NUM_VECTORS 200 30 | 31 | /* 32 | * A simple stress test for feature arrays 33 | */ 34 | int test_stress() 35 | { 36 | int i, j, k, err = 0; 37 | fvec_t *f; 38 | farray_t *fa; 39 | char buf[STR_LENGTH + 1], label[32]; 40 | 41 | test_printf("Stress test for feature arrays"); 42 | 43 | for (i = 0; i < STRESS_RUNS; i++) { 44 | /* Create array */ 45 | fa = farray_create("test"); 46 | 47 | for (j = 0; j < NUM_VECTORS; j++) { 48 | for (k = 0; k < STR_LENGTH; k++) 49 | buf[k] = rand() % 10 + '0'; 50 | buf[k] = 0; 51 | 52 | /* Extract features */ 53 | f = fvec_extract(buf, strlen(buf), "test"); 54 | 55 | /* Get label */ 56 | snprintf(label, 32, "label%.2d", rand() % 10); 57 | 58 | /* Add to array */ 59 | farray_add(fa, f, label); 60 | } 61 | 62 | /* Destroy features */ 63 | farray_destroy(fa); 64 | } 65 | 66 | test_return(err, STRESS_RUNS); 67 | return err; 68 | } 69 | 70 | /* 71 | * A simple stress test for feature arrays using OpenMP 72 | */ 73 | int test_stress_omp() 74 | { 75 | int i, j, k, err = 0; 76 | char buf[STR_LENGTH + 1], label[32]; 77 | 78 | test_printf("Stress test for feature arrays (OpenMP)"); 79 | 80 | for (i = 0; i < STRESS_RUNS; i++) { 81 | /* Create array */ 82 | farray_t *fa = farray_create("test"); 83 | 84 | #pragma omp parallel for 85 | for (j = 0; j < NUM_VECTORS; j++) { 86 | for (k = 0; k < STR_LENGTH; k++) 87 | buf[k] = rand() % 10 + '0'; 88 | buf[k] = 0; 89 | 90 | /* Extract features */ 91 | fvec_t *f = fvec_extract(buf, strlen(buf), "test"); 92 | 93 | #pragma omp critical 94 | { 95 | /* Get label */ 
96 | snprintf(label, 32, "label%.2d", rand() % 10); 97 | /* Add to array */ 98 | farray_add(fa, f, label); 99 | } 100 | } 101 | 102 | /* Destroy features */ 103 | farray_destroy(fa); 104 | } 105 | 106 | test_return(err, STRESS_RUNS); 107 | return err; 108 | } 109 | 110 | /* 111 | * A simple stress test for feature arrays 112 | */ 113 | int test_load_save() 114 | { 115 | int i, j, k, err = 0; 116 | char buf[STR_LENGTH + 1], label[32]; 117 | gzFile z; 118 | 119 | test_printf("Loading and saving of feature arrays"); 120 | 121 | /* Create array */ 122 | farray_t *fa = farray_create("test"); 123 | for (j = 0; j < NUM_VECTORS; j++) { 124 | for (k = 0; k < STR_LENGTH; k++) 125 | buf[k] = rand() % 10 + '0'; 126 | buf[k] = 0; 127 | 128 | /* Extract features and add to array */ 129 | fvec_t *f = fvec_extract(buf, strlen(buf), "test"); 130 | snprintf(label, 32, "label%.2d", rand() % 10); 131 | farray_add(fa, f, label); 132 | } 133 | 134 | /* Create and save feature vectors */ 135 | if (!(z = gzopen(TEST_FILE, "wb9"))) { 136 | printf("Could not create file (ignoring)\n"); 137 | return FALSE; 138 | } 139 | farray_save(fa, z); 140 | gzclose(z); 141 | 142 | /* Load saved array */ 143 | z = gzopen(TEST_FILE, "r"); 144 | farray_t *fb = farray_load(z); 145 | gzclose(z); 146 | unlink(TEST_FILE); 147 | 148 | /* Compare each vector mathematically */ 149 | for (i = 0; i < fa->len; i++) { 150 | fvec_t *c = fvec_sub(fa->x[i], fb->x[i]); 151 | err += fvec_norm1(c) > 10e-9; 152 | fvec_destroy(c); 153 | } 154 | 155 | err += fa->len != fb->len; 156 | err += fa->mem != fb->mem; 157 | 158 | /* Destroy features */ 159 | farray_destroy(fa); 160 | farray_destroy(fb); 161 | 162 | test_return(err, NUM_VECTORS + 2); 163 | return err; 164 | } 165 | 166 | 167 | /** 168 | * Main function 169 | */ 170 | int main(int argc, char **argv) 171 | { 172 | int err = FALSE; 173 | 174 | /* Create config */ 175 | config_init(&cfg); 176 | config_check(&cfg); 177 | 178 | err |= test_stress(); 179 | err |= 
test_stress_omp(); 180 | err |= test_load_save(); 181 | 182 | config_destroy(&cfg); 183 | return err; 184 | } 185 | -------------------------------------------------------------------------------- /src/proto.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | /** 15 | * @defgroup proto Prototype extraction 16 | * The module contains functions for extracting prototype feature vectors. 17 | * @author Konrad Rieck 18 | * @{ 19 | */ 20 | 21 | #include "config.h" 22 | #include "common.h" 23 | #include "fmath.h" 24 | #include "util.h" 25 | #include "proto.h" 26 | 27 | /* External variables */ 28 | extern int verbose; 29 | extern config_t cfg; 30 | 31 | /** 32 | * Extracts prototypes using an extended version of Gonzalez' algorithm. 
33 | * @param fa Array of feature vectors 34 | * @param as Empty assignment structure 35 | * @param n Maximum number of prototypes 36 | * @param m Minimum distance between prototypes 37 | * @return Prototypes 38 | */ 39 | static farray_t *proto_gonzalez(farray_t *fa, assign_t *as, long n, double m) 40 | { 41 | assert(fa); 42 | int i, j, k; 43 | 44 | /* Allocate prototype structure and distance arrays */ 45 | farray_t *pr = farray_create(fa->src); 46 | if (!pr) { 47 | error("Could not allocate memory for prototype extraction"); 48 | return NULL; 49 | } 50 | 51 | /* Init distances to maximum value */ 52 | for (i = 0; i < fa->len; i++) 53 | as->dist[i] = DBL_MAX; 54 | 55 | /* Check for maximum number of protos */ 56 | if (n == 0) 57 | n = as->len; 58 | 59 | /* Get a fixed first element */ 60 | int fixed = farray_get_fixed(fa); 61 | 62 | /* Loop over feature vectors. First prototype: j = 0. */ 63 | for (i = 0; i < n; i++) { 64 | /* Determine largest distance */ 65 | if (i > 0) 66 | j = array_max(as->dist, as->len); 67 | else 68 | j = fixed; 69 | 70 | /* Check for minimum distance between prototypes */ 71 | if (as->dist[j] < m) 72 | break; 73 | 74 | /* Add prototype */ 75 | fvec_t *pv = fvec_clone(fa->x[j]); 76 | farray_add(pr, pv, farray_get_label(fa, j)); 77 | 78 | /* Update distances and assignments */ 79 | #pragma omp parallel for shared(fa, pv) 80 | for (k = 0; k < fa->len; k++) { 81 | double d = fvec_dist(pv, fa->x[k]); 82 | if (d < as->dist[k]) { 83 | as->dist[k] = d; 84 | as->proto[k] = pr->len - 1; 85 | as->label[k] = pr->y[pr->len - 1]; 86 | } 87 | } 88 | 89 | if (verbose) 90 | prog_bar(0, n, i); 91 | } 92 | 93 | /* Update progress bar */ 94 | if (verbose) 95 | prog_bar(0, n, n); 96 | 97 | return pr; 98 | } 99 | 100 | /** 101 | * Extracts a set of prototypes using the prototype algorithm. 102 | * Prototype algorithm is run multiple times and the smallest set 103 | * of prototypes is returned. 
104 | * @param fa Array of feature vectors 105 | * @param as Pointer for new assignment structure 106 | * @return Prototypes 107 | */ 108 | farray_t *proto_extract(farray_t *fa, assign_t **as) 109 | { 110 | assert(fa); 111 | farray_t *p; 112 | int maxnum; 113 | double maxdist; 114 | 115 | /* Get configuration */ 116 | config_lookup_float(&cfg, "prototypes.max_dist", (double *) &maxdist); 117 | config_lookup_int(&cfg, "prototypes.max_num", (int *) &maxnum); 118 | 119 | if (verbose > 0) 120 | printf("Extracting prototypes with maximum distance %4.2f.\n", 121 | maxdist); 122 | 123 | /* Create assignments */ 124 | *as = assign_create(fa); 125 | 126 | /* Extract prototypes */ 127 | p = proto_gonzalez(fa, *as, maxnum, maxdist); 128 | 129 | if (verbose > 0) 130 | printf(" Done. %lu prototypes using %.2fMb extracted.\n", 131 | p->len, p->mem / 1e6); 132 | 133 | return p; 134 | } 135 | 136 | 137 | /** 138 | * Assign a set of vector to prototypes 139 | * @param fa Feature vectors 140 | * @param p Prototype vectors 141 | */ 142 | assign_t *proto_assign(farray_t *fa, farray_t *p) 143 | { 144 | assert(fa && p); 145 | int i, k, j, cnt = 0; 146 | double d = 0; 147 | 148 | assign_t *c = assign_create(fa); 149 | 150 | if (verbose > 0) 151 | printf("Assigning feature vectors to %lu prototypes.\n", p->len); 152 | 153 | #pragma omp parallel for shared(fa,c,p) private(k,j) 154 | for (i = 0; i < fa->len; i++) { 155 | double min = DBL_MAX; 156 | for (k = 0, j = 0; k < p->len; k++) { 157 | d = fvec_dist(fa->x[i], p->x[k]); 158 | if (d < min) { 159 | min = d; 160 | j = k; 161 | } 162 | } 163 | 164 | /* Compute assignments */ 165 | c->proto[i] = j; 166 | c->dist[i] = min; 167 | c->label[i] = p->y[j]; 168 | 169 | #pragma omp critical (cnt) 170 | if (verbose) 171 | prog_bar(0, fa->len, ++cnt); 172 | } 173 | 174 | if (verbose > 0) 175 | printf(" Done. 
Assigened %lu feature vectors to %lu prototypes.\n", 176 | fa->len, p->len); 177 | 178 | return c; 179 | } 180 | 181 | /** @} */ 182 | -------------------------------------------------------------------------------- /m4/pkg.m4: -------------------------------------------------------------------------------- 1 | # pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- 2 | # 3 | # Copyright © 2004 Scott James Remnant . 4 | # 5 | # This program is free software; you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation; either version 2 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, but 11 | # WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program; if not, write to the Free Software 17 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 | # 19 | # As a special exception to the GNU General Public License, if you 20 | # distribute this file as part of a program that contains a 21 | # configuration script generated by Autoconf, you may include it under 22 | # the same distribution terms that you use for the rest of that program. 
# PKG_PROG_PKG_CONFIG([MIN-VERSION])
# ----------------------------------
# Locates the pkg-config utility and checks its version.
#
# PKG_CONFIG is declared as a precious variable. Unless the user has set
# it (ac_cv_env_PKG_CONFIG_set), the tool is searched with AC_PATH_TOOL.
# If a tool is available, it is asked via --atleast-pkgconfig-version
# whether it is at least MIN-VERSION (default: 0.9.0). When that check
# fails, PKG_CONFIG is reset to the empty string, so that later tests of
# the form `test -n "$PKG_CONFIG"` treat pkg-config as unavailable.
AC_DEFUN([PKG_PROG_PKG_CONFIG],
[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
	AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
fi
if test -n "$PKG_CONFIG"; then
	_pkg_min_version=m4_default([$1], [0.9.0])
	AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
		AC_MSG_RESULT([yes])
	else
		AC_MSG_RESULT([no])
		PKG_CONFIG=""
	fi

fi[]dnl
])# PKG_PROG_PKG_CONFIG
AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], 83 | [AC_REQUIRE([PKG_PROG_PKG_CONFIG]) 84 | if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then 85 | _pkg_short_errors_supported=yes 86 | else 87 | _pkg_short_errors_supported=no 88 | fi[]dnl 89 | ])# _PKG_SHORT_ERRORS_SUPPORTED 90 | 91 | 92 | # PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], 93 | # [ACTION-IF-NOT-FOUND]) 94 | # 95 | # 96 | # Note that if there is a possibility the first call to 97 | # PKG_CHECK_MODULES might not happen, you should be sure to include an 98 | # explicit call to PKG_PROG_PKG_CONFIG in your configure.ac 99 | # 100 | # 101 | # -------------------------------------------------------------- 102 | AC_DEFUN([PKG_CHECK_MODULES], 103 | [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl 104 | AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl 105 | AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl 106 | 107 | pkg_failed=no 108 | AC_MSG_CHECKING([for $1]) 109 | 110 | _PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) 111 | _PKG_CONFIG([$1][_LIBS], [libs], [$2]) 112 | 113 | m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS 114 | and $1[]_LIBS to avoid the need to call pkg-config. 115 | See the pkg-config man page for more details.]) 116 | 117 | if test $pkg_failed = yes; then 118 | _PKG_SHORT_ERRORS_SUPPORTED 119 | if test $_pkg_short_errors_supported = yes; then 120 | $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors "$2" 2>&1` 121 | else 122 | $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors "$2" 2>&1` 123 | fi 124 | # Put the nasty error message in config.log where it belongs 125 | echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD 126 | 127 | ifelse([$4], , [AC_MSG_ERROR(dnl 128 | [Package requirements ($2) were not met: 129 | 130 | $$1_PKG_ERRORS 131 | 132 | Consider adjusting the PKG_CONFIG_PATH environment variable if you 133 | installed software in a non-standard prefix. 
134 | 135 | _PKG_TEXT 136 | ])], 137 | [AC_MSG_RESULT([no]) 138 | $4]) 139 | elif test $pkg_failed = untried; then 140 | ifelse([$4], , [AC_MSG_FAILURE(dnl 141 | [The pkg-config script could not be found or is too old. Make sure it 142 | is in your PATH or set the PKG_CONFIG environment variable to the full 143 | path to pkg-config. 144 | 145 | _PKG_TEXT 146 | 147 | To get pkg-config, see .])], 148 | [$4]) 149 | else 150 | $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS 151 | $1[]_LIBS=$pkg_cv_[]$1[]_LIBS 152 | AC_MSG_RESULT([yes]) 153 | ifelse([$3], , :, [$3]) 154 | fi[]dnl 155 | ])# PKG_CHECK_MODULES 156 | -------------------------------------------------------------------------------- /src/quality.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | /** 15 | * @defgroup quality Quality functions 16 | * The module contains functions for evaluating the quality of 17 | * classification and clustering methods. 18 | * @author Konrad Rieck 19 | * @{ 20 | */ 21 | #include "config.h" 22 | #include "common.h" 23 | #include "quality.h" 24 | #include "util.h" 25 | 26 | /* External variables */ 27 | extern int verbose; 28 | extern config_t cfg; 29 | 30 | /** 31 | * Computes quality measures for the label countment. 32 | * The function returns a static array. The code is not thread-safe. 
33 | * @param y Labels of data points 34 | * @param a Assignments to clusters or classes 35 | * @param n Number of data points 36 | * @return array with quality values 37 | */ 38 | double *quality(unsigned int *y, unsigned int *a, int n) 39 | { 40 | assert(y && a && n > 0); 41 | static double r[5]; 42 | double mn, ac, bc, cc, dc; 43 | hist_t *h, *hi; 44 | count_t *ai; 45 | int i, j; 46 | 47 | /* Compute precision. This is ugly code. */ 48 | h = hist_create(a, y, n); 49 | for (hi = h, i = 0, ac = 0; hi != NULL; hi = hi->hh.next, i++) { 50 | for (ai = hi->count, mn = 0; ai != NULL; ai = ai->hh.next) 51 | if (ai->count > mn) 52 | mn = ai->count; 53 | ac += mn; 54 | } 55 | hist_destroy(h); 56 | r[Q_PRECISION] = ac / n; 57 | 58 | /* Compute recall. This is again ugly code. */ 59 | h = hist_create(y, a, n); 60 | for (hi = h, i = 0, ac = 0; hi != NULL; hi = hi->hh.next, i++) { 61 | for (ai = hi->count, mn = 0; ai != NULL; ai = ai->hh.next) 62 | if (ai->count > mn) 63 | mn = ai->count; 64 | ac += mn; 65 | } 66 | hist_destroy(h); 67 | r[Q_RECALL] = ac / n; 68 | 69 | /* Compute f-measure */ 70 | r[Q_FMEASURE] = (2 * r[Q_RECALL] * r[Q_PRECISION]) / 71 | (r[Q_RECALL] + r[Q_PRECISION]); 72 | 73 | /* Compute similarity coefficients */ 74 | ac = bc = cc = dc = 0; 75 | for (i = 0; i < n; i++) { 76 | for (j = 0; j < n; j++) { 77 | ac += (a[i] == a[j] && y[i] == y[j]) ? 1 : 0; 78 | bc += (a[i] != a[j] && y[i] != y[j]) ? 1 : 0; 79 | cc += (a[i] != a[j] && y[i] == y[j]) ? 1 : 0; 80 | dc += (a[i] == a[j] && y[i] != y[j]) ? 1 : 0; 81 | } 82 | } 83 | r[Q_RAND] = (ac + bc) / (ac + bc + cc + dc); 84 | r[Q_ARAND] = 2 * (ac * bc - cc * dc) / 85 | ((ac + dc) * (dc + bc) + (ac + cc) * (cc + bc)); 86 | 87 | return r; 88 | } 89 | 90 | /** 91 | * Creates an histogram for each label containing assignments. 
92 | * @param y Labels of data points 93 | * @param a Assignments to clusters or classes 94 | * @param n Number of data points 95 | * @return histogram struct 96 | */ 97 | hist_t *hist_create(unsigned int *y, unsigned int *a, int n) 98 | { 99 | assert(y && a && n > 0); 100 | hist_t *entry, *hist = NULL; 101 | count_t *count = NULL; 102 | int i; 103 | 104 | /* Loop over labels */ 105 | for (i = 0; i < n; i++) { 106 | HASH_FIND_INT(hist, &y[i], entry); 107 | if (!entry) { 108 | entry = malloc(sizeof(hist_t)); 109 | if (!entry) { 110 | error("Could not allocate histogram"); 111 | hist_destroy(hist); 112 | return NULL; 113 | } 114 | 115 | /* Create new entry */ 116 | entry->label = y[i]; 117 | entry->count = NULL; 118 | entry->total = 0; 119 | 120 | /* Add entry */ 121 | HASH_ADD_INT(hist, label, entry); 122 | } 123 | entry->total++; 124 | 125 | HASH_FIND_INT(entry->count, &a[i], count); 126 | if (!count) { 127 | count = malloc(sizeof(count_t)); 128 | if (!count) { 129 | error("Could not allocate countments"); 130 | hist_destroy(hist); 131 | return NULL; 132 | } 133 | count->label = a[i]; 134 | count->count = 0; 135 | 136 | /* Add entry */ 137 | HASH_ADD_INT(entry->count, label, count); 138 | } 139 | count->count++; 140 | } 141 | 142 | return hist; 143 | } 144 | 145 | /** 146 | * Prints the contents of a histogram for labels 147 | * @param h histogram struct 148 | */ 149 | void hist_print(hist_t * h) 150 | { 151 | hist_t *hi; 152 | count_t *ai; 153 | 154 | for (hi = h; hi != NULL; hi = hi->hh.next) { 155 | printf("Label: %u\n", hi->label); 156 | printf("Total: %f\n", hi->total); 157 | 158 | printf("countments: "); 159 | for (ai = hi->count; ai != NULL; ai = ai->hh.next) 160 | printf("%u (%f) ", ai->label, ai->count); 161 | printf("\n"); 162 | } 163 | } 164 | 165 | /** 166 | * Destroys a histogram. 
167 | * @param h histrogram struct 168 | */ 169 | void hist_destroy(hist_t * h) 170 | { 171 | hist_t *hi; 172 | count_t *ai; 173 | 174 | /* Iterate over classes */ 175 | while (h) { 176 | hi = h; 177 | 178 | /* Delete indices */ 179 | while (hi->count) { 180 | ai = hi->count; 181 | HASH_DEL(hi->count, ai); 182 | free(ai); 183 | } 184 | 185 | HASH_DEL(h, hi); 186 | free(hi); 187 | } 188 | } 189 | 190 | /** @} */ 191 | -------------------------------------------------------------------------------- /tests/test_fmath.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 
11 | */ 12 | 13 | #include "tests.h" 14 | #include "mconfig.h" 15 | #include "farray.h" 16 | #include "fmath.h" 17 | 18 | /* Global variables */ 19 | int verbose = 0; 20 | config_t cfg; 21 | 22 | /* String length */ 23 | #define STR_LENGTH 2000 24 | /* Number of vector */ 25 | #define NUM_VECTORS 200 26 | /* Number of stress runs */ 27 | #define STRESS_RUNS 5 28 | 29 | /* Test structure */ 30 | typedef struct { 31 | char *x; 32 | char *y; 33 | double res; 34 | } test_t; 35 | 36 | /* Addition test cases */ 37 | test_t test_add[] = { 38 | {"aa0bb0cc", "aa0bb0cc", 3.4641016}, 39 | {"aa0bb0cc", "xx0bb0cc", 3.4641016}, 40 | {"aa0bb0cc", "xx0yy0cc", 3.4641016}, 41 | {"aa0bb0cc", "xx0yy0zz", 3.4641016}, 42 | {"", "xx0yy0zz", 1.73205080}, 43 | {"aa0bb0cc", "", 1.73205080}, 44 | {NULL, NULL, 0} 45 | }; 46 | 47 | 48 | /* Dot product test cases */ 49 | test_t test_dot[] = { 50 | {"aa0bb0cc", "aa0bb0cc", 1}, 51 | {"aa0bb0cc", "xx0bb0cc", 0.6666666}, 52 | {"aa0bb0cc", "xx0yy0cc", 0.3333333}, 53 | {"aa0bb0cc", "xx0yy0zz", 0.0000000}, 54 | {"aa", "aa", 1.000000}, 55 | {"aa", "aa0xx", 0.7071067811}, 56 | {"aa", "aa0xx0yy", 0.5773502691}, 57 | {"aa", "aa0xx0yy0zz", 0.5}, 58 | {NULL, NULL, 0} 59 | }; 60 | 61 | /* 62 | * A simple static test for the addition of feature vectors 63 | */ 64 | int test_static_add() 65 | { 66 | int i, err = 0; 67 | fvec_t *fx, *fy, *fz; 68 | 69 | test_printf("Addition of feature vectors"); 70 | 71 | for (i = 0; test_add[i].x; i++) { 72 | /* Extract features */ 73 | fx = fvec_extract(test_add[i].x, strlen(test_add[i].x), "test"); 74 | fy = fvec_extract(test_add[i].y, strlen(test_add[i].y), "test"); 75 | 76 | /* Add test vectors */ 77 | fz = fvec_add(fx, fy); 78 | err += fabs(fvec_norm1(fz) - test_add[i].res) > 1e-7; 79 | 80 | fvec_destroy(fz); 81 | fvec_destroy(fx); 82 | fvec_destroy(fy); 83 | } 84 | 85 | test_return(err, i); 86 | return err; 87 | } 88 | 89 | /* 90 | * A simple static test for the dot-product of feature vectors 91 | */ 92 | int 
test_static_dot() 93 | { 94 | int i, err = 0; 95 | fvec_t *fx, *fy; 96 | 97 | test_printf("Dot product of feature vectors"); 98 | 99 | for (i = 0; test_dot[i].x; i++) { 100 | /* Extract features */ 101 | fx = fvec_extract(test_dot[i].x, strlen(test_dot[i].x), "test"); 102 | fy = fvec_extract(test_dot[i].y, strlen(test_dot[i].y), "test"); 103 | 104 | /* Compute dot product */ 105 | double d = fvec_dot(fx, fy); 106 | err += fabs(d - test_dot[i].res) > 1e-6; 107 | 108 | fvec_destroy(fx); 109 | fvec_destroy(fy); 110 | } 111 | 112 | test_return(err, i); 113 | return err; 114 | } 115 | 116 | /* 117 | * A stres test for the addition of feature vectors 118 | */ 119 | int test_stress_add() 120 | { 121 | int i, j, err = 0; 122 | fvec_t *fx, *fy, *fz; 123 | char buf[STR_LENGTH + 1]; 124 | 125 | test_printf("Stress test for addition of feature vectors"); 126 | 127 | /* Create empty vector */ 128 | fz = fvec_extract("aa0bb0cc", 8, "zero"); 129 | for (i = 0; i < NUM_VECTORS; i++) { 130 | 131 | /* Create random key and string */ 132 | for (j = 0; j < STR_LENGTH; j++) 133 | buf[j] = rand() % 10 + '0'; 134 | buf[j] = 0; 135 | 136 | /* Extract features */ 137 | fx = fvec_extract(buf, strlen(buf), "test"); 138 | 139 | /* Add fx to fz */ 140 | fy = fvec_add(fz, fx); 141 | fvec_destroy(fz); 142 | 143 | err += fabs(fvec_norm2(fy) - 1.4142135623) > 1e-7; 144 | 145 | /* Substract fx from fz */ 146 | fz = fvec_sub(fy, fx); 147 | fvec_sparsify(fz); 148 | 149 | /* Clean up */ 150 | fvec_destroy(fy); 151 | fvec_destroy(fx); 152 | } 153 | 154 | fvec_destroy(fz); 155 | test_return(err, i); 156 | return err; 157 | } 158 | 159 | /* 160 | * A stres test for the addition of feature vectors 161 | */ 162 | int test_stress_dot() 163 | { 164 | int i, j, err = 0; 165 | fvec_t *fx, *fy; 166 | char buf[STR_LENGTH + 1]; 167 | 168 | test_printf("Stress test for dot product of feature vectors"); 169 | 170 | /* Create empty vector */ 171 | for (i = 0; i < NUM_VECTORS; i++) { 172 | 173 | /* Create random key 
and string */ 174 | for (j = 0; j < STR_LENGTH; j++) 175 | buf[j] = rand() % 10 + '0'; 176 | buf[j] = 0; 177 | fx = fvec_extract(buf, strlen(buf), "test"); 178 | 179 | /* Create random key and string */ 180 | for (j = 0; j < STR_LENGTH; j++) 181 | buf[j] = rand() % 10 + '0'; 182 | buf[j] = 0; 183 | fy = fvec_extract(buf, strlen(buf), "test"); 184 | 185 | double nx = fvec_dot(fx, fx); 186 | double ny = fvec_dot(fy, fy); 187 | err += fabs(fvec_norm2(fx) - sqrt(nx)) > 1e-7; 188 | err += fabs(fvec_norm2(fy) - sqrt(ny)) > 1e-7; 189 | err += fabs(fvec_dot(fx, fy) > nx + ny); 190 | 191 | /* Clean up */ 192 | fvec_destroy(fx); 193 | fvec_destroy(fy); 194 | } 195 | 196 | test_return(err, 3 * i); 197 | return err; 198 | } 199 | 200 | 201 | /** 202 | * Main function 203 | */ 204 | int main(int argc, char **argv) 205 | { 206 | int err = FALSE; 207 | 208 | /* Create config */ 209 | config_init(&cfg); 210 | config_check(&cfg); 211 | 212 | config_set_string(&cfg, "features.vect_embed", "cnt"); 213 | config_set_string(&cfg, "generic.event_delim", "0"); 214 | config_set_int(&cfg, "features.ngram_len", 1); 215 | 216 | err |= test_static_add(); 217 | err |= test_stress_add(); 218 | err |= test_static_dot(); 219 | err |= test_stress_dot(); 220 | 221 | config_destroy(&cfg); 222 | return err; 223 | } 224 | -------------------------------------------------------------------------------- /tests/test_fvec.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. 
This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | */ 12 | 13 | #include "tests.h" 14 | #include "mconfig.h" 15 | #include "fvec.h" 16 | #include "ftable.h" 17 | 18 | /* Global variables */ 19 | int verbose = 5; 20 | config_t cfg; 21 | 22 | /* Test structure */ 23 | typedef struct { 24 | char *str; 25 | char *dlm; 26 | int nlen; 27 | int len; 28 | } test_t; 29 | 30 | /* Test array of strings */ 31 | test_t tests[] = { 32 | {" a:a a:a a:a a:a ", " ", 1, 1}, 33 | {" a:a a:b a:c a:d ", " ", 1, 4}, 34 | {" a:a b:c a:a b:c ", " :", 1, 3}, 35 | {" a:a a:b a:c a:d ", " :", 1, 4}, 36 | {" a:a a:a a:a a:a ", " ", 2, 1}, 37 | {" a:a a:b a:c a:d ", " ", 2, 3}, 38 | {" a:a a:a a:a a:a ", " :", 2, 1}, 39 | {" a:a a:a a:a a:a ", "", 1, 3}, 40 | {" a:a a:b a:c a:d ", "", 1, 6}, 41 | {" a:a a:a a:a a:a ", "", 2, 4}, 42 | {NULL, NULL, 0} 43 | }; 44 | 45 | /* Test file */ 46 | #define TEST_FILE "test.fv" 47 | /* Number of stress runs */ 48 | #define STRESS_RUNS 2000 49 | /* String length */ 50 | #define STR_LENGTH 1024 51 | 52 | /* 53 | * A simple static test for the feature vectors 54 | */ 55 | int test_static() 56 | { 57 | int i, err = 0; 58 | fvec_t *f; 59 | 60 | test_printf("Extraction of feature vectors"); 61 | 62 | for (i = 0; tests[i].str; i++) { 63 | fvec_reset_delim(); 64 | config_set_string(&cfg, "generic.event_delim", tests[i].dlm); 65 | config_set_int(&cfg, "features.ngram_len", tests[i].nlen); 66 | 67 | /* Extract features */ 68 | f = fvec_extract(tests[i].str, strlen(tests[i].str), "test"); 69 | 70 | /* Check for correct number of dimensions */ 71 | if (f->len != tests[i].len) { 72 | test_error("(%d) len %d != %d", i, f->len, tests[i].len); 73 | err++; 74 | } 75 | 76 | fvec_destroy(f); 77 | } 78 | 79 | test_return(err, i); 80 | return err; 81 | } 82 | 83 | /* 84 | * A simple stress test for the feature table 85 | */ 86 | int test_stress() 87 | { 88 | int i, j, err = 0; 89 | fvec_t *f; 90 | char 
buf[STR_LENGTH + 1]; 91 | 92 | test_printf("Stress test for feature vectors"); 93 | 94 | config_set_string(&cfg, "generic.event_delim", "0"); 95 | 96 | ftable_init(); 97 | 98 | for (i = 0; i < STRESS_RUNS; i++) { 99 | config_set_int(&cfg, "features.ngram_len", rand() % 10 + 1); 100 | 101 | /* Create random key and string */ 102 | for (j = 0; j < STR_LENGTH; j++) 103 | buf[j] = rand() % 10 + '0'; 104 | buf[j] = 0; 105 | 106 | /* Extract features */ 107 | f = fvec_extract(buf, strlen(buf), "test"); 108 | /* Destroy features */ 109 | fvec_destroy(f); 110 | } 111 | 112 | ftable_destroy(); 113 | 114 | test_return(err, STRESS_RUNS); 115 | return err; 116 | } 117 | 118 | /* 119 | * A simple stress test for the feature table 120 | */ 121 | int test_stress_omp() 122 | { 123 | int i, j, err = 0; 124 | fvec_t *f; 125 | char buf[STR_LENGTH + 1]; 126 | 127 | test_printf("Stress test for feature vectors (OpenMP)"); 128 | 129 | config_set_string(&cfg, "generic.event_delim", "0"); 130 | 131 | ftable_init(); 132 | 133 | #pragma omp parallel for 134 | for (i = 0; i < STRESS_RUNS; i++) { 135 | config_set_int(&cfg, "features.ngram_len", rand() % 10 + 1); 136 | 137 | /* Create random key and string */ 138 | for (j = 0; j < STR_LENGTH; j++) 139 | buf[j] = rand() % 10 + '0'; 140 | buf[j] = 0; 141 | 142 | /* Extract features */ 143 | f = fvec_extract(buf, strlen(buf), "test"); 144 | /* Destroy features */ 145 | fvec_destroy(f); 146 | } 147 | 148 | ftable_destroy(); 149 | 150 | test_return(err, STRESS_RUNS); 151 | return err; 152 | } 153 | 154 | 155 | /* 156 | * A simple load and save test case 157 | */ 158 | int test_load_save() 159 | { 160 | int i, j, err = 0; 161 | fvec_t *f, *g; 162 | gzFile z; 163 | 164 | test_printf("Loading and saving of feature vectors"); 165 | 166 | fvec_reset_delim(); 167 | config_set_string(&cfg, "generic.event_delim", " "); 168 | config_set_int(&cfg, "features.ngram_len", 2); 169 | 170 | /* Create and save feature vectors */ 171 | z = gzopen(TEST_FILE, "wb9"); 
172 | if (!z) { 173 | printf("Could not create file (ignoring)\n"); 174 | return FALSE; 175 | } 176 | 177 | for (i = 0; tests[i].str; i++) { 178 | f = fvec_extract(tests[i].str, strlen(tests[i].str), "test"); 179 | fvec_save(f, z); 180 | fvec_destroy(f); 181 | } 182 | gzclose(z); 183 | 184 | 185 | /* Load and compare feature vectors */ 186 | z = gzopen(TEST_FILE, "r"); 187 | 188 | for (i = 0; tests[i].str; i++) { 189 | f = fvec_extract(tests[i].str, strlen(tests[i].str), "test"); 190 | g = fvec_load(z); 191 | 192 | /* Check dimensions and values */ 193 | for (j = 0; j < f->len && j < g->len; j++) { 194 | if (f->dim[j] != g->dim[j]) { 195 | test_error("(%d) f->dim[%d] != g->dim[%d]", i, j, j); 196 | break; 197 | } 198 | if (fabs(f->val[j] - g->val[j]) > 10e-10) { 199 | test_error("(%d) f->val[%d] != g->val[%d]", i, j, j); 200 | break; 201 | } 202 | } 203 | err += (j < f->len || j < g->len); 204 | 205 | fvec_destroy(f); 206 | fvec_destroy(g); 207 | } 208 | 209 | gzclose(z); 210 | unlink(TEST_FILE); 211 | 212 | test_return(err, i); 213 | return err; 214 | } 215 | 216 | /** 217 | * Main function 218 | */ 219 | int main(int argc, char **argv) 220 | { 221 | int err = FALSE; 222 | 223 | /* Create config */ 224 | config_init(&cfg); 225 | config_check(&cfg); 226 | 227 | err |= test_static(); 228 | err |= test_stress(); 229 | err |= test_stress_omp(); 230 | err |= test_load_save(); 231 | 232 | config_destroy(&cfg); 233 | return err; 234 | } 235 | -------------------------------------------------------------------------------- /tests/test_cluster.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | 
* Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | */ 12 | 13 | #include "tests.h" 14 | #include "mconfig.h" 15 | #include "farray.h" 16 | #include "ftable.h" 17 | #include "cluster.h" 18 | #include "fmath.h" 19 | 20 | /* Global variables */ 21 | config_t cfg; 22 | int verbose = 0; 23 | 24 | /* Number of stress runs */ 25 | #define STRESS_RUNS 20 26 | /* String length */ 27 | #define STR_LENGTH 500 28 | /* Number of vector */ 29 | #define NUM_VECTORS 500 30 | /* Number of elements in test data */ 31 | #define DATA_LEN 15 32 | /* Number of correct prototypes */ 33 | #define DATA_CLUSTER 5 34 | 35 | /* Data set for clustering */ 36 | static char *test_data[] = { 37 | "XX YY XX YY XX 11", "XX YY XX ZZ XX 66", "XX ZZ XX YY XX YY", 38 | "AA BB AA BB AA 22", "AA BB AA CC AA 77", "AA CC AA BB AA BB", 39 | "MM NN MM NN MM 33", "MM NN MM OO MM 88", "MM OO MM NN MM NN", 40 | "UU VV UU VV UU 44", "UU VV UU WW UU 99", "UU WW UU VV UU VV", 41 | "RR SS RR SS RR 55", "RR SS RR TT RR 00", "RR TT RR SS RR SS" 42 | }; 43 | 44 | /** 45 | * Test clustering 46 | */ 47 | int test_cluster_complete() 48 | { 49 | int i, j, k, err = 0; 50 | 51 | test_printf("Clustering using prototypes (complete)"); 52 | 53 | /* Prepare test data */ ; 54 | farray_t *fa = farray_create("test"); 55 | for (i = 0; i < DATA_LEN; i++) { 56 | fvec_t *f = fvec_extract(test_data[i], strlen(test_data[i]), NULL); 57 | farray_add(fa, f, "test"); 58 | } 59 | 60 | /* Get clustering */ 61 | config_set_string(&cfg, "cluster.link_mode", "complete"); 62 | cluster_t *c = cluster_linkage(fa, 0); 63 | 64 | /* Check number of clusters */ 65 | err += (c->num != DATA_CLUSTER); 66 | 67 | /* Check match of clusters */ 68 | for (k = 0; k < DATA_LEN; k += DATA_LEN / DATA_CLUSTER) 69 | for (j = 0; j < DATA_LEN / DATA_CLUSTER - 1; j++) 70 | err += c->cluster[k + j] != 
c->cluster[k + j + 1]; 71 | 72 | /* Clean up */ 73 | cluster_destroy(c); 74 | farray_destroy(fa); 75 | 76 | test_return(err, 1 + DATA_CLUSTER * (DATA_LEN / DATA_CLUSTER - 1)); 77 | return err; 78 | } 79 | 80 | /** 81 | * Test clustering 82 | */ 83 | int test_cluster_single() 84 | { 85 | int i, j, k, err = 0; 86 | 87 | test_printf("Clustering using prototypes (single)"); 88 | 89 | /* Prepare test data */ ; 90 | farray_t *fa = farray_create("test"); 91 | for (i = 0; i < DATA_LEN; i++) { 92 | fvec_t *f = fvec_extract(test_data[i], strlen(test_data[i]), NULL); 93 | farray_add(fa, f, "test"); 94 | } 95 | 96 | /* Get clustering */ 97 | config_set_string(&cfg, "cluster.link_mode", "single"); 98 | cluster_t *c = cluster_linkage(fa, 0); 99 | 100 | /* Check number of clusters */ 101 | err += (c->num != DATA_CLUSTER); 102 | 103 | /* Check position of prototypes */ 104 | for (k = 0; k < DATA_LEN; k += DATA_LEN / DATA_CLUSTER) 105 | for (j = 0; j < DATA_LEN / DATA_CLUSTER - 1; j++) 106 | err += c->cluster[k + j] != c->cluster[k + j + 1]; 107 | 108 | /* Clean up */ 109 | cluster_destroy(c); 110 | farray_destroy(fa); 111 | 112 | test_return(err, 1 + DATA_CLUSTER * (DATA_LEN / DATA_CLUSTER - 1)); 113 | return err; 114 | } 115 | 116 | /** 117 | * Test clustering 118 | */ 119 | int test_cluster_average() 120 | { 121 | int i, j, k, err = 0; 122 | 123 | test_printf("Clustering using prototypes (average)"); 124 | 125 | /* Prepare test data */ ; 126 | farray_t *fa = farray_create("test"); 127 | for (i = 0; i < DATA_LEN; i++) { 128 | fvec_t *f = fvec_extract(test_data[i], strlen(test_data[i]), NULL); 129 | farray_add(fa, f, "test"); 130 | } 131 | 132 | /* Get clustering */ 133 | config_set_string(&cfg, "cluster.link_mode", "average"); 134 | cluster_t *c = cluster_linkage(fa, 0); 135 | 136 | /* Check number of clusters */ 137 | err += (c->num != DATA_CLUSTER); 138 | 139 | /* Check position of prototypes */ 140 | for (k = 0; k < DATA_LEN; k += DATA_LEN / DATA_CLUSTER) 141 | for (j = 0; j < 
DATA_LEN / DATA_CLUSTER - 1; j++) 142 | err += c->cluster[k + j] != c->cluster[k + j + 1]; 143 | 144 | /* Clean up */ 145 | cluster_destroy(c); 146 | farray_destroy(fa); 147 | 148 | test_return(err, 1 + DATA_CLUSTER * (DATA_LEN / DATA_CLUSTER - 1)); 149 | return err; 150 | } 151 | 152 | /* 153 | * A simple stress test for clustering 154 | */ 155 | int test_stress() 156 | { 157 | int i, j, k, err = 0; 158 | fvec_t *f; 159 | farray_t *fa; 160 | char buf[STR_LENGTH + 1], label[32]; 161 | 162 | test_printf("Stress test for clustering"); 163 | 164 | for (i = 0; i < STRESS_RUNS; i++) { 165 | /* Create array */ 166 | fa = farray_create("test"); 167 | 168 | for (j = 0; j < NUM_VECTORS; j++) { 169 | for (k = 0; k < STR_LENGTH; k++) 170 | buf[k] = rand() % 10 + '0'; 171 | buf[k] = 0; 172 | 173 | /* Extract features */ 174 | f = fvec_extract(buf, strlen(buf), "test"); 175 | snprintf(label, 32, "label%.2d", rand() % 10); 176 | 177 | /* Add to array */ 178 | farray_add(fa, f, label); 179 | } 180 | 181 | /* Extract prototypes */ 182 | cluster_t *c = cluster_linkage(fa, 0); 183 | 184 | /* Destroy features */ 185 | cluster_destroy(c); 186 | farray_destroy(fa); 187 | } 188 | 189 | test_return(err, STRESS_RUNS); 190 | return err; 191 | } 192 | 193 | /** 194 | * Main function 195 | */ 196 | int main(int argc, char **argv) 197 | { 198 | int err = FALSE; 199 | 200 | /* Create config */ 201 | config_init(&cfg); 202 | config_check(&cfg); 203 | config_set_string(&cfg, "generic.event_delim", " "); 204 | config_set_string(&cfg, "features.vect_embed", "cnt"); 205 | 206 | ftable_init(); 207 | 208 | err |= test_cluster_complete(); 209 | err |= test_cluster_average(); 210 | err |= test_cluster_single(); 211 | err |= test_stress(); 212 | 213 | ftable_destroy(); 214 | 215 | config_destroy(&cfg); 216 | return err; 217 | } 218 | -------------------------------------------------------------------------------- /src/ftable.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | /** 15 | * @defgroup ftable Lookup table for features 16 | * Lookup table for extracted features. The extracted feature, such as 17 | * substrings and n-grams, a stored in sparse feature vectors with each 18 | * feature represented by hash value. This global hash table is used 19 | * to efficiently lookup the original feature for a given hash. The table 20 | * keeps also track of counting insertions and collisions, such that 21 | * the quality of the hashing can be assessed. Maintaining a global hash 22 | * table impacts performance when using OpenMP, thus it is adviced to 23 | * disable the table if not required. 24 | * @author Konrad Rieck 25 | * @{ 26 | */ 27 | 28 | #include "config.h" 29 | #include "common.h" 30 | #include "fvec.h" 31 | #include "ftable.h" 32 | #include "util.h" 33 | 34 | /* Hash table */ 35 | static fentry_t *feature_table = NULL; 36 | static int table_enabled = FALSE; 37 | static unsigned long collisions = 0; 38 | static unsigned long insertions = 0; 39 | 40 | /* External variables */ 41 | extern int verbose; 42 | extern config_t cfg; 43 | 44 | /** 45 | * Add a feature and its key to the lookup table. The function clones 46 | * all input arguments, that is new memory is allocated and the data 47 | * is copied. This memory is free'd when destroy the feature table. 
48 | * @param k Key for feature 49 | * @param x Data of feature 50 | * @param l Length of feature 51 | */ 52 | void ftable_put(feat_t k, char *x, int l) 53 | { 54 | assert(x && l > 0); 55 | fentry_t *g, *h; 56 | 57 | if (!table_enabled) 58 | return; 59 | 60 | /* Check for duplicate */ 61 | HASH_FIND(hh, feature_table, &k, sizeof(feat_t), g); 62 | 63 | /* Check for collision */ 64 | if (g) { 65 | if (l != g->len || memcmp(x, g->data, l)) 66 | collisions++; 67 | return; 68 | } 69 | 70 | /* Build new entry */ 71 | h = malloc(sizeof(fentry_t)); 72 | h->len = l; 73 | h->key = k; 74 | h->data = malloc(l); 75 | if (h->data) 76 | memcpy(h->data, x, l); 77 | else 78 | error("Could not allocate feature data"); 79 | 80 | /* Add to hash and count insertion */ 81 | HASH_ADD(hh, feature_table, key, sizeof(feat_t), h); 82 | insertions++; 83 | } 84 | 85 | /** 86 | * Gets an entry from the lookup table. The returned memory must not 87 | * be free'd. 88 | * @param key Feature key 89 | * @return feature table entry 90 | */ 91 | fentry_t *ftable_get(feat_t key) 92 | { 93 | fentry_t *f; 94 | HASH_FIND(hh, feature_table, &key, sizeof(feat_t), f); 95 | return f; 96 | } 97 | 98 | /** 99 | * Initialize the feature lookup table. 100 | */ 101 | void ftable_init() 102 | { 103 | if (table_enabled) 104 | ftable_destroy(); 105 | 106 | table_enabled = TRUE; 107 | collisions = 0; 108 | insertions = 0; 109 | } 110 | 111 | /** 112 | * Destroy the feature lookup table. 113 | */ 114 | void ftable_destroy() 115 | { 116 | if (!feature_table) 117 | return; 118 | 119 | fentry_t *f; 120 | 121 | while (feature_table) { 122 | f = feature_table; 123 | HASH_DEL(feature_table, f); 124 | free(f->data); 125 | free(f); 126 | } 127 | 128 | table_enabled = FALSE; 129 | collisions = 0; 130 | insertions = 0; 131 | } 132 | 133 | /** 134 | * Removes an element from the lookup hash. 
135 | * @param key Feature to remove 136 | */ 137 | void ftable_remove(feat_t key) 138 | { 139 | fentry_t *f; 140 | 141 | /* Find element */ 142 | HASH_FIND(hh, feature_table, &key, sizeof(feat_t), f); 143 | if (!f) 144 | return; 145 | 146 | /* Remove */ 147 | HASH_DEL(feature_table, f); 148 | } 149 | 150 | 151 | /** 152 | * Print the feature lookup table. 153 | */ 154 | void ftable_print() 155 | { 156 | fentry_t *f; 157 | int i; 158 | 159 | if (!table_enabled) 160 | return; 161 | 162 | printf("feature table [size: %lu, puts: %lu, colls: %lu (%g%%), %p]\n", 163 | ftable_size(), insertions, collisions, 164 | (collisions * 100.0) / insertions, (void *) feature_table); 165 | 166 | if (verbose < 3) 167 | return; 168 | 169 | for (f = feature_table; f != NULL; f = f->hh.next) { 170 | printf(" 0x%.16llx: ", (long long unsigned int) f->key); 171 | 172 | for (i = 0; i < f->len; i++) { 173 | if (isprint(f->data[i]) && f->data[i] != '%') 174 | printf("%c", f->data[i]); 175 | else 176 | printf("%%%.2x", f->data[i]); 177 | } 178 | printf("\n"); 179 | } 180 | } 181 | 182 | /** 183 | * Returns the size of the feature lookup table. 184 | * @return size of table 185 | */ 186 | unsigned long ftable_size() 187 | { 188 | return HASH_COUNT(feature_table); 189 | } 190 | 191 | /** 192 | * Saves a feature table to a file stream. 
193 | * @param z Stream pointer 194 | */ 195 | void ftable_save(gzFile z) 196 | { 197 | fentry_t *f; 198 | int i; 199 | 200 | gzprintf(z, "feature table: len=%lu\n", HASH_COUNT(feature_table)); 201 | for (f = feature_table; f != NULL; f = f->hh.next) { 202 | gzprintf(z, " %.16llx: ", (long long unsigned int) f->key); 203 | for (i = 0; i < f->len; i++) { 204 | if (isprint(f->data[i]) || f->data[i] == '%') 205 | gzprintf(z, "%c", f->data[i]); 206 | else 207 | gzprintf(z, "%%%.2x", f->data[i]); 208 | } 209 | gzprintf(z, "\n"); 210 | } 211 | } 212 | 213 | /** 214 | * Loads a feature table from a file stream (Not synchronized) 215 | * @param z Stream pointer 216 | */ 217 | void ftable_load(gzFile z) 218 | { 219 | int i, r; 220 | unsigned long len; 221 | char buf[512], str[512]; 222 | feat_t key; 223 | 224 | gzgets(z, buf, 512); 225 | r = sscanf(buf, "feature table: len=%lu\n", (unsigned long *) &len); 226 | if (r != 1) { 227 | error("Could not parse feature table"); 228 | return; 229 | } 230 | 231 | for (i = 0; i < len; i++) { 232 | gzgets(z, buf, 512); 233 | r = sscanf(buf, " %llx:%511s\n", (unsigned long long *) &key, 234 | (char *) str); 235 | if (r != 2) { 236 | error("Could not parse feature table contents"); 237 | return; 238 | } 239 | 240 | /* Decode string */ 241 | r = decode_string(str); 242 | 243 | /* Put string to table */ 244 | ftable_put(key, str, r); 245 | } 246 | } 247 | 248 | /** 249 | * Returns true if the feature table is enabled 250 | * @return true if enabled false otherwise 251 | */ 252 | int ftable_enabled() 253 | { 254 | return table_enabled; 255 | } 256 | 257 | /** @} */ 258 | -------------------------------------------------------------------------------- /src/mconfig.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 
| * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | /** 15 | * @defgroup mconfig Configuration functions 16 | * Functions for configuration of the Malheur tool. Additionally default 17 | * values for each configruation parameter are specified in this module. 18 | * @author Konrad Rieck 19 | * @{ 20 | */ 21 | 22 | #include "config.h" 23 | #include "common.h" 24 | #include "util.h" 25 | #include "mconfig.h" 26 | 27 | /* Macros to make config lines shorter */ 28 | #define G "generic" 29 | #define F "features" 30 | #define P "prototypes" 31 | #define C "cluster" 32 | #define Y "classify" 33 | 34 | /* Default configuration */ 35 | static config_default_t defaults[] = { 36 | /* Input */ 37 | {G, "input_format", CONFIG_TYPE_STRING, {.str = "text"}}, 38 | {G, "event_delim", CONFIG_TYPE_STRING, {.str = "%0a%0d"}}, 39 | {G, "state_dir", CONFIG_TYPE_STRING, {.str = "/var/tmp/malheur"}}, 40 | {G, "output_file", CONFIG_TYPE_STRING, {.str = "malheur.out"}}, 41 | 42 | /* Features */ 43 | {F, "ngram_len", CONFIG_TYPE_INT, {.num = 2}}, 44 | {F, "vect_embed", CONFIG_TYPE_STRING, {.str = "bin"}}, 45 | {F, "mist_level", CONFIG_TYPE_INT, {.num = 1}}, 46 | {F, "hash_seed1", CONFIG_TYPE_INT, {.num = 0x1ea4501a}}, 47 | {F, "hash_seed2", CONFIG_TYPE_INT, {.num = 0x75f3da43}}, 48 | 49 | /* Prototypes */ 50 | {P, "max_dist", CONFIG_TYPE_FLOAT, {.flt = 0.65}}, 51 | {P, "max_num", CONFIG_TYPE_INT, {.num = 0}}, 52 | 53 | /* Classification */ 54 | {Y, "max_dist", CONFIG_TYPE_FLOAT, {.flt = 0.68}}, 55 | 56 | /* Clustering */ 57 | {C, "link_mode", CONFIG_TYPE_STRING, {.str = "complete"}}, 58 | {C, "min_dist", CONFIG_TYPE_FLOAT, {.flt = 0.95}}, 59 | {C, 
"reject_num", CONFIG_TYPE_INT, {.num = 10}}, 60 | {C, "shared_ngrams", CONFIG_TYPE_FLOAT, {.flt = 0.0}}, 61 | 62 | /* Terminating entry */ 63 | {NULL} 64 | }; 65 | 66 | /** 67 | * Print a configuration setting. 68 | * @param f File stream to print to 69 | * @param cs Configuration setting 70 | * @param d Current depth. 71 | */ 72 | static void config_setting_fprint(FILE *f, config_setting_t * cs, int d) 73 | { 74 | assert(cs && d >= 0); 75 | 76 | int i; 77 | for (i = 0; i < d - 1; i++) 78 | fprintf(f, " "); 79 | 80 | char *n = config_setting_name(cs); 81 | 82 | switch (config_setting_type(cs)) { 83 | case CONFIG_TYPE_GROUP: 84 | if (d > 0) 85 | fprintf(f, "%s = {\n", n); 86 | 87 | for (i = 0; i < config_setting_length(cs); i++) 88 | config_setting_fprint(f, config_setting_get_elem(cs, i), d + 1); 89 | 90 | if (d > 0) { 91 | for (i = 0; i < d - 1; i++) 92 | fprintf(f, " "); 93 | fprintf(f, "};\n\n"); 94 | } 95 | break; 96 | case CONFIG_TYPE_STRING: 97 | fprintf(f, "%s\t= \"%s\";\n", n, config_setting_get_string(cs)); 98 | break; 99 | case CONFIG_TYPE_FLOAT: 100 | fprintf(f, "%s\t= %7.5f;\n", n, config_setting_get_float(cs)); 101 | break; 102 | case CONFIG_TYPE_INT: 103 | fprintf(f, "%s\t= %ld;\n", n, (long) config_setting_get_int(cs)); 104 | break; 105 | case CONFIG_TYPE_BOOL: 106 | fprintf(f, "%s\t= %s;\n", n, config_setting_get_bool(cs) 107 | ? "true" : "false"); 108 | break; 109 | default: 110 | error("Unsupported type for configuration setting '%s'", n); 111 | break; 112 | } 113 | } 114 | 115 | /** 116 | * Print the configuration. 117 | * @param cfg configuration 118 | */ 119 | void config_print(config_t *cfg) 120 | { 121 | config_setting_fprint(stdout, config_root_setting(cfg), 0); 122 | } 123 | 124 | /** 125 | * Print the configuration to a file. 
126 | * @param f pointer to file stream 127 | * @param cfg configuration 128 | */ 129 | void config_fprint(FILE *f, config_t *cfg) 130 | { 131 | config_setting_fprint(f, config_root_setting(cfg), 0); 132 | } 133 | 134 | /** 135 | * The functions add default values to unspecified parameters. 136 | * @param cfg configuration 137 | */ 138 | static void config_default(config_t *cfg) 139 | { 140 | int i, b; 141 | cfg_int j; 142 | const char *s; 143 | double f; 144 | config_setting_t *cs = NULL, *vs; 145 | char *token, *string, *tofree; 146 | 147 | for (i = 0; defaults[i].name; i++) { 148 | /* Lookup and create setting group */ 149 | tofree = string = strdup(defaults[i].group); 150 | vs = config_root_setting(cfg); 151 | while ((token = strsep(&string, ".")) != NULL) { 152 | cs = config_setting_get_member(vs, token); 153 | if (!cs) 154 | cs = config_setting_add(vs, token, CONFIG_TYPE_GROUP); 155 | vs = cs; 156 | } 157 | free(tofree); 158 | 159 | switch (defaults[i].type) { 160 | case CONFIG_TYPE_STRING: 161 | if (config_setting_lookup_string(cs, defaults[i].name, &s)) 162 | continue; 163 | 164 | /* Add default value */ 165 | config_setting_remove(cs, defaults[i].name); 166 | vs = config_setting_add(cs, defaults[i].name, 167 | CONFIG_TYPE_STRING); 168 | config_setting_set_string(vs, defaults[i].val.str); 169 | break; 170 | case CONFIG_TYPE_FLOAT: 171 | if (config_setting_lookup_float(cs, defaults[i].name, &f)) 172 | continue; 173 | 174 | /* Check for mis-interpreted integer */ 175 | if (config_setting_lookup_int(cs, defaults[i].name, &j)) { 176 | config_setting_remove(cs, defaults[i].name); 177 | vs = config_setting_add(cs, defaults[i].name, 178 | CONFIG_TYPE_FLOAT); 179 | config_setting_set_float(vs, (double) j); 180 | continue; 181 | } 182 | 183 | /* Add default value */ 184 | config_setting_remove(cs, defaults[i].name); 185 | vs = config_setting_add(cs, defaults[i].name, CONFIG_TYPE_FLOAT); 186 | config_setting_set_float(vs, defaults[i].val.flt); 187 | break; 188 | case 
CONFIG_TYPE_INT: 189 | if (config_setting_lookup_int(cs, defaults[i].name, &j)) 190 | continue; 191 | 192 | /* Check for mis-interpreted float */ 193 | if (config_setting_lookup_float(cs, defaults[i].name, &f)) { 194 | config_setting_remove(cs, defaults[i].name); 195 | vs = config_setting_add(cs, defaults[i].name, 196 | CONFIG_TYPE_INT); 197 | config_setting_set_int(vs, (long) round(f)); 198 | continue; 199 | } 200 | 201 | /* Add default value */ 202 | config_setting_remove(cs, defaults[i].name); 203 | vs = config_setting_add(cs, defaults[i].name, CONFIG_TYPE_INT); 204 | config_setting_set_int(vs, defaults[i].val.num); 205 | break; 206 | case CONFIG_TYPE_BOOL: 207 | if (config_setting_lookup_bool(cs, defaults[i].name, &b)) 208 | continue; 209 | 210 | /* Check for mis-interpreted integer */ 211 | if (config_setting_lookup_int(cs, defaults[i].name, &j)) { 212 | config_setting_remove(cs, defaults[i].name); 213 | vs = config_setting_add(cs, defaults[i].name, 214 | CONFIG_TYPE_BOOL); 215 | config_setting_set_bool(vs, 216 | j == 0 ? CONFIG_FALSE : CONFIG_TRUE); 217 | continue; 218 | } 219 | 220 | /* Add default value */ 221 | config_setting_remove(cs, defaults[i].name); 222 | vs = config_setting_add(cs, defaults[i].name, CONFIG_TYPE_BOOL); 223 | config_setting_set_bool(vs, defaults[i].val.num); 224 | break; 225 | 226 | } 227 | } 228 | } 229 | 230 | /** 231 | * Checks if the configuration is valid and sane. 
232 | * @return 1 if config is valid, 0 otherwise 233 | */ 234 | int config_check(config_t *cfg) 235 | { 236 | int num; 237 | 238 | /* Add default values where missing */ 239 | config_default(cfg); 240 | 241 | config_lookup_int(cfg, "features.ngram_len", &num); 242 | if (num < 1) { 243 | error("N-gram length needs to be > 0"); 244 | return FALSE; 245 | } 246 | 247 | config_lookup_int(cfg, "features.mist_level", &num); 248 | if (num < 1) { 249 | error("MIST level needs to be > 0"); 250 | return FALSE; 251 | } 252 | 253 | return TRUE; 254 | } 255 | 256 | /** @} */ 257 | -------------------------------------------------------------------------------- /src/export.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MALHEUR - Automatic Analysis of Malware Behavior 3 | * Copyright (c) 2009-2015 Konrad Rieck (konrad@mlsec.org) 4 | * University of Goettingen, Berlin Institute of Technology 5 | * -- 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 3 of the License, or (at your 9 | * option) any later version. This program is distributed without any 10 | * warranty. See the GNU General Public License for more details. 11 | * -- 12 | */ 13 | 14 | /** 15 | * @defgroup export Export functions 16 | * The module contains functions for exporting data computed by 17 | * Malheur to external format such as plain text and HTML documents. 
18 | * @author Konrad Rieck 19 | * @{ 20 | */ 21 | #include "config.h" 22 | #include "common.h" 23 | #include "farray.h" 24 | #include "util.h" 25 | #include "export.h" 26 | #include "mconfig.h" 27 | #include "quality.h" 28 | #include "fmath.h" 29 | #include "ftable.h" 30 | 31 | /* External variables */ 32 | extern int verbose; 33 | extern config_t cfg; 34 | 35 | /** 36 | * Exports a distance matrix to a text file 37 | * @param d Pointer to matrix 38 | * @param fa Feature vector array 39 | * @param file File name 40 | */ 41 | void export_dist(double *d, farray_t *fa, const char *file) 42 | { 43 | assert(d && fa && file); 44 | int i, j; 45 | FILE *f; 46 | 47 | if (verbose > 0) 48 | printf("Exporting distance matrix to '%s'.\n", file); 49 | 50 | if (!(f = fopen(file, "w"))) { 51 | error("Could not create file '%s'.", file); 52 | return; 53 | } 54 | 55 | /* Print version header */ 56 | malheur_version(f); 57 | 58 | /* Print distance header */ 59 | fprintf(f, "# ---\n# Distance matrix for %s\n", fa->src); 60 | fprintf(f, "# Matrix size: %lu x %lu\n# ---\n", fa->len, fa->len); 61 | fprintf(f, "# ... 
\n"); 62 | 63 | /* Print matrix */ 64 | for (i = 0; i < fa->len; i++) { 65 | fprintf(f, "%s %s ", fa->x[i]->src, farray_get_label(fa, i)); 66 | for (j = 0; j < fa->len; j++) 67 | fprintf(f, "%g ", d[i * fa->len + j]); 68 | fprintf(f, "\n"); 69 | } 70 | 71 | fclose(f); 72 | } 73 | 74 | /** 75 | * Exports a structure of prototypes to a text file 76 | * @param pr Prototype structure 77 | * @param fa Feature vector array 78 | * @param as Assignments to protoypes 79 | * @param file File name 80 | */ 81 | void export_proto(farray_t *pr, farray_t *fa, assign_t *as, const char *file) 82 | { 83 | assert(pr && fa && file); 84 | int i, j; 85 | FILE *f; 86 | 87 | if (verbose > 0) 88 | printf("Exporting prototypes to '%s'.\n", file); 89 | 90 | if (!(f = fopen(file, "w"))) { 91 | error("Could not create file '%s'.", file); 92 | return; 93 | } 94 | 95 | /* Print version header */ 96 | malheur_version(f); 97 | 98 | /* Evaluate some quality functions */ 99 | double *e = quality(fa->y, as->proto, as->len); 100 | 101 | /* Print prototype header */ 102 | fprintf(f, "# ---\n# Prototypes for %s\n", fa->src); 103 | fprintf(f, "# Number of prototypes: %lu\n", pr->len); 104 | fprintf(f, "# Compression of prototypes: %4.1f %%\n", 105 | pr->len * 100.0 / (double) fa->len); 106 | fprintf(f, "# Precision of prototypes: %4.1f %%\n", 107 | e[Q_PRECISION] * 100.0); 108 | fprintf(f, "# ---\n# \n"); 109 | 110 | for (i = 0; i < fa->len; i++) { 111 | j = as->proto[i]; 112 | fprintf(f, "%s %s %g\n", fa->x[i]->src, pr->x[j]->src, as->dist[i]); 113 | } 114 | 115 | fclose(f); 116 | } 117 | 118 | /** 119 | * Exports a clustering structure to a text file 120 | * @param c Clustering structure 121 | * @param fa Feature vector array 122 | * @param p Prototype struture 123 | * @param a Assignments of prototypes 124 | * @param file File name 125 | */ 126 | void export_cluster(cluster_t *c, farray_t *p, farray_t *fa, assign_t *a, 127 | const char *file) 128 | { 129 | assert(c && fa && file); 130 | FILE *f; 131 | 
int i, j; 132 | 133 | if (verbose > 0) 134 | printf("Exporting clusters to '%s'.\n", file); 135 | 136 | if (!(f = fopen(file, "w"))) { 137 | error("Could not create file '%s'.", file); 138 | return; 139 | } 140 | 141 | /* Print version header */ 142 | malheur_version(f); 143 | 144 | /* Evaluate some quality functions */ 145 | double *e = quality(fa->y, c->cluster, c->len); 146 | 147 | /* Print prototype header */ 148 | fprintf(f, "# ---\n# Clusters for %s\n", fa->src); 149 | fprintf(f, "# Number of cluster: %lu\n", c->num); 150 | fprintf(f, "# Precision of clusters: %4.1f %%\n", 151 | e[Q_PRECISION] * 100.0); 152 | fprintf(f, "# Recall of clusters: %4.1f %%\n", e[Q_RECALL] * 100.0); 153 | fprintf(f, "# F-measure of clusters: %4.1f %%\n", e[Q_FMEASURE] * 100.0); 154 | fprintf(f, "# ---\n# \n"); 155 | 156 | for (i = 0; i < fa->len; i++) { 157 | j = a->proto[i]; 158 | fprintf(f, "%s %s %s %g\n", fa->x[i]->src, cluster_get_name(c, i), 159 | p->x[j]->src, a->dist[i]); 160 | } 161 | 162 | fclose(f); 163 | } 164 | 165 | 166 | /** 167 | * Exports classification results 168 | * @param p Prototype structure 169 | * @param fa Feature vector array 170 | * @param as Assignments to protoypes 171 | * @param file File name 172 | */ 173 | void export_class(farray_t *p, farray_t *fa, assign_t *as, const char *file) 174 | { 175 | assert(p && fa && file); 176 | int i, j; 177 | char *l; 178 | FILE *f; 179 | 180 | if (verbose > 0) 181 | printf("Exporting classification to '%s'.\n", file); 182 | 183 | if (!(f = fopen(file, "w"))) { 184 | error("Could not create file '%s'.", file); 185 | return; 186 | } 187 | 188 | /* Print version header */ 189 | malheur_version(f); 190 | 191 | /* Evaluate some quality functions */ 192 | double *e = quality(fa->y, as->label, as->len); 193 | 194 | /* Print prototype header */ 195 | fprintf(f, "# ---\n# Classification for %s\n", fa->src); 196 | fprintf(f, "# Precision of classification: %4.1f %%\n", 197 | e[Q_PRECISION] * 100.0); 198 | fprintf(f, "# Recall 
of classification: %4.1f %%\n", 199 | e[Q_RECALL] * 100.0); 200 | fprintf(f, "# F-measure of classification: %4.1f %%\n", 201 | e[Q_FMEASURE] * 100.0); 202 | fprintf(f, "# ---\n#