├── .gitignore ├── LICENSE ├── README.md ├── ROC.png └── dga_classifier.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 ENDGAME 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SANS_THIR16 2 | SANS Hunting on the Cheap 3 | 4 | ##DGA Classifier 5 | RandomForest using domain features like vowel_to_consonant_ratio, longest_consonant_sequence, ngrams from dictionary and alexa top domains. 6 | 7 | ###ROC Curve 8 | ![roc](ROC.png) 9 | 10 | 11 | -------------------------------------------------------------------------------- /ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/endgameinc/SANS_THIR16/8b698f0450d51b40ed36fc5b4ab8e19344ab765a/ROC.png -------------------------------------------------------------------------------- /dga_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.feature_extraction 3 | import sklearn.ensemble 4 | import pandas as pd 5 | import matplotlib 6 | import tldextract 7 | import math 8 | from collections import Counter 9 | import pickle 10 | import json 11 | import sys 12 | from functools import partial 13 | from sklearn.cross_validation import train_test_split 14 | from sklearn.metrics import roc_curve, roc_auc_score, auc 15 | import argparse 16 | 17 | #http://s3.amazonaws.com/alexa-static/top-1m.csv.zip 18 | 
ALEXA_FILEPATH = 'top-1m.csv'
DICT_FILEPATH = '/usr/share/dict/words'
#http://osint.bambenekconsulting.com/feeds/dga-feed.txt
DGA_FILEPATH = 'dga-feed.txt'
CLASSIFIER_STORAGE = 'dga_classifier.pickle'


def get_domain(hostname):
    """Return the `domain` component tldextract reports for *hostname*.

    If tldextract raises ValueError, a message is printed and np.nan is
    returned so that callers can drop the row with `dropna()`.
    """
    try:
        return tldextract.extract(hostname).domain
    except ValueError:
        print('Error extracting domain from %s' % (hostname,))
        return np.nan


def get_subdomain(hostname):
    """Return the `subdomain` component tldextract reports for *hostname*.

    If tldextract raises ValueError, a message is printed and np.nan is
    returned so that callers can drop the row with `dropna()`.
    """
    try:
        return tldextract.extract(hostname).subdomain
    except ValueError:
        # The message used to say "domain" (copy/paste from get_domain).
        print('Error extracting subdomain from %s' % (hostname,))
        return np.nan


def entropy(s):
    """Return the Shannon entropy, in bits per symbol, of the sequence *s*.

    An empty sequence yields 0.
    """
    counts, total = Counter(s), float(len(s))
    return -sum(n / total * math.log(n / total, 2) for n in counts.values())


def longest_consonant_sequence(s):
    """Return the length of the longest run of non-vowel characters in *s*.

    Only the lowercase letters 'aeiou' are treated as vowels; every other
    character (digits, '-', '.', uppercase letters, ...) extends a run.
    """
    vowels = set('aeiou')
    longest = current = 0
    for c in s:
        if c in vowels:
            current = 0
        else:
            current += 1
            if current > longest:
                longest = current
    return longest


def vowel_consonant_ratio(s):
    """Return (#vowels / #consonants) for *s*, or np.nan without consonants.

    Lowercase 'aeiou' count as vowels, '.' is ignored, and every other
    character counts as a consonant.
    """
    vowels = set('aeiou')
    n_vowels = n_consonants = 0
    for c in s:
        if c in vowels:
            n_vowels += 1
        elif c != '.':
            n_consonants += 1
    return float(n_vowels) / n_consonants if n_consonants else np.nan


def strip_non_alpha(string):
    """Return *string* with ASCII digits and '-' removed.

    Works for text strings and byte strings on both Python 2 and Python 3;
    the result has the same type as the input.
    """
    delchars = '0123456789-'
    if isinstance(string, bytes):
        # Python 2 `str` or Python 3 `bytes`: two-argument translate deletes
        # the listed bytes.
        return string.translate(None, delchars.encode('ascii'))
    # Python 2 `unicode` or Python 3 `str`: mapping a code point to None
    # deletes it.
    return string.translate({ord(c): None for c in delchars})


def train_vectorizer(series):
    """Fit a character n-gram CountVectorizer on *series*.

    N-grams of length 3 to 5 are kept when they appear in at least 1e-4 of
    the documents.

    Returns:
        (vectorizer, counts) where counts[i] is log10 of the total number of
        occurrences of n-gram i across *series*.
    """
    alexa_cv = sklearn.feature_extraction.text.CountVectorizer(
        analyzer='char',
        ngram_range=(3, 5),
        min_df=1e-4,
        max_df=1.0)
    counts_matrix = alexa_cv.fit_transform(series)
    # np.asarray(...).ravel() flattens the 1 x n_features column sum whether
    # it comes back as an np.matrix or as an ndarray.
    alexa_counts = np.log10(np.asarray(counts_matrix.sum(axis=0)).ravel())
    return alexa_cv, alexa_counts


def calc_ngram_hits(df, cv, counts):
    """Score every row of df['domain_alpha_chars'] against a fitted vectorizer.

    The score of a domain is the sum, over the vocabulary n-grams it
    contains, of (occurrences in the domain) * counts[n-gram].

    Returns a 1-D array with one score per row of *df*.
    """
    # (n_rows x n_features) . (n_features,) -> (n_rows,); same product as the
    # former `counts * matrix.T`, written without relying on `*` meaning
    # matrix multiplication.
    return cv.transform(df['domain_alpha_chars']).dot(counts)


def cross_validate(fts, labels, clf, nfolds):
    """Estimate a ROC curve for *clf* from *nfolds* random 80/20 splits.

    For each fold the classifier is refit on the training part and scored on
    the held-out part; the scores of all folds are pooled into one curve.

    Returns:
        dict with keys 'fpr', 'tpr', 'thr' (outputs of roc_curve) and 'auc'.
        The AUC is also printed.
    """
    scores = []
    true_labels = []
    for _ in range(nfolds):
        X_train, X_test, y_train, y_test = train_test_split(fts, labels, test_size=.2)
        clf.fit(X_train, y_train)
        scores.append(clf.predict_proba(X_test)[:, 1])
        true_labels.append(y_test)
    ret = {}
    # concatenate pools the folds without needing them to stack into a
    # rectangular 2-D array.
    ret['fpr'], ret['tpr'], ret['thr'] = roc_curve(np.concatenate(true_labels),
                                                   np.concatenate(scores))
    ret['auc'] = auc(ret['fpr'], ret['tpr'])
    print(ret['auc'])
    return ret


def select_threshold(fpr, thresholds, max_fpr):
    """Return the threshold of the last ROC point whose FPR is below *max_fpr*.

    Raises:
        ValueError: if no point of the curve has a false positive rate
            strictly below *max_fpr*.
    """
    candidates = np.flatnonzero(np.asarray(fpr) < max_fpr)
    if candidates.size == 0:
        raise ValueError('No ROC operating point has a false positive rate '
                         'below %r' % (max_fpr,))
    return thresholds[candidates.max()]


def train(df, features, test_training=True, max_fpr=.05, nfolds = 10):
    """Compute the features on *df*, cross-validate and fit a random forest.

    Args:
        df: frame with the columns the feature functions read and a 'class'
            column whose value 'dga' marks the positive class. The feature
            columns are added to it in place.
        features: mapping of column name -> function(df) producing the column.
        test_training: unused; kept for backward compatibility.
        max_fpr: false positive rate the decision threshold must stay below.
        nfolds: number of random splits used by cross_validate.

    Returns:
        (clf, thr, validation_data): the classifier fit on all rows, the
        chosen probability threshold and the dict from cross_validate.
    """
    for feature, feature_func in features.items():
        df[feature] = feature_func(df)

    df = df.dropna()
    # Column order follows the iteration order of `features`; predict() uses
    # the same rule. `.values` replaces the deprecated DataFrame.as_matrix.
    X = df[list(features)].values
    # Make 0-1
    y = [label == 'dga' for label in df['class']]
    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5, n_jobs=-1)
    validation_data = cross_validate(X, y, clf, nfolds)
    clf.fit(X, y)
    # Honour the caller's max_fpr (this used to be hard-coded to .05).
    thr = select_threshold(validation_data['fpr'], validation_data['thr'], max_fpr)
    return clf, thr, validation_data


def predict(clf, df, features, threshold):
    """Label every row of *df* as 'DGA' or 'Benign'.

    The feature columns are added to *df* in place; rows containing NaN are
    then dropped.

    Returns:
        A new frame with the feature columns plus 'prob_dga' (probability
        from column 1 of predict_proba) and 'label' ('DGA' when the
        probability exceeds *threshold*).
    """
    for feature, feature_func in features.items():
        df[feature] = feature_func(df)
    # Copy so that the new columns are written to our own frame rather than
    # to a possible view of the caller's data.
    df = df.dropna().copy()
    hold_X = df[list(features)].values
    prob_dga = clf.predict_proba(hold_X)[:, 1]
    df['label'] = ['DGA' if p > threshold else 'Benign' for p in prob_dga]
    df['prob_dga'] = prob_dga
    return df


def prepare_df(df):
    """Add 'domain' and 'domain_alpha_chars' columns derived from 'raw_domain'.

    'domain' is added to *df* in place; rows containing NaN are dropped and
    the returned frame additionally holds 'domain_alpha_chars' (the domain
    without digits and hyphens).
    """
    df['domain'] = df['raw_domain'].apply(get_domain)
    df = df.dropna().copy()
    df['domain_alpha_chars'] = df['domain'].apply(strip_non_alpha)
    return df


def read_alexa_df(filepath, limit=500000):
    """Load the Alexa ranking CSV as the benign training set.

    Args:
        filepath: CSV file with rank and domain columns and no header row.
        limit: number of leading rows to keep (None keeps all of them).

    Returns:
        Prepared frame with class 'benign', rows in random order.
    """
    alexa_df = pd.read_csv(filepath, names=('rank', 'raw_domain'), header=None, encoding='utf-8')
    alexa_df = alexa_df[:limit]
    del alexa_df['rank']
    alexa_df = prepare_df(alexa_df)
    alexa_df['class'] = 'benign'
    print('Number of Alexa domains: %d' % alexa_df.shape[0])
    alexa_df = alexa_df.reindex(np.random.permutation(alexa_df.index))
    return alexa_df


def read_dga_df(filepath):
    """Load the DGA feed CSV as the malicious training set.

    Lines starting with '#' are treated as comments. Only the domain column
    is kept and duplicate rows are removed.

    Returns:
        Prepared frame with class 'dga'.
    """
    dga_df = pd.read_csv(filepath, names=['raw_domain', 'family', 'date', 'link'],
                         header=None, encoding='utf-8', comment='#')
    for column in ('family', 'date', 'link'):
        del dga_df[column]
    dga_df = dga_df.drop_duplicates()
    dga_df = prepare_df(dga_df)
    dga_df['class'] = 'dga'
    dga_df = dga_df.dropna()
    print('Number of DGA domains: %d' % dga_df.shape[0])
    return dga_df


def build_features(alexa_cv, alexa_counts, dict_cv, dict_counts):
    """Return the mapping of feature name -> function(df) used by the model.

    Training and prediction both build their features here so that the two
    cannot drift apart.
    """
    return {
        'len': lambda df: df['domain'].apply(len),
        'entropy': lambda df: df['domain'].apply(entropy),
        'vowel_consonant_ratio': lambda df: df['domain'].apply(vowel_consonant_ratio),
        'longest_consonant_sequence': lambda df: df['domain'].apply(longest_consonant_sequence),
        'alexa_ngrams': partial(calc_ngram_hits, cv=alexa_cv, counts=alexa_counts),
        'dict_ngrams': partial(calc_ngram_hits, cv=dict_cv, counts=dict_counts),
    }


def train_and_serialize(filepath, max_fpr=.05, nfolds=10, dispr=True):
    """Train the classifier from the module-level data files and pickle it.

    Reads ALEXA_FILEPATH, DGA_FILEPATH and DICT_FILEPATH, trains the model and
    writes a dict (classifier, threshold, both vectorizers with their counts,
    validation data) to *filepath*. When *dispr* is true the ROC curve is
    displayed afterwards.
    """
    alexa_df = read_alexa_df(ALEXA_FILEPATH)
    dga_df = read_dga_df(DGA_FILEPATH)
    all_domains = pd.concat([alexa_df, dga_df], ignore_index=True)
    alexa_cv, alexa_counts = train_vectorizer(alexa_df['domain_alpha_chars'])
    dict_df = pd.read_csv(DICT_FILEPATH, names=['word',]).dropna()
    dict_cv, dict_counts = train_vectorizer(dict_df['word'])
    features = build_features(alexa_cv, alexa_counts, dict_cv, dict_counts)
    clf, thr, validation_data = train(all_domains, features, max_fpr=max_fpr, nfolds=nfolds)

    outf = {'clf': clf,
            'thr': thr,
            'alexa_cv': alexa_cv,
            'alexa_counts': alexa_counts,
            'dict_cv': dict_cv,
            'dict_counts': dict_counts,
            'validation_data': validation_data}

    # Pickle data is binary: text mode breaks on Python 3 and on Windows.
    with open(filepath, 'wb') as fp:
        pickle.dump(outf, fp)

    if dispr:
        display_roc(outf)


def display_roc(data):
    """Plot the cross-validated ROC curve and mark the chosen threshold.

    *data* is the dict written by train_and_serialize. The annotation is
    skipped when the stored threshold is not one of the ROC thresholds.
    """
    import matplotlib.pyplot as plt
    validation = data['validation_data']
    plt.plot(validation['fpr'], validation['tpr'],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(validation['auc']),
             linewidth=2)
    matches = np.flatnonzero(validation['thr'] == data['thr'])
    if matches.size:
        # Use scalar coordinates, and annotate the axes that hold the curve:
        # plt.axes() may create a fresh axes instead of returning it.
        x = float(validation['fpr'][matches[0]])
        y = float(validation['tpr'][matches[0]])
        plt.gca().annotate("Threshold = %f" % (data['thr'], ),
                           xy=(x, y),
                           xycoords='data',
                           xytext=(x + .1, y - .3),
                           textcoords='data',
                           size=16, va="center", ha="left",
                           arrowprops=dict(arrowstyle="simple",
                                           facecolor='black'),
                           )

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()


def load_and_predict(filepath, df):
    """Load the pickled model from *filepath* and label the rows of *df*.

    *df* must already have been passed through prepare_df. Returns the frame
    produced by predict().
    """
    # SECURITY: unpickling executes code embedded in the file; only load
    # classifier files that come from a trusted source.
    with open(filepath, 'rb') as fp:
        data = pickle.load(fp)
    features = build_features(data['alexa_cv'], data['alexa_counts'],
                              data['dict_cv'], data['dict_counts'])
    return predict(data['clf'], df, features, data['thr'])


def result_path(inputfile):
    """Return the path of the results file written for *inputfile*.

    Only the final extension is replaced, so './data/domains.json' becomes
    './data/domains_res.json' (splitting on the first '.' used to yield
    '_res.json' for such a path).
    """
    import os
    return os.path.splitext(inputfile)[0] + '_res.json'


def main():
    """Command line entry point: fit a new model or label a list of domains."""
    parser = argparse.ArgumentParser(description='Fit or Predict the DGA classifier')
    parser.add_argument('-f', '--fit',
                        action='store_true',
                        help='Fit a new DGA classifier model and store it',
                        default=False)
    parser.add_argument('-p', '--predict',
                        action='store',
                        help='Predict label for domains from a file containing JSON encoded list of domains',
                        default=None)
    args = parser.parse_args()
    if args.fit:
        train_and_serialize(CLASSIFIER_STORAGE, max_fpr=.05, nfolds=10, dispr=True)
    elif args.predict:
        inputfile = args.predict
        with open(inputfile, 'r') as fp:
            data = json.load(fp)
        df = pd.DataFrame(data, columns=['raw_domain'])
        df = prepare_df(df)
        df = load_and_predict(CLASSIFIER_STORAGE, df)
        is_dga = df['label'] == 'DGA'
        # Show up to 10 random examples per label; sample(n=10) raises when a
        # label has fewer than 10 rows.
        for subset in (df[is_dga], df[~is_dga]):
            if len(subset):
                print(subset[['raw_domain', 'prob_dga']].sample(n=min(10, len(subset))))
        print('DGA = %d, Benign = %d, Total = %d' % (
            is_dga.sum(),
            (~is_dga).sum(),
            len(df)))
        res = {}
        # tolist() converts to plain Python values that json can serialize.
        for domain, prob_dga, label in zip(df['raw_domain'].tolist(),
                                           df['prob_dga'].tolist(),
                                           df['label'].tolist()):
            res[domain] = (label, prob_dga)
        with open(result_path(inputfile), 'w') as fp:
            json.dump(res, fp)
    else:
        parser.error('One of the options fit or predict must be selected')


if __name__ == '__main__':
    main()