├── .gitignore
├── LICENSE
├── README.md
├── ROC.png
└── dga_classifier.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 ENDGAME
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SANS_THIR16
 2 | SANS Hunting on the Cheap
 3 | 
## DGA Classifier
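A random forest classifier that flags DGA (domain generation algorithm) domains using features of the domain name: length, entropy, `vowel_consonant_ratio`, `longest_consonant_sequence`, and n-gram scores learned from a dictionary word list and the Alexa top domains.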
 6 | 
### ROC Curve
 8 | ![roc](ROC.png)
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/endgameinc/SANS_THIR16/8b698f0450d51b40ed36fc5b4ab8e19344ab765a/ROC.png


--------------------------------------------------------------------------------
/dga_classifier.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import sklearn.feature_extraction
  3 | import sklearn.ensemble
  4 | import pandas as pd
  5 | import matplotlib
  6 | import tldextract
  7 | import math
  8 | from collections import Counter
  9 | import pickle
 10 | import json
 11 | import sys
 12 | from functools import partial
 13 | from sklearn.cross_validation import train_test_split
 14 | from sklearn.metrics import roc_curve, roc_auc_score, auc
 15 | import argparse
 16 | 
# Alexa top-sites CSV; its domains are labelled 'benign' for training.
# Presumably the unzipped contents of
# http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
ALEXA_FILEPATH = 'top-1m.csv'
# Word list used to train the dictionary n-gram vectorizer.
DICT_FILEPATH = '/usr/share/dict/words'
# DGA domain feed; its domains are labelled 'dga' for training.
# Source: http://osint.bambenekconsulting.com/feeds/dga-feed.txt
DGA_FILEPATH = 'dga-feed.txt'
# Pickle file the trained classifier bundle is written to and read from.
CLASSIFIER_STORAGE = 'dga_classifier.pickle'
 23 | 
 24 | def get_domain(hostname):
 25 |     try:
 26 |         return tldextract.extract(hostname).domain
 27 |     except ValueError:
 28 |         print 'Error extracting domain from %s'%(hostname,)
 29 |     return np.nan
 30 | 
 31 | def get_subdomain(hostname):
 32 |     try:
 33 |         return tldextract.extract(hostname).subdomain
 34 |     except ValueError:
 35 |         print 'Error extracting domain from %s'%(hostname,)
 36 |     return np.nan
 37 | 
 38 | def entropy(s):
 39 |     p, lns = Counter(s), float(len(s))
 40 |     return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
 41 | 
 42 | def longest_consonant_sequence(s):
 43 |     vowels = set('aeiou')
 44 |     longest = 0
 45 |     current = 0
 46 |     for c in s:
 47 |         if c not in vowels:
 48 |             current += 1
 49 |         else:
 50 |             if current >= longest:
 51 |                 longest = current
 52 |             current = 0
 53 |     if current >= longest:
 54 |         longest = current
 55 |         current = 0    
 56 |     return longest
 57 | 
 58 | def vowel_consonant_ratio(s):
 59 |     classes = {v:'v' for v in 'aeiou'}
 60 |     classes.update({'.':'d'})
 61 |     d = Counter([classes.get(c, 'c') for c in s])
 62 |     return float(d.get('v', 0))/d.get('c', 0) if d.get('c', 0) else np.nan
 63 | 
 64 | def strip_non_alpha(string):
 65 |     #Time to move to Python 3?
 66 |     delchars = '0123456789-'
 67 |     if isinstance(string, unicode):
 68 |         table = {ord(c):None for c in delchars}
 69 |         return string.translate(table)
 70 |     else:
 71 |         return string.translate(None, delchars)
 72 | 
 73 | def train_vectorizer(series):
 74 |     alexa_cv = sklearn.feature_extraction.text.CountVectorizer(
 75 |                    analyzer='char',
 76 |                    ngram_range=(3, 5),
 77 |                    min_df=1e-4,
 78 |                    max_df=1.0)
 79 |     counts_matrix = alexa_cv.fit_transform(series)
 80 |     alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
 81 |     return alexa_cv, alexa_counts
 82 | 
 83 | def calc_ngram_hits(df, cv, counts):
 84 |     return counts * cv.transform(df['domain_alpha_chars']).T
 85 | 
 86 | 
 87 | def cross_validate(fts, labels, clf, nfolds):
 88 |     scores = []
 89 |     true_labels = []
 90 |     for fold in range(nfolds):
 91 |         X_train, X_test, y_train, y_test = train_test_split(fts, labels, test_size=.2)
 92 |         clf.fit(X_train, y_train)
 93 | 
 94 |         scores.append(clf.predict_proba(X_test)[:,1])
 95 |         true_labels.append(y_test)
 96 |     ret = {}
 97 |     ret['fpr'], ret['tpr'], ret['thr'] = roc_curve(np.array(true_labels).ravel(), np.array(scores).ravel())
 98 |     ret['auc'] = auc(ret['fpr'], ret['tpr'])
 99 |     print ret['auc']
100 |     return ret
101 | 
def train(df, features, test_training=True, max_fpr=.05, nfolds = 10):
    """Train the random forest and choose a decision threshold.

    Parameters:
        df: DataFrame with a ``'class'`` column ('dga' marks positives) and
            whatever columns the feature functions read. Feature columns
            are added to it in place.
        features: dict mapping feature name -> callable taking the
            DataFrame and returning that feature's column.
        test_training: unused; kept for interface compatibility.
        max_fpr: the threshold is taken at the last ROC point whose false
            positive rate is strictly below this value.
        nfolds: number of random train/test rounds used for validation.

    Returns:
        ``(clf, thr, validation_data)``: the classifier fitted on all rows,
        the chosen probability threshold, and the dict produced by
        ``cross_validate``.

    Raises:
        ValueError: if no ROC point has a false positive rate below
            ``max_fpr``.
    """
    for feature, feature_func in features.items():
        df[feature] = feature_func(df)

    # Rows with any missing value (e.g. a NaN feature) are discarded.
    df = df.dropna()
    # Column order follows the iteration order of ``features``.
    X = df[list(features.keys())].values
    # Make 0-1
    y = [label == 'dga' for label in df['class'].tolist()]

    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5, n_jobs=-1)
    validation_data = cross_validate(X, y, clf, nfolds)
    clf.fit(X, y)

    # Honour the caller's max_fpr instead of a hard-coded 0.05.
    below = np.where(validation_data['fpr'] < max_fpr)[0]
    if below.size == 0:
        raise ValueError('No ROC point with false positive rate below %r' % (max_fpr,))
    thr = validation_data['thr'][below.max()]
    return clf, thr, validation_data
120 | 
def predict(clf, df, features, threshold):
    """Label every row of ``df`` as 'DGA' or 'Benign'.

    Parameters:
        clf: fitted classifier exposing ``predict_proba``.
        df: DataFrame holding the columns the feature functions read.
            Feature columns are added to it in place.
        features: dict mapping feature name -> callable taking the
            DataFrame and returning that feature's column. Its iteration
            order determines the column order passed to ``clf``.
        threshold: rows whose DGA probability is strictly greater than
            this value are labelled 'DGA'.

    Returns:
        A DataFrame without the rows that contained missing values, with
        added ``'label'`` and ``'prob_dga'`` columns.
    """
    for feature, feature_func in features.items():
        df[feature] = feature_func(df)
    # Rows with any missing value (e.g. a NaN feature) are discarded.
    df = df.dropna()
    hold_X = df[list(features.keys())].values
    # Only the probabilities are needed; column 1 is the positive class.
    prob_dga = clf.predict_proba(hold_X)[:, 1]
    df['label'] = ['DGA' if p > threshold else 'Benign' for p in prob_dga]
    df['prob_dga'] = prob_dga
    return df
131 | 
def prepare_df(df):
    """Derive the ``domain`` and ``domain_alpha_chars`` columns.

    ``domain`` is computed from ``raw_domain`` with ``get_domain`` (the
    column is added to the passed-in frame); rows with missing values are
    then dropped and ``domain_alpha_chars`` is computed from ``domain``
    with ``strip_non_alpha``. Returns the resulting DataFrame.
    """
    extracted = df['raw_domain'].apply(get_domain)
    df['domain'] = extracted
    df = df.dropna()
    alpha_only = df['domain'].apply(strip_non_alpha)
    df.loc[:, 'domain_alpha_chars'] = alpha_only
    return df
137 | 
def read_alexa_df(filepath):
    """Load the benign training set from a ``rank,domain`` CSV file.

    Only the first 500000 rows are kept. The frame is run through
    ``prepare_df``, tagged with ``class == 'benign'``, and returned with
    its rows in random order. The row count is printed.
    """
    ranked = pd.read_csv(filepath, names=('rank', 'raw_domain'), header=None, encoding='utf-8')
    alexa_df = ranked[:500000]
    alexa_df = alexa_df.drop('rank', axis=1)
    alexa_df = prepare_df(alexa_df)
    alexa_df['class'] = 'benign'
    print('Number of Alexa domains: %d' % alexa_df.shape[0])
    shuffled_index = np.random.permutation(alexa_df.index)
    return alexa_df.reindex(shuffled_index)
147 | 
def read_dga_df(filepath):
    """Load the DGA training set from a ``domain,family,date,link`` CSV feed.

    Lines starting with ``#`` are treated as comments. Only the domain
    column is kept; duplicates are removed, the frame is run through
    ``prepare_df`` and tagged with ``class == 'dga'``. The row count is
    printed.
    """
    columns = ['raw_domain', 'family', 'date', 'link']
    dga_df = pd.read_csv(filepath, names=columns, header=None,
                         encoding='utf-8', comment='#')
    dga_df = dga_df.drop(['family', 'date', 'link'], axis=1)
    dga_df = prepare_df(dga_df.drop_duplicates())
    dga_df['class'] = 'dga'
    dga_df = dga_df.dropna()
    print('Number of DGA domains: %d' % dga_df.shape[0])
    return dga_df
160 | 
def train_and_serialize(filepath, max_fpr=.05, nfolds=10, dispr=True):
    """Train the DGA classifier and pickle it to ``filepath``.

    Reads the Alexa list (``ALEXA_FILEPATH``), the DGA feed
    (``DGA_FILEPATH``) and the dictionary (``DICT_FILEPATH``), fits the
    n-gram vectorizers, trains the classifier via ``train`` and stores
    everything ``load_and_predict`` needs in a single pickle.

    Parameters:
        filepath: destination of the pickle file.
        max_fpr: forwarded to ``train``.
        nfolds: forwarded to ``train``.
        dispr: when true, show the ROC curve after training.
    """
    alexa_df = read_alexa_df(ALEXA_FILEPATH)
    dga_df = read_dga_df(DGA_FILEPATH)
    all_domains = pd.concat([alexa_df, dga_df], ignore_index=True)
    alexa_cv, alexa_counts = train_vectorizer(alexa_df['domain_alpha_chars'])
    dict_df = pd.read_csv(DICT_FILEPATH, names=['word',]).dropna()
    dict_cv, dict_counts = train_vectorizer(dict_df['word'])
    # Must stay in sync with the feature set built in load_and_predict().
    features = {
        'len': lambda df: df['domain'].apply(len),
        'entropy':lambda df: df['domain'].apply(entropy),
        'vowel_consonant_ratio': lambda df: df['domain'].apply(vowel_consonant_ratio),
        'longest_consonant_sequence': lambda df: df['domain'].apply(longest_consonant_sequence),
        'alexa_ngrams': partial(calc_ngram_hits, cv=alexa_cv, counts=alexa_counts),
        'dict_ngrams': partial(calc_ngram_hits, cv=dict_cv, counts=dict_counts),
    }
    clf, thr, validation_data = train(all_domains, features, max_fpr=max_fpr, nfolds=nfolds)

    outf = {'clf':clf,
            'thr':thr,
            'alexa_cv':alexa_cv,
            'alexa_counts':alexa_counts,
            'dict_cv':dict_cv,
            'dict_counts':dict_counts,
            'validation_data':validation_data}

    # Pickle data is a byte stream, so the file is opened in binary mode.
    with open(filepath, 'wb') as fp:
        pickle.dump(outf, fp)

    if dispr:
        display_roc(outf)
191 | 
def display_roc(data):
    """Plot the validation ROC curve and mark the chosen threshold.

    ``data`` is the dict built by ``train_and_serialize``: it must contain
    ``'thr'`` and a ``'validation_data'`` dict with ``'fpr'``, ``'tpr'``,
    ``'thr'`` and ``'auc'``. Opens a matplotlib window via ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    validation = data['validation_data']
    fpr, tpr = validation['fpr'], validation['tpr']
    plt.plot(fpr, tpr,
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(validation['auc']),
             linewidth=2)

    # Locate the ROC point that corresponds to the chosen threshold.
    matches = np.where(validation['thr'] == data['thr'])[0]
    if matches.size:
        # Use a scalar index so annotate() receives plain coordinates.
        idx = matches[0]
        # gca() returns the axes just plotted on; axes() may create a new one.
        ax = plt.gca()
        ax.annotate("Threshold = %f" % (data['thr'], ),
                    xy=(fpr[idx], tpr[idx]),
                    xycoords='data',
                    xytext=(fpr[idx]+.1, tpr[idx]-.3),
                    textcoords='data',
                    size=16, va="center", ha="left",
                    arrowprops=dict(arrowstyle="simple",
                                    facecolor='black'),
                    )

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
219 | 
220 | 
def load_and_predict(filepath, df):
    """Load the pickled classifier bundle and label the rows of ``df``.

    Parameters:
        filepath: pickle file written by ``train_and_serialize``.
        df: DataFrame with ``'domain'`` and ``'domain_alpha_chars'``
            columns (as produced by ``prepare_df``).

    Returns:
        The DataFrame returned by ``predict``.
    """
    # SECURITY: unpickling executes arbitrary code; only load files that
    # were produced by a trusted train_and_serialize() run.
    # Pickle data is a byte stream, so the file is opened in binary mode.
    with open(filepath, 'rb') as fp:
        data = pickle.load(fp)
    clf = data['clf']
    alexa_cv, alexa_counts = data['alexa_cv'], data['alexa_counts']
    dict_cv, dict_counts = data['dict_cv'], data['dict_counts']
    threshold = data['thr']
    # Must stay in sync with the feature set built in train_and_serialize().
    features = {
        'len': lambda df: df['domain'].apply(len),
        'entropy':lambda df: df['domain'].apply(entropy),
        'vowel_consonant_ratio': lambda df: df['domain'].apply(vowel_consonant_ratio),
        'longest_consonant_sequence': lambda df: df['domain'].apply(longest_consonant_sequence),
        'alexa_ngrams': partial(calc_ngram_hits, cv=alexa_cv, counts=alexa_counts),
        'dict_ngrams': partial(calc_ngram_hits, cv=dict_cv, counts=dict_counts),
    }
    return predict(clf, df, features, threshold)
237 | 
# Command-line entry point: either fit and store a new model (--fit) or
# label the domains listed in a JSON file (--predict FILE).
if __name__ == '__main__':
    import os  # only needed here, to derive the results file name

    parser = argparse.ArgumentParser(description='Fit or Predict the DGA classifier')
    parser.add_argument('-f', '--fit',
                        action='store_true',
                        help='Fit a new DGA classifier model and store it',
                        default=False)
    parser.add_argument('-p', '--predict',
                        action='store',
                        help='Predict label for domains from a file containing JSON encoded list of domains',
                        default=None)
    args = parser.parse_args()
    if args.fit:
        train_and_serialize(CLASSIFIER_STORAGE, max_fpr=.05, nfolds=10, dispr=True)
    elif args.predict:
        inputfile = args.predict
        with open(inputfile, 'r') as fp:
            data = json.load(fp)
        df = pd.DataFrame(data, columns=['raw_domain'])
        df = prepare_df(df)
        df = load_and_predict(CLASSIFIER_STORAGE, df)

        dga_rows = df[df['label'] == 'DGA']
        benign_rows = df[df['label'] != 'DGA']
        # Show up to 10 random examples per class; sampling 10 rows from a
        # smaller group would raise, so small groups are printed whole.
        for subset in (dga_rows, benign_rows):
            preview = subset[['raw_domain', 'prob_dga']]
            if len(preview) > 10:
                preview = preview.sample(n=10)
            print(preview)
        print('DGA = %d, Benign = %d, Total = %d' % (
            len(dga_rows), len(benign_rows), len(df)))

        # Map each input domain to its (label, probability) pair.
        res = {}
        rows = zip(df['raw_domain'].tolist(),
                   df['prob_dga'].tolist(),
                   df['label'].tolist())
        for domain, prob_dga, label in rows:
            res[domain] = (label, prob_dga)
        # splitext drops only the final extension, so inputs such as
        # './domains.json' still produce './domains_res.json'.
        outputfile = os.path.splitext(inputfile)[0] + '_res.json'
        with open(outputfile, 'w') as fp:
            json.dump(res, fp)
    else:
        parser.error('One of the options fit or predict must be selected')
273 | 


--------------------------------------------------------------------------------