├── data ├── raw │ └── .gitkeep └── processed │ └── .gitkeep ├── test ├── __init__.py ├── test_viz.py ├── test_mock.py ├── test_rebalancers.py ├── test_temporal.py ├── test_evaluation.py ├── test_selectors.py └── test_rejectors.py ├── examples ├── __init__.py ├── rebalance.py ├── timeline-evaluation.py ├── active-learning.py ├── decay-plot.py ├── feature-reduct.py ├── reject.py ├── parallel-predict.py ├── constraints.py └── tesseract-plots.py ├── tesseract ├── __init__.py ├── rebalancing.py ├── mock.py ├── loader.py ├── utils.py ├── selection.py ├── temporal.py ├── rejection.py ├── plot_utils.py ├── viz.py ├── spatial.py ├── metrics.py ├── evaluation.py └── transcendent.py ├── .gitignore ├── DATASET-USESec19.md ├── setup.py ├── LICENSE ├── Makefile └── README.md /data/raw/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tesseract/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *$py.class 5 | *DS_Store 6 | Tesseract.egg-info/* 7 | # Pycharm information 8 | .idea/* 9 | /features/ 10 | build/* 11 | dist/*egg* 12 | 13 | -------------------------------------------------------------------------------- /examples/rebalance.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | 3 | from tesseract import temporal, mock, evaluation, metrics 4 | from tesseract.rebalancing import PositiveRateRebalancer 5 | 6 | 7 | def main(): 8 | X, y, t = mock.generate_binary_test_data(10000, '2000') 9 | 10 | splits = temporal.time_aware_train_test_split( 11 | X, y, t, train_size=6, test_size=1, granularity='month') 12 | 13 | clf = RandomForestClassifier() 14 | 15 | pr_rebalancer = PositiveRateRebalancer(0.7, max_pos_rate=0.8, schedule='first') 16 | results = evaluation.fit_predict_update( 17 | clf, *splits, rebalancers=[pr_rebalancer]) 18 | 19 | metrics.print_metrics(results) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /DATASET-USESec19.md: -------------------------------------------------------------------------------- 1 | # USENIX Security 2019 Dataset 2 | 3 | We provide the links to download the dataset used in the Tesseract paper [1]: 4 | 5 | - [AndroZoo apps hashes](https://www.dropbox.com/s/pw83zohwjk1yden/hashes.txt.gz)* 6 | - [Drebin feature space](https://www.dropbox.com/s/i7q8ysi5agi6n0f/drebin-features.tar.gz) 7 | - [MaMaDroid feature space](https://www.dropbox.com/s/wl23fjvjtj2ncsg/mamadroid-features.tar.gz) 8 | 9 | *The original 
Android apks can be downloaded through the [AndroZoo Official API](https://androzoo.uni.lu/api_doc). 10 | 11 | [1] Feargus Pendlebury*, Fabio Pierazzi*, Roberto Jordaney, Johannes Kinder, 12 | Lorenzo Cavallaro. "TESSERACT: Eliminating Experimental Bias in Malware Classification 13 | across Space and Time". USENIX Security Symposium, 2019. -------------------------------------------------------------------------------- /test/test_viz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_viz.py 5 | ~~~~~~~~~~~ 6 | 7 | Unit tests for testing viz.py. 8 | 9 | """ 10 | import unittest 11 | 12 | from tesseract import mock, metrics 13 | 14 | 15 | class TestViz(unittest.TestCase): 16 | def setUp(self): 17 | self.X, self.y, self.t = mock.generate_binary_test_data(10000, '2012') 18 | 19 | # def test_plot_by_time(self): 20 | # viz.plot_by_time(self.y, self.t, 'day', 'line') 21 | # viz.plot_by_time(self.y, self.t, 'week', 'line') 22 | # viz.plot_by_time(self.y, self.t, 'month', 'line') 23 | # viz.plot_by_time(self.y, self.t, 'month', 'bar') 24 | # viz.plot_by_time(self.y, self.t, 'quarter', 'bar') 25 | 26 | def test_summarize(self): 27 | metrics.summarize(self.y) 28 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | _dependencies = [ 4 | 'cycler==0.10.0', 5 | 'kiwisolver==1.0.1', 6 | 'matplotlib==3.5.2', 7 | 'numpy==1.22.4', 8 | 'pandas==1.4.2', 9 | 'pyparsing==3.0.9', 10 | 'python-dateutil==2.8.1', 11 | 'pytz==2022.1', 12 | 'scikit-learn>=1.1.1,<2.0.0', 13 | 'scipy==1.8.1', 14 | 'seaborn==0.9.0', 15 | 'six==1.11.0', 16 | 'tqdm==4.25.0'] 17 | 18 | setup( 19 | name='Tesseract', 20 | version='0.9', 21 | description='Tesseract: A library for performing ' 22 | 'time-aware classifications.', 23 | maintainer='Feargus Pendlebury', 24 | maintainer_email='Feargus.Pendlebury[at]rhul.ac.uk', 25 | url='', 26 | packages=['tesseract'], 27 | setup_requires=_dependencies, 28 | install_requires=_dependencies 29 | ) 30 | -------------------------------------------------------------------------------- /examples/timeline-evaluation.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVC 2 | 3 | from tesseract import evaluation, temporal, metrics, mock, viz 4 | 5 | 6 | def main(): 7 | # Generate dummy predictors, labels and timestamps from Gaussians 8 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 9 | 10 | # Partition dataset 11 | splits = temporal.time_aware_train_test_split( 12 | X, y, t, train_size=12, test_size=1, granularity='month') 13 | 14 | # Perform a timeline evaluation 15 | clf = LinearSVC() 16 | results = evaluation.fit_predict_update(clf, *splits) 17 | 18 | # View results 19 | metrics.print_metrics(results) 20 | 21 | # View AUT(F1, 24 months) as a measure of robustness over time 22 | print(metrics.aut(results, 'f1')) 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /examples/active-learning.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVC 2 | 3 | from tesseract import temporal, mock, evaluation, metrics 4 | from tesseract.selection import UncertaintySamplingSelector 5 | 6 | 7 | def main(): 8 | X, y, t = 
mock.generate_binary_test_data(10000, '2000') 9 | 10 | splits = temporal.time_aware_train_test_split( 11 | X, y, t, train_size=6, test_size=1, granularity='month') 12 | 13 | clf = LinearSVC() 14 | 15 | selector = UncertaintySamplingSelector('20%') 16 | results = evaluation.fit_predict_update(clf, *splits, selectors=[selector]) 17 | 18 | metrics.print_metrics(results) 19 | 20 | print('Number of test objects selected each period:') 21 | print(results['selected']) 22 | 23 | print('Array indices for selected objects from first test period:') 24 | print(selector.selection_history[0]) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /examples/decay-plot.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVC 2 | 3 | from tesseract import evaluation, temporal, metrics, mock, viz 4 | import os 5 | 6 | def main(): 7 | os.environ["PATH"] += os.pathsep + '/Library/TeX/texbin' 8 | 9 | # Generate dummy predictors, labels and timestamps from Gaussians 10 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 11 | 12 | # Partition dataset 13 | splits = temporal.time_aware_train_test_split( 14 | X, y, t, train_size=12, test_size=1, granularity='month') 15 | 16 | # Perform a timeline evaluation 17 | clf = LinearSVC() 18 | results = evaluation.fit_predict_update(clf, *splits) 19 | 20 | # View results 21 | metrics.print_metrics(results) 22 | 23 | # View AUT(F1, 24 months) as a measure of robustness over time 24 | print(metrics.aut_with_granularity(results, 'week', 'f1')) 25 | 26 | plt = viz.plot_decay(results) 27 | plt.show() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/feature-reduct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from sklearn.svm import LinearSVC 5 | 6 | from tesseract import loader, temporal 7 | 8 | 9 | def main(): 10 | 11 | data_dir = 'DATA DIRECTORY GOES HERE' 12 | 13 | # Load features 14 | X, y, t, _ = loader.load_features(os.path.join(data_dir, 'raw', 'extended-features', 'extended-features')) 15 | 16 | # Split into training and testing sets 17 | X_train_full, X_tests_full, y_train, y_tests, t_train, t_tests = \ 18 | temporal.time_aware_train_test_split(X, y, t, train_size=12, test_size=1, granularity='month') 19 | 20 | # SelectKBest feature selection for a classifier 21 | clf = LinearSVC(dual="auto", max_iter=50000) 22 | clf.fit(X_train_full, y_train) 23 | 24 | select_index = loader.feature_reduce(clf=clf, dim=10000) 25 | 26 | with open('reduced-Indexes-10000.json', 'w') as fp: 27 | json.dump(select_index, fp, default=lambda x: int(x)) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/reject.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | 3 | from tesseract import temporal, mock, evaluation, metrics 4 | from tesseract.rejection import ThresholdRejector 5 | 6 | 7 | def main(): 8 | X, y, t = mock.generate_binary_test_data(10000, '2000') 9 | 10 | splits = temporal.time_aware_train_test_split( 11 | X, y, t, train_size=6, test_size=1, granularity='month') 12 | 13 | clf = RandomForestClassifier() 14 | 15 | rejector = ThresholdRejector('<', 0.9) 16 | 
results = evaluation.fit_predict_update(clf, *splits, rejectors=[rejector]) 17 | 18 | metrics.print_metrics(results) 19 | 20 | print('Number of rejected predictions each period:') 21 | print(results['rejected']) 22 | 23 | print('Array indices for rejected objects from first test period:') 24 | print(rejector.rejection_history[0]) 25 | 26 | print('Array indices for kept objects from first test period:') 27 | print(rejector.kept_history[0]) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/parallel-predict.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVC 2 | 3 | from tesseract import evaluation, temporal, metrics, mock 4 | 5 | 6 | def main(): 7 | # Generate dummy predictors, labels and timestamps from Gaussians 8 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 9 | 10 | # Partition dataset 11 | splits = temporal.time_aware_train_test_split( 12 | X, y, t, train_size=12, test_size=1, granularity='month') 13 | 14 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 15 | 16 | # Perform a timeline evaluation 17 | clf = SVC(kernel='linear', probability=True) 18 | clf.fit(X_train, y_train) 19 | 20 | y_preds = evaluation.predict(clf, X_tests, nproc=4) 21 | results = metrics.calculate_metrics(y_tests, y_preds, periods=-1) 22 | 23 | # View results 24 | metrics.print_metrics(results) 25 | 26 | # View AUT(F1, 24 months) as a measure of robustness over time 27 | print(metrics.aut(results, 'f1')) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/constraints.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVC 2 | 3 | from tesseract import temporal, metrics, mock, spatial, evaluation 4 | 5 | 6 | # TODO | Note that constraint checks are not currently integrated into the 7 | # TODO | evaluation cycle fit_predict_update, so need to be checked manually 8 | 9 | def main(): 10 | # Generate dummy predictors, labels and timestamps from Gaussians 11 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 12 | 13 | # Partition dataset 14 | splits = temporal.time_aware_train_test_split( 15 | X, y, t, train_size=12, test_size=1, granularity='month') 16 | 17 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 18 | 19 | for y_test, t_test in zip(y_tests, t_tests): 20 | temporal.assert_positive_negative_temporal_consistency(y_test, t_test) 21 | temporal.assert_train_test_temporal_consistency(t_train, t_test) 22 | spatial.assert_class_distribution(y, 0.5, 0.1) 23 | 24 | # Perform a timeline evaluation 25 | clf = SVC(kernel='linear', probability=True) 26 | results = evaluation.fit_predict_update(clf, *splits) 27 | 28 | # View results 29 | metrics.print_metrics(results) 30 | 31 | # View AUT(F1, 24 months) as a measure of robustness over time 32 | print(metrics.aut(results, 'f1')) 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /tesseract/rebalancing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | rebalancing.py 5 | ~~~~~~~~~~~~~~ 6 | 7 | # TODO | Add module description 8 | 9 | """ 10 | 11 | import numpy as np 12 | 13 | from tesseract import spatial 14 | from tesseract.evaluation 
import Stage 15 | 16 | 17 | class Rebalancer(Stage): 18 | def alter_wrapper(self, clf, X_train, y_train, t_train, X_test, 19 | y_test, t_test): 20 | # Pass parameters straight through to rebalance implementation 21 | rebalanced = self.alter(clf, X_train, y_train, t_train, 22 | X_test, y_test, t_test) 23 | 24 | return np.array(rebalanced) 25 | 26 | def alter(self, clf, X_train, y_train, t_train, X_test, y_test, t_test): 27 | raise NotImplementedError('Rebalancer must be subclassed') 28 | 29 | 30 | class PositiveRateRebalancer(Rebalancer): 31 | def __init__(self, min_pos_rate, max_pos_rate=None, noise_deviation=0.0, 32 | fixed_size=False, schedule=1): 33 | super().__init__(schedule=schedule) 34 | self.min_pos_rate = min_pos_rate 35 | self.max_pos_rate = max_pos_rate 36 | self.noise_deviation = noise_deviation 37 | self.fixed_size = fixed_size 38 | 39 | def alter(self, clf, X_train, y_train, t_train, X_test, y_test, t_test): 40 | return spatial.downsample_set( 41 | X_train, y_train, t_train, self.min_pos_rate, 42 | self.max_pos_rate, self.noise_deviation, self.fixed_size) 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Royal Holloway, University of London. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | -------------------------------------------------------------------------------- /test/test_mock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_mock.py 5 | ~~~~~~~ 6 | 7 | Unit tests for testing mock.py. 
8 | 9 | """ 10 | import unittest 11 | 12 | from tesseract import mock 13 | 14 | 15 | class TestMock(unittest.TestCase): 16 | def test_generate_binary_test_data(self): 17 | X, y, t = mock.generate_binary_test_data(10000, '2016') 18 | self.assertEqual(len(X), len(y)) 19 | self.assertEqual(len(y), len(t)) 20 | 21 | def test_generate_time_data(self): 22 | expected = ['2012-09-22', '2012-11-26', '2012-09-11', '2012-07-15'] 23 | dates = mock.generate_time_data(4, '2012', random_state=22) 24 | actual = [d.strftime('%Y-%m-%d') for d in dates] 25 | self.assertEqual(expected, actual) 26 | 27 | expected = ['2012-11-07', '2011-12-20', '2015-11-08', '2011-09-13'] 28 | dates = mock.generate_time_data(4, '2010', '2016', random_state=22) 29 | actual = [d.strftime('%Y-%m-%d') for d in dates] 30 | self.assertEqual(expected, actual) 31 | 32 | years = (2010, 2011, 2012, 2013, 2014) 33 | dates = mock.generate_time_data(10000, '2010', '2014-12-31') 34 | for date in dates: 35 | self.assertIn(date.year, years) 36 | 37 | expected = {2010}, {1}, {1} 38 | dates = mock.generate_time_data(10000, '2010-01-01', '2010-01-02') 39 | actual = (set(d.year for d in dates), 40 | set(d.month for d in dates), 41 | set(d.day for d in dates)) 42 | self.assertEqual(expected, actual) 43 | -------------------------------------------------------------------------------- /test/test_rebalancers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_rebalancers.py 5 | ~~~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for testing rebalancers.py. 8 | 9 | """ 10 | 11 | import unittest 12 | 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.svm import SVC 15 | 16 | from tesseract import temporal, mock, metrics, evaluation 17 | from tesseract.rebalancing import PositiveRateRebalancer 18 | 19 | 20 | class TestRebalancers(unittest.TestCase): 21 | def setUp(self): 22 | # Test partitions of 1 year 23 | X, y, t = mock.generate_binary_test_data(10000, '2020') 24 | 25 | splits = temporal.time_aware_train_test_split( 26 | X, y, t, train_size=6, test_size=2, granularity='month') 27 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 28 | 29 | self.X_train = X_train 30 | self.y_train = y_train 31 | self.X_tests = X_tests 32 | self.y_tests = y_tests 33 | self.t_train = t_train 34 | self.t_tests = t_tests 35 | 36 | self.svm = SVC(kernel='linear', probability=False) 37 | self.svm.fit(X_train, y_train) 38 | 39 | self.rf = RandomForestClassifier(n_estimators=101, max_depth=64) 40 | self.rf.fit(X_train, y_train) 41 | 42 | def test_positive_rate_rebalancer(self): 43 | for clf in (self.svm, self.rf): 44 | pr_rebalancer = PositiveRateRebalancer(0.5) 45 | results = evaluation.fit_predict_update( 46 | clf, self.X_train, self.X_tests, 47 | self.y_train, self.y_tests, 48 | self.t_train, self.t_tests, 49 | rebalancers=[pr_rebalancer]) 50 | 51 | metrics.print_metrics(results) 52 | -------------------------------------------------------------------------------- /tesseract/mock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | mock.py 5 | ~~~~~~~ 6 | 7 | A module for generating test distributions for use with Tesseract. 
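A typical call, mirroring the usage throughout the examples/ and test/
modules (an illustrative sketch rather than a strict doctest):

    >>> X, y, t = generate_binary_test_data(10000, '2014', '2016')
    >>> len(X) == len(y) == len(t)
    True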
8 | 9 | """ 10 | 11 | from datetime import datetime 12 | 13 | import numpy as np 14 | from dateutil.relativedelta import relativedelta 15 | from sklearn.datasets import make_classification 16 | 17 | from tesseract.utils import resolve_date 18 | 19 | 20 | def generate_binary_test_data(n_samples, start, end=None, random_state=None): 21 | """Generate a test dataset suitable for binary classification. 22 | 23 | Args: 24 | n_samples (int): The number of examples to create between start and end. 25 | start (str): The start date of the range to generate examples within. 26 | end (str): The end date of the range to generate examples within. 27 | random_state (int): A random number seed. 28 | 29 | Returns: 30 | np.ndarray: Array of two-dimensional predictors X. 31 | np.ndarray: Array of output variables y. 32 | np.ndarray: Array of datetimes for each example. 33 | 34 | """ 35 | X, y = make_classification(n_samples, n_features=2, n_informative=2, n_redundant=0, class_sep=1.5, 36 | random_state=random_state) 37 | t = generate_time_data(n_samples, start, end) 38 | return X, y, t 39 | 40 | 41 | def generate_time_data(n_samples, start, end=None, random_state=None): 42 | """Randomly sample from the given date range. 43 | 44 | Args: 45 | n_samples (int): The number of dates to create between start and end. 46 | start (str): The start date of the range to sample from. 47 | end (str): The end date of the range to sample from. 48 | random_state (int): A random number seed. 49 | 50 | Returns: 51 | np.ndarray: Array of datetimes sampled within the given range. 52 | 53 | """ 54 | start = resolve_date(start) 55 | end = resolve_date(end) if end else datetime(start.year, 12, 31) 56 | 57 | np.random.seed(random_state) 58 | delta = int((end - start).total_seconds()) 59 | offsets = [np.random.randint(delta) for _ in range(n_samples)] 60 | return np.array([start + relativedelta(seconds=x) for x in offsets]) 61 | -------------------------------------------------------------------------------- /examples/tesseract-plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datetime 4 | import numpy as np 5 | from sklearn.svm import LinearSVC 6 | from sklearn.feature_extraction import DictVectorizer 7 | from tesseract import evaluation, temporal, metrics, mock, viz, loader 8 | 9 | os.environ["PATH"] += os.pathsep + '/Library/TeX/texbin' 10 | 11 | ## Loading features 12 | 13 | def load_dataset(dataset_path): 14 | print(f'Loading dataset from {dataset_path}') 15 | 16 | with open('{}-X-updated-reduced-10k.json'.format(dataset_path), 'r') as f: 17 | X = json.load(f) 18 | 19 | print('Loading labels...') 20 | with open('{}-y-updated.json'.format(dataset_path), 'rt') as f: 21 | y = json.load(f) 22 | 23 | print('Loading timestamps...') 24 | with open('{}-meta-updated.json'.format(dataset_path), 'rt') as f: 25 | T = json.load(f) 26 | T = [o['dex_date'] for o in T] 27 | T = np.array([datetime.datetime.strptime(o, '%Y-%m-%dT%H:%M:%S') if "T" in o 28 | else datetime.datetime.strptime(o, '%Y-%m-%d %H:%M:%S') for o in T]) 29 | 30 | # Convert to numpy array and get feature names 31 | vec = DictVectorizer() 32 | X = vec.fit_transform(X).astype("float32") 33 | y = np.asarray(y) 34 | feature_names = vec.get_feature_names_out() 35 | 36 | # Get time index of each sample for easy reference 37 | time_index = {} 38 | for i in range(len(T)): 39 | t = T[i] 40 | if t.year not in time_index: 41 | time_index[t.year] = {} 42 | if t.month not in time_index[t.year]: 43 | 
time_index[t.year][t.month] = [] 44 | time_index[t.year][t.month].append(i) 45 | 46 | return X, y, time_index, feature_names, T 47 | 48 | X, y, time_index, feature_names, T = load_dataset('../extended-features/extended-features') 49 | 50 | # Partition dataset 51 | splits = temporal.time_aware_train_test_split( 52 | X, y, T, train_size=12, test_size=1, granularity='month') 53 | 54 | # Perform a timeline evaluation 55 | clf = LinearSVC(C=1) 56 | results = evaluation.fit_predict_update(clf, *splits) 57 | 58 | 59 | # ################ 60 | # View Results 61 | # ################ 62 | from pylab import * 63 | 64 | pendleblue='#1f8fff' 65 | pendleyellow='#ffa600' 66 | 67 | # '#FF9999', '#FFDD99', '#AAEEEE' 68 | plot(results['precision'], marker='o', color=pendleyellow) 69 | plot(results['recall'], marker='o', color='red') 70 | plot(results['f1'], marker='o', color=pendleblue) 71 | legend(['Precision', 'Recall', 'F1']) 72 | xlim([0,23]) 73 | xlabel('Testing period (month)') 74 | ylabel('Performance') 75 | grid(axis = 'y') 76 | show() -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean data 2 | 3 | ################################################################################# 4 | # GLOBALS # 5 | ################################################################################# 6 | 7 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 8 | PROJECT_NAME = tesseract 9 | PYTHON_INTERPRETER = python3 10 | 11 | ################################################################################# 12 | # COMMANDS # 13 | ################################################################################# 14 | 15 | ## Make Dataset 16 | data: 17 | wget -O ./data/raw/drebin-features.tar.gz https://www.dropbox.com/s/i7q8ysi5agi6n0f/drebin-features.tar.gz 18 | tar -zxf ./data/raw/drebin-features.tar.gz --directory ./data/processed/ 19 | rm ./data/raw/drebin-features.tar.gz 20 | 21 | 22 | ## Delete all compiled Python files 23 | clean: 24 | find . -type f -name "*.py[co]" -delete 25 | find . 
-type d -name "__pycache__" -delete 26 | 27 | 28 | ################################################################################# 29 | # Self Documenting Commands # 30 | ################################################################################# 31 | 32 | .DEFAULT_GOAL := help 33 | 34 | # Inspired by 35 | # sed script explained: 36 | # /^##/: 37 | # * save line in hold space 38 | # * purge line 39 | # * Loop: 40 | # * append newline + line to hold space 41 | # * go to next line 42 | # * if line starts with doc comment, strip comment character off and loop 43 | # * remove target prerequisites 44 | # * append hold space (+ newline) to line 45 | # * replace newline plus comments by `---` 46 | # * print line 47 | # Separate expressions are necessary because labels cannot be delimited by 48 | # semicolon; see 49 | .PHONY: help 50 | help: 51 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 52 | @echo 53 | @sed -n -e "/^## / { \ 54 | h; \ 55 | s/.*//; \ 56 | :doc" \ 57 | -e "H; \ 58 | n; \ 59 | s/^## //; \ 60 | t doc" \ 61 | -e "s/:.*//; \ 62 | G; \ 63 | s/\\n## /---/; \ 64 | s/\\n/ /g; \ 65 | p; \ 66 | }" ${MAKEFILE_LIST} \ 67 | | LC_ALL='C' sort --ignore-case \ 68 | | awk -F '---' \ 69 | -v ncol=$$(tput cols) \ 70 | -v indent=19 \ 71 | -v col_on="$$(tput setaf 6)" \ 72 | -v col_off="$$(tput sgr0)" \ 73 | '{ \ 74 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 75 | n = split($$2, words, " "); \ 76 | line_length = ncol - indent; \ 77 | for (i = 1; i <= n; i++) { \ 78 | line_length -= length(words[i]) + 1; \ 79 | if (line_length <= 0) { \ 80 | line_length = ncol - indent - length(words[i]) - 1; \ 81 | printf "\n%*s ", -indent, " "; \ 82 | } \ 83 | printf "%s ", words[i]; \ 84 | } \ 85 | printf "\n"; \ 86 | }' \ 87 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 88 | 89 | -------------------------------------------------------------------------------- /tesseract/loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy 9 | import numpy 10 | from sklearn.datasets import load_svmlight_file 11 | from sklearn.feature_extraction import DictVectorizer 12 | 13 | 14 | def load_features(fname, shas=False): 15 | """Load feature set. 16 | 17 | Args: 18 | fname (str): The common prefix for the dataset. 19 | (e.g., 'data/features/drebin' -> 'data/features/drebin-[X|Y|meta].json') 20 | 21 | shas (bool): Whether to include shas. In some versions of the dataset, 22 | shas were included to double-check alignment - these are _not_ features 23 | and _must_ be removed before training. 24 | 25 | Returns: 26 | Tuple[List[Dict], List, List]: The features, labels, and timestamps 27 | for the dataset. 
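    Example (illustrative; the path is a placeholder for wherever the
    extended-features JSON files live, and note that the implementation
    below also returns a per-month time index as a fourth value, as used
    in examples/feature-reduct.py):

        >>> X, y, t, time_index = load_features(
        ...     'data/raw/extended-features/extended-features')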
28 | 29 | """ 30 | time_index = {} 31 | 32 | feature_path = os.path.join(os.path.dirname(fname), 'extended-features-{}.json') 33 | 34 | with open(feature_path.format("X"), 'rb') as f: 35 | X = json.load(f) 36 | with open(feature_path.format("y"), 'r') as f: 37 | y = json.load(f) 38 | 39 | with open(feature_path.format("meta"), 'r') as f: 40 | T = json.load(f) 41 | T = [o['dex_date'] for o in T] 42 | T = numpy.array([datetime.strptime(o, '%Y-%m-%dT%H:%M:%S') if "T" in o 43 | else datetime.strptime(o, '%Y-%m-%d %H:%M:%S') for o in T]) 44 | 45 | vec = DictVectorizer() 46 | X = vec.fit_transform(X) 47 | y = numpy.asarray(y) 48 | 49 | for i in range(len(T)): 50 | t = T[i] 51 | if t.year not in time_index: 52 | time_index[t.year] = {} 53 | if t.month not in time_index[t.year]: 54 | time_index[t.year][t.month] = [] 55 | time_index[t.year][t.month].append(i) 56 | 57 | return X, y, T, time_index 58 | 59 | 60 | def load_range_dataset_w_benign(data_name, start_month, end_month, folder='data/'): 61 | if start_month != end_month: 62 | dataset_name = f'{start_month}to{end_month}' 63 | else: 64 | dataset_name = f'{start_month}' 65 | saved_data_file = os.path.join(folder, data_name, f'{dataset_name}_selected.npz') 66 | data = np.load(saved_data_file, allow_pickle=True) 67 | X_train, y_train = data['X_train'], data['y_train'] 68 | y_mal_family = data['y_mal_family'] 69 | return X_train, y_train, y_mal_family 70 | 71 | 72 | def feature_reduce(clf, dim): 73 | if hasattr(clf, 'coef_'): 74 | select_index = np.argpartition(abs(clf.coef_[0]), -dim)[-dim:] 75 | return select_index 76 | else: 77 | print('Wrong classifier') 78 | exit(-1) 79 | 80 | 81 | def load_dates(infile): 82 | """ 83 | Parses infile for any dates formatted as YYYY/MM/DD, at most one 84 | per line. Returns a list of datetime.date objects, in order of 85 | encounter. 86 | """ 87 | datere = re.compile(r'\d{4}/\d{2}/\d{2}') 88 | dates = [] 89 | for line in open(infile, 'r', encoding='utf-8'): 90 | match = re.search(datere, line) 91 | if match: 92 | dates.append(datetime(*(map(int, match.group().split('/'))))) 93 | return dates 94 | -------------------------------------------------------------------------------- /tesseract/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | utils.py 5 | ~~~~~~~~ 6 | 7 | A selection of useful helper functions used throughout the Tesseract library. 8 | 9 | """ 10 | 11 | import logging 12 | from datetime import datetime, date 13 | from functools import wraps 14 | from timeit import default_timer as timer 15 | 16 | import numpy as np 17 | 18 | 19 | def resolve_date(d): 20 | """Convert a str or date to an appropriate datetime. 21 | 22 | Strings should be of the format '%Y', '%Y-%m or '%Y-%m-%d', for example: 23 | '2012', '1994-02' or '1991-12-11'. Date objects with no time information 24 | will be rounded down to the midnight beginning that date. 25 | 26 | Args: 27 | d (Union[str, date]): The string or date to convert. 28 | 29 | Returns: 30 | datetime: The parsed datetime equivalent of d. 
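    Example (illustrative, using the string formats listed above):

        >>> resolve_date('2012')
        datetime.datetime(2012, 1, 1, 0, 0)
        >>> resolve_date('1994-02')
        datetime.datetime(1994, 2, 1, 0, 0)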
31 | """ 32 | if isinstance(d, datetime): 33 | return d 34 | 35 | if isinstance(d, date): 36 | return datetime.combine(d, datetime.min.time()) 37 | 38 | for fmt in ('%Y', '%Y-%m', '%Y-%m-%d'): 39 | try: 40 | return datetime.strptime(d, fmt) 41 | except ValueError: 42 | pass 43 | 44 | raise ValueError('date string format not recognized.') 45 | 46 | 47 | def check_for_raw_scores(y_pred): 48 | # Heuristic to check if input are raw scores 49 | if y_pred.ndim > 1: 50 | for v in y_pred: 51 | if ((np.linalg.norm(v, 0), 52 | np.linalg.norm(v), 2) != (1, 1)): 53 | return True 54 | return False 55 | 56 | 57 | def select_prediction_function(clf, scores_only=False, labels_only=False): 58 | if hasattr(clf, 'predict_proba') and not labels_only: 59 | prediction_function = clf.predict_proba 60 | elif hasattr(clf, 'decision_function') and not labels_only: 61 | prediction_function = clf.decision_function 62 | elif hasattr(clf, 'predict') and not scores_only: 63 | prediction_function = clf.predict 64 | else: 65 | raise TypeError( 66 | 'Unsure how to handle predictions with ' 67 | 'classifier of type {}.'.format(clf.__class__)) 68 | return prediction_function 69 | 70 | 71 | def resolve_categorical(y): 72 | return np.argmax(y, 1) if y.ndim > 1 else y 73 | 74 | 75 | def binary_labels(array, positive='malicious', negative='benign'): 76 | return [positive if x else negative for x in array] 77 | 78 | 79 | def parse_percentage(n): 80 | return float(n[:-1]) / 100 81 | 82 | 83 | def resolve_percentage(n): 84 | return parse_percentage(n) if isinstance(n, str) else n 85 | 86 | 87 | def seconds_to_time(seconds): 88 | """Return a nicely formatted time given the number of seconds.""" 89 | m, s = divmod(seconds, 60) 90 | h, m = divmod(m, 60) 91 | d, h = divmod(h, 24) 92 | return "%d days, %02d hours, %02d minutes, %02d seconds" % (d, h, m, s) 93 | 94 | 95 | def timing(f): 96 | @wraps(f) 97 | def wrap(*args, **kwargs): 98 | start = timer() 99 | result = f(*args, **kwargs) 100 | elapsed = seconds_to_time(timer() - start) 101 | logging.debug('{} took: {}'.format(f.__name__, elapsed)) 102 | return result 103 | 104 | return wrap 105 | -------------------------------------------------------------------------------- /test/test_temporal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_temporal.py 5 | ~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for temporal.py. 
8 | 9 | """ 10 | import random 11 | import unittest 12 | from datetime import datetime 13 | 14 | import numpy as np 15 | from sklearn.svm import LinearSVC 16 | 17 | from tesseract import temporal, mock, selection, evaluation 18 | 19 | 20 | class TestTemporal(unittest.TestCase): 21 | def test_train(self): 22 | # Test partitions of 1 year 23 | X, y, t = mock.generate_binary_test_data(10000, '2020') 24 | splits = temporal.time_aware_train_test_split( 25 | X, y, t, 6, 2, granularity='month', start_date='2020') 26 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 27 | 28 | results = evaluation.fit_predict_update(LinearSVC(), X_train, X_tests, 29 | y_train, y_tests, t_train, 30 | t_tests) 31 | print(results) 32 | results = evaluation.fit_predict_update( 33 | LinearSVC(), X_train, X_tests, y_train, y_tests, t_train, t_tests) 34 | print(results) 35 | 36 | def test_time_aware_indexes(self): 37 | # Test partitions of 1 year 38 | t = np.array([datetime(2020, x, 1) for x in range(1, 13)]) 39 | random.shuffle(t) 40 | train, tests = temporal.time_aware_indexes( 41 | t, 6, 2, granularity='month', start_date='2020') 42 | 43 | # Smoke tests 44 | self.assertEqual(6, len(train)) 45 | self.assertEqual(3, len(tests)) 46 | 47 | for test in tests: 48 | self.assertEqual(2, len(test)) 49 | 50 | # Check partition is complete and non-destructive 51 | recreated = train + [x for sub in tests for x in sub] 52 | self.assertEqual(len(recreated), len(t)) 53 | self.assertEqual(set(recreated), set(range(len(t)))) 54 | 55 | t_train = t[train] 56 | t_tests = [t[index_set] for index_set in tests] 57 | 58 | # Check partition is history-aware 59 | for m in t_train: 60 | for n in t_tests[0]: 61 | self.assertTrue(m < n) 62 | 63 | for i in range(0, len(t_tests) - 1): 64 | for m in t_tests[i]: 65 | for n in t_tests[i + 1]: 66 | self.assertTrue(m < n) 67 | 68 | def test_time_aware_train_test_split(self): 69 | # Test partitions of 1 year 70 | X, y, t = mock.generate_binary_test_data(10000, '2020') 71 | X_train, X_tests, y_train, y_tests, t_train, t_tests = \ 72 | temporal.time_aware_train_test_split( 73 | X, y, t, 6, 2, granularity='month', start_date='2020') 74 | 75 | # Smoke tests 76 | self.assertEqual(len(X_train), len(y_train)) 77 | self.assertEqual(len(X_tests), len(y_tests)) 78 | self.assertEqual(len(X_tests[0]), len(y_tests[0])) 79 | 80 | for i in range(len(X_tests)): 81 | self.assertEqual(len(X_tests[i]), len(y_tests[i])) 82 | 83 | def test_closest_to_hyperplane(self): 84 | narray = np.array([3, -1, 7, 2, 5, -4]) 85 | indexes = selection.closest_to_hyperplane(narray, 2) 86 | self.assertTrue(all([1, 3] == indexes)) 87 | self.assertTrue(all([-1, 2] == narray[indexes])) 88 | -------------------------------------------------------------------------------- /test/test_evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_evaluation.py 5 | ~~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests used to help redesign the typical Tesseract workflow. 
8 | """ 9 | 10 | import unittest 11 | 12 | from sklearn.svm import SVC 13 | 14 | from tesseract import temporal, mock, metrics, evaluation 15 | 16 | 17 | class TestWorkflow(unittest.TestCase): 18 | def setUp(self): 19 | # Test partitions of 1 year 20 | X, y, t = mock.generate_binary_test_data(10000, '2020') 21 | 22 | splits = temporal.time_aware_train_test_split( 23 | X, y, t, train_size=6, test_size=2, 24 | granularity='month', start_date='2020') 25 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 26 | 27 | self.X_train = X_train 28 | self.y_train = y_train 29 | self.X_tests = X_tests 30 | self.y_tests = y_tests 31 | self.t_train = t_train 32 | self.t_tests = t_tests 33 | 34 | self.clf = SVC(kernel='linear', probability=True) 35 | self.clf.fit(X_train, y_train) 36 | 37 | def test_use_case_1(self): 38 | # Predict each test period yourself and get individual results 39 | for i, (X_test, y_true) in enumerate(zip(self.X_tests, self.y_tests)): 40 | y_pred = self.clf.predict(X_test) 41 | 42 | print('Test period {}'.format(i)) 43 | results = metrics.calculate_metrics(y_true, y_pred) 44 | metrics.print_metrics(results, header=False) 45 | 46 | def test_use_case_2(self): 47 | # Keep a running data structure for results 48 | results = {} 49 | for i, (X_test, y_true) in enumerate(zip(self.X_tests, self.y_tests)): 50 | y_pred = self.clf.predict(X_test) 51 | 52 | results = metrics.calculate_metrics( 53 | y_true, y_pred, existing=results) 54 | metrics.print_metrics(results) 55 | 56 | def test_use_case_3(self): 57 | # Use a library method to run the entire prediction 58 | y_preds = evaluation.predict(self.clf, self.X_tests) 59 | results = metrics.calculate_metrics(self.y_tests, y_preds, periods=3) 60 | metrics.print_metrics(results) 61 | 62 | def test_use_case_4(self): 63 | # Parallelising computation of test periods 64 | y_preds = evaluation.predict(self.clf, self.X_tests, nproc=3) 65 | results = metrics.calculate_metrics(self.y_tests, y_preds, periods=-1) 66 | metrics.print_metrics(results) 67 | 68 | def test_use_case_5(self): 69 | # Forcing output to be labels rather than probabilities 70 | y_preds = evaluation.predict( 71 | self.clf, self.X_tests, labels_only=True) 72 | results = metrics.calculate_metrics(self.y_tests, y_preds, periods=-1) 73 | metrics.print_metrics(results) 74 | print(metrics.aut(results, 'f1')) 75 | print(metrics.aut(results['f1'])) 76 | 77 | def test_use_case_6(self): 78 | # Use full fit_predict_update to measure the performance 79 | results = evaluation.fit_predict_update( 80 | self.clf, self.X_train, self.X_tests, 81 | self.y_train, self.y_tests, 82 | self.t_train, self.t_tests) 83 | 84 | metrics.print_metrics(results) 85 | print(metrics.aut(results, 'f1')) 86 | print(metrics.aut(results['f1'])) 87 | -------------------------------------------------------------------------------- /test/test_selectors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_selectors.py 5 | ~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for testing selectors.py. 
8 | 9 | """ 10 | 11 | import unittest 12 | 13 | import numpy as np 14 | from sklearn.ensemble import RandomForestClassifier 15 | from sklearn.svm import LinearSVC 16 | 17 | from tesseract import temporal, mock, metrics, evaluation 18 | from tesseract.selection import FullRetrainingSelector, ActiveLearningSelector, \ 19 | UncertaintySamplingSelector 20 | 21 | 22 | class TestSelectors(unittest.TestCase): 23 | def setUp(self): 24 | # Test partitions of 1 year 25 | X, y, t = mock.generate_binary_test_data(10000, '2020') 26 | 27 | splits = temporal.time_aware_train_test_split( 28 | X, y, t, train_size=6, test_size=2, granularity='month') 29 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 30 | 31 | self.X_train = X_train 32 | self.y_train = y_train 33 | self.X_tests = X_tests 34 | self.y_tests = y_tests 35 | self.t_train = t_train 36 | self.t_tests = t_tests 37 | 38 | self.svm = LinearSVC() 39 | self.svm.fit(X_train, y_train) 40 | 41 | self.rf = RandomForestClassifier(n_estimators=101, max_depth=64) 42 | self.rf.fit(X_train, y_train) 43 | 44 | def test_full_retraining(self): 45 | for clf in (self.svm, self.rf): 46 | results = evaluation.fit_predict_update( 47 | clf, self.X_train, self.X_tests, 48 | self.y_train, self.y_tests, 49 | self.t_train, self.t_tests, 50 | selectors=[FullRetrainingSelector()]) 51 | 52 | metrics.print_metrics(results) 53 | 54 | for i in range(1, len(self.y_tests)): 55 | expected = results['train_tot'][i - 1] + results['tot'][i - 1] 56 | actual = results['train_tot'][i] 57 | 58 | self.assertEqual(expected, actual) 59 | 60 | def test_active_learning(self): 61 | def closest_to_hyperplane(*args): 62 | clf, X_test, n = args[0], args[4], args[-1] 63 | y_raw = clf.decision_function(X_test) 64 | absolute = np.abs(y_raw) 65 | indexes = np.argsort(absolute) 66 | return indexes[:n] 67 | 68 | results = evaluation.fit_predict_update( 69 | self.svm, self.X_train, self.X_tests, 70 | self.y_train, self.y_tests, 71 | self.t_train, self.t_tests, 72 | selectors=[ActiveLearningSelector( 73 | '20%', closest_to_hyperplane)]) 74 | 75 | metrics.print_metrics(results) 76 | 77 | for i in range(1, len(self.y_tests)): 78 | expected = int(results['train_tot'][i - 1] + 79 | results['tot'][i - 1] * 0.2) 80 | actual = results['train_tot'][i] 81 | 82 | self.assertEqual(expected, actual) 83 | 84 | def test_uncertainty_sampling(self): 85 | for clf in (self.svm, self.rf): 86 | results = evaluation.fit_predict_update( 87 | clf, self.X_train, self.X_tests, 88 | self.y_train, self.y_tests, 89 | self.t_train, self.t_tests, 90 | selectors=[UncertaintySamplingSelector('20%')]) 91 | 92 | metrics.print_metrics(results) 93 | 94 | for i in range(1, len(self.y_tests)): 95 | expected = int(results['train_tot'][i - 1] + 96 | results['tot'][i - 1] * 0.2) 97 | actual = results['train_tot'][i] 98 | 99 | self.assertEqual(expected, actual) 100 | -------------------------------------------------------------------------------- /test/test_rejectors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_rejectors.py 5 | ~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for testing rejectors.py. 
8 | 9 | """ 10 | 11 | import unittest 12 | 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.svm import SVC, LinearSVC 15 | 16 | from tesseract import temporal, mock, metrics, rejection, evaluation 17 | from tesseract.rejection import ThresholdRejector 18 | 19 | 20 | class TestRejectors(unittest.TestCase): 21 | def setUp(self): 22 | # Test partitions of 1 year 23 | X, y, t = mock.generate_binary_test_data(10000, '2020') 24 | 25 | splits = temporal.time_aware_train_test_split( 26 | X, y, t, train_size=6, test_size=2, granularity='month') 27 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 28 | 29 | self.X_train = X_train 30 | self.y_train = y_train 31 | self.X_tests = X_tests 32 | self.y_tests = y_tests 33 | self.t_train = t_train 34 | self.t_tests = t_tests 35 | 36 | self.svm = SVC(kernel='linear', probability=False) 37 | self.svm.fit(X_train, y_train) 38 | 39 | self.rf = RandomForestClassifier(n_estimators=101, max_depth=64) 40 | self.rf.fit(X_train, y_train) 41 | 42 | def test_threshold_rejector_rf(self): 43 | t_rejector = ThresholdRejector('<', 0.9) 44 | results = evaluation.fit_predict_update( 45 | self.rf, self.X_train, self.X_tests, 46 | self.y_train, self.y_tests, 47 | self.t_train, self.t_tests, 48 | rejectors=[t_rejector]) 49 | 50 | metrics.print_metrics(results) 51 | 52 | # Check that something was rejected each period, 53 | # more thorough tests are certainly desired! 54 | 55 | for i in range(len(self.y_tests)): 56 | self.assertGreater(results['rejected'][i], 0) 57 | 58 | def test_threshold_rejector_svm_between(self): 59 | t_rejector = ThresholdRejector('><', (-5, 5)) 60 | results = evaluation.fit_predict_update( 61 | self.svm, self.X_train, self.X_tests, 62 | self.y_train, self.y_tests, 63 | self.t_train, self.t_tests, 64 | rejectors=[t_rejector]) 65 | 66 | metrics.print_metrics(results) 67 | 68 | for i in range(len(self.y_tests)): 69 | self.assertGreater(results['rejected'][i], 0) 70 | 71 | def test_threshold_rejector_svm_outside(self): 72 | t_rejector = ThresholdRejector('<>', (-5, 5)) 73 | results = evaluation.fit_predict_update( 74 | self.svm, self.X_train, self.X_tests, 75 | self.y_train, self.y_tests, 76 | self.t_train, self.t_tests, 77 | rejectors=[t_rejector]) 78 | 79 | metrics.print_metrics(results) 80 | 81 | for i in range(len(self.y_tests)): 82 | self.assertGreater(results['rejected'][i], 0) 83 | 84 | 85 | class TestRejection(unittest.TestCase): 86 | def test_thresholds(self): 87 | # Test partitions of 1 year 88 | X, y, t = mock.generate_binary_test_data(10000, '2014', end='2016') 89 | splits = temporal.time_aware_train_test_split( 90 | X, y, t, 12, 1, granularity='month', start_date='2014') 91 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 92 | 93 | clf = LinearSVC() 94 | aa = rejection.alpha_assessment(clf, X_train, y_train, folds=5) 95 | n_quartiles, p_quartiles = rejection.quartiles(aa) 96 | n_threshold, p_threshold = n_quartiles[3], p_quartiles[1] 97 | print(n_threshold, p_threshold) 98 | 99 | rejection_options = {'thresholds': [n_threshold, p_threshold], 100 | 'comparators': ['<', '>']} 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TESSERACT 2 | 3 | As malware evolves over time, the performance of malware detectors tends to degrade. 
Many solutions in the security literature fail to consider the time information associated with the samples while evaluating their classifier which can induce positive bias in the results. 4 | 5 | This repository contains the source code for a prototype implementation of Tesseract. 6 | 7 | Further details can be found in the paper *TESSERACT: Eliminating Experimental Bias in Malware Classification across Space and Time*. F. Pendlebury, F. Pierazzi, R. Jordaney, J. Kinder, and L. Cavallaro. USENIX Sec 2019. Check also `https://s2lab.cs.ucl.ac.uk/projects/tesseract` for up-to-date information on the project, e.g., a talk at USENIX Enigma 2019 at `https://www.usenix.org/conference/enigma2019/presentation/cavallaro`. 8 | 9 | If you end up using Tesseract as part of a project or publication, please include a citation of the latest preprint: 10 | 11 | ```bibtex 12 | @inproceedings{pendlebury2019, 13 | author = {Feargus Pendlebury, Fabio Pierazzi, Roberto Jordaney, Johannes Kinder, and Lorenzo Cavallaro}, 14 | title = {{TESSERACT: Eliminating Experimental Bias in Malware Classification across Space and Time}}, 15 | booktitle = {28th USENIX Security Symposium}, 16 | year = {2019}, 17 | address = {Santa Clara, CA}, 18 | publisher = {USENIX Association}, 19 | note = {USENIX Sec} 20 | } 21 | ``` 22 | 23 | ## Getting Started 24 | 25 | ### Installation 26 | 27 | Tesseract requires Python 3 (preferably >= 3.5) as well as the statistical learning stack of NumPy, SciPy, and Scikit-learn. 28 | 29 | Create virtual environment (recommended) and install tesseract with script `setup.py`: 30 | 31 | ```shell 32 | python3 setup.py install 33 | ``` 34 | 35 | To download the data, run 36 | 37 | ```shell 38 | make data 39 | ``` 40 | 41 | This should download the feature vectors and store them in 42 | `data/processed`. An example that shows how to reproduce the experiments can be found in 43 | `notebooks/reproduce-tesseract.ipynb`. 44 | 45 | ### Usage 46 | 47 | Basic usage, dividing a dataset into time-aware sets and performing a time-aware evaluation. 48 | More complex examples can be found in the `examples/` and `test/` directories. 49 | 50 | ```python 51 | from sklearn.svm import LinearSVC 52 | from tesseract import evaluation, temporal, metrics, mock 53 | 54 | 55 | def main(): 56 | # Generate dummy predictors, labels and timestamps from Gaussians 57 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 58 | 59 | # Partition dataset 60 | splits = temporal.time_aware_train_test_split( 61 | X, y, t, train_size=12, test_size=1, granularity='month') 62 | 63 | # Perform a timeline evaluation 64 | clf = LinearSVC() 65 | results = evaluation.fit_predict_update(clf, *splits) 66 | 67 | # View results 68 | metrics.print_metrics(results) 69 | 70 | # View AUT(F1, 24 months) as a measure of robustness over time 71 | print(metrics.aut(results, 'f1')) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | 77 | ``` 78 | 79 | ## Running the tests 80 | 81 | To run all unittests within the `test/` directory: 82 | 83 | ```shell 84 | python -m unittest 85 | ``` 86 | 87 | ## Current Working State 88 | 89 | Tesseract is still a research prototype and subject to breaking changes, although following a recent redesign we 90 | expect such changes to be kept to a minimum. Due to this redesign there may also be discrepancies between the current 91 | implementation and §6 of the Tesseract manuscript---although we are aiming to soon publish a short technical report 92 | that details the new design. 
We know this can be frustrating and thank you for your patience! 93 | 94 | If you encounter a bug or have a feature request, please feel free to contact the maintainer directly 95 | at `lorenzo.cavallaro [at] ucl.ac.uk` and cc `fabio.pierazzi [at] kcl.ac.uk`. 96 | 97 | 98 | ## Acknowledgements 99 | 100 | This project has been generously sponsored by the UK EP/L022710/1 and EP/P009301/1 EPSRC research grants. 101 | -------------------------------------------------------------------------------- /tesseract/selection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | selection.py 5 | ~~~~~~~~~~~~ 6 | 7 | # TODO | Add module description 8 | 9 | """ 10 | 11 | import numpy as np 12 | 13 | from tesseract import utils 14 | from tesseract.evaluation import TrackingStage 15 | 16 | 17 | class Selector(TrackingStage): 18 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 19 | super().__init__(schedule, tracking, interaction) 20 | self.selection_history = [] 21 | 22 | def query_wrapper(self, clf, X_train, y_train, t_train, 23 | X_test, y_test, t_test, previously_selected): 24 | # Pass parameters straight through to query implementation 25 | selected = self.query(clf, X_train, y_train, t_train, 26 | X_test, y_test, t_test, previously_selected) 27 | 28 | if self.tracking: 29 | self.selection_history.append(selected) 30 | 31 | # Merge results with those of previous selectors 32 | selected = self.merge_results(previously_selected, selected) 33 | 34 | return np.array(selected) 35 | 36 | def query(self, clf, X_train, y_train, t_train, 37 | X_test, y_test, t_test, previously_selected): 38 | raise NotImplementedError('Selector must be subclassed') 39 | 40 | 41 | class FullRetrainingSelector(Selector): 42 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 43 | super().__init__(schedule, tracking, interaction) 44 | 45 | def query(self, clf, X_train, y_train, t_train, 46 | X_test, y_test, t_test, previously_selected): 47 | return range(len(y_test)) 48 | 49 | 50 | class ActiveLearningSelector(Selector): 51 | def __init__(self, n, query_strategy, schedule=1, 52 | tracking=True, interaction='intersection'): 53 | super().__init__(schedule, tracking, interaction) 54 | self.n = n 55 | self.query_strategy = query_strategy 56 | 57 | def query(self, clf, X_train, y_train, t_train, 58 | X_test, y_test, t_test, previously_selected): 59 | # Parse percentage if string passed in as n (eg. '20%') 60 | m = int(utils.parse_percentage(self.n) * len(y_test) 61 | if isinstance(self.n, str) else self.n) 62 | return self.query_strategy(clf, X_train, y_train, t_train, 63 | X_test, y_test, t_test, 64 | previously_selected, m) 65 | 66 | 67 | class UncertaintySamplingSelector(Selector): 68 | def __init__(self, n, schedule=1, tracking=True, 69 | interaction='intersection'): 70 | super().__init__(schedule, tracking, interaction) 71 | self.n = n 72 | 73 | def query(self, clf, X_train, y_train, t_train, 74 | X_test, y_test, t_test, previously_selected): 75 | # Parse percentage if string passed in as n (eg. '20%') 76 | m = int(utils.parse_percentage(self.n) * len(y_test) 77 | if isinstance(self.n, str) else self.n) 78 | 79 | # e.g. clf is a RandomForestsClassifier or SVC(probability=True) 80 | if hasattr(clf, 'predict_proba'): 81 | y_probs = clf.predict_proba(X_test) 82 | selected_indexes = probabilistic_uncertainty(y_probs, m) 83 | 84 | # e.g. 
clf is a LinearSVC or SVC 85 | elif hasattr(clf, 'decision_function'): 86 | y_raw = clf.decision_function(X_test) 87 | selected_indexes = closest_to_hyperplane(y_raw, m) 88 | 89 | else: 90 | raise TypeError( 91 | 'Unsure how to handle uncertainty sampling with ' 92 | 'classifier of type {}.'.format(clf.__class__)) 93 | 94 | return selected_indexes 95 | 96 | 97 | def closest_to_hyperplane(distances, n): 98 | """Perform uncertainty sampling using distance from the hyperplane. 99 | 100 | Uncertainty sampling with SVMs is equivalent to selecting the samples 101 | closest to the decision boundary (hyperplane in binary classification). 102 | 103 | This is shown by Tong and Koller [ICML 2000]: 104 | https://dl.acm.org/citation.cfm?id=944793 105 | 106 | The intuition is also well explained by Kremer, Pederson, Igel [WIREs 2014]: 107 | http://image.diku.dk/jank/papers/WIREs2014.pdf 108 | 109 | The process for selecting the objects is as follows: 110 | 111 | 1. Consider only absolute distances. 112 | 2. Argsort from least distance to greatest. 113 | 3. Take the n smallest (closest to the hyperplane). 114 | 115 | Args: 116 | distances: The list of distances to use as metrics. 117 | n: The number of samples to mark as 'most uncertain'. 118 | 119 | Returns: 120 | list: The indexes corresponding to the 'most uncertain' samples. 121 | 122 | """ 123 | absolute = np.abs(distances) 124 | indexes = np.argsort(absolute) 125 | return indexes[:n] 126 | 127 | 128 | def probabilistic_uncertainty(probs, n): 129 | """Perform uncertainty sampling using least confidence. 130 | 131 | An excellent discussion of active learning strategies including a 132 | comparison of three different uncertainty measures: least confidence, 133 | margin sampling and entropy (all of which are equivalent in binary 134 | classification) can be found in Burr Settles' literature review: 135 | 136 | http://burrsettles.com/pub/settles.activelearning.pdf 137 | 138 | The process for selecting the objects is as follows: 139 | 140 | 1. Consider 'uncertainty' only (1 - the highest class probability). 141 | 2. Argsort and reverse to sort from least to most certain. 142 | 3. Take the n smallest (most uncertain). 143 | 144 | Args: 145 | probs: The list of probabilities to use as metrics. 146 | n: The number of samples to mark as 'most uncertain'. 147 | 148 | Returns: 149 | list: The indexes corresponding to the 'most uncertain' samples. 150 | 151 | """ 152 | uncertainty = np.array([1 - np.max(x) for x in probs]) 153 | indexes = np.argsort(uncertainty)[::-1] 154 | return indexes[:n] 155 | -------------------------------------------------------------------------------- /tesseract/temporal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | temporal.py 5 | ~~~~~~~~~~~ 6 | 7 | A module for working with and running time-aware evaluations. Most of the 8 | functionality of this module falls into one of two categories: working with 9 | arrays of datetimes or datetime-aligned series of data, and aggregating the 10 | steps of the ML pipeline needed to conduct sound, time-aware evaluations. 11 | 12 | """ 13 | import bisect 14 | import operator 15 | 16 | import numpy as np 17 | from dateutil.relativedelta import relativedelta 18 | 19 | import tesseract.utils as utils 20 | 21 | 22 | def assert_train_test_temporal_consistency(t_train, t_test): 23 | """Helper function to assert train-test temporal constraint (C1). 
24 | 25 | All objects in the training set need to be temporally anterior to all 26 | objects in the testing set. Violating this constraint will positively bias 27 | the results by integrating "future" knowledge into the classifier. 28 | 29 | Args: 30 | t_train: An array of datetimes corresponding to the training set. 31 | t_test: An array of datetime corresponding to the testing set. 32 | 33 | Returns: 34 | bool: False if the partitioned dataset does _not_ adhere to C1, 35 | True otherwise. 36 | 37 | """ 38 | for train_date in t_train: 39 | for test_date in t_test: 40 | if train_date > test_date: 41 | return False 42 | return True 43 | 44 | 45 | def assert_positive_negative_temporal_consistency(y, t, month_variance=1): 46 | """Helper function to assert malware-goodware temporal constraint (C2). 47 | 48 | In any given testing period, all testing objects must be from the time 49 | window under test. In the malware domain this constraint has often been 50 | violated so that malware and goodware come from different time periods. 51 | 52 | If this is the case, it becomes impossible to tell whether a 53 | high-performing classifier is discriminating between malicious and benign 54 | objects or between old and new applications. 55 | 56 | Args: 57 | y: An array of ground-truth labels for each observation. 58 | t: An array of datetimes for each observation (aligned with y). 59 | month_variance: All malware and goodware should be between this many 60 | months. 61 | 62 | Returns: 63 | bool: False if the malware and goodware do not adhere to C2, 64 | True otherwise 65 | 66 | """ 67 | positive = np.where(y == 1)[0] 68 | negative = np.where(y != 1)[0] 69 | positive_dates = t[positive] 70 | negative_dates = t[negative] 71 | 72 | for pos_date in positive_dates: 73 | for neg_date in negative_dates: 74 | if month_difference(pos_date, neg_date) > month_variance: 75 | return False 76 | return True 77 | 78 | 79 | def month_difference(d1, d2): 80 | """Get the difference in months between two datetimes.""" 81 | return (d1.year - d2.year) * 12 + d1.month - d2.month 82 | 83 | 84 | def time_aware_train_test_split(X, y, t, train_size, test_size, 85 | granularity, start_date=None): 86 | """Partition a dataset composed of time-labelled objects. 87 | 88 | Args: 89 | X (np.ndarray, csr_matrix): Multi-dimensional array of predictors. 90 | y (np.ndarray): Array of output labels. 91 | t (np.ndarray): Array of timestamp tags. 92 | train_size (int): The training window size W (in τ). 93 | test_size (int): The testing window size Δ (in τ). 94 | granularity (str): The unit of time τ, used to denote the window size. 95 | Acceptable values are 'year|quarter|month|week|day'. 96 | start_date (date): The date to begin partioning from (eg. to align with 97 | the start of the year). 98 | 99 | Returns: 100 | (np.ndarray, list, np.ndarray, list, np.ndarray, list): 101 | Training partition of predictors X. 102 | List of testing partitions of predictors X. 103 | Training partition of output variables y. 104 | List of testing partitions of predictors y. 105 | Training partition of meta t. 106 | List of testing partitions of meta t. 
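A minimal usage sketch with mock data (the window sizes are illustrative):

    from tesseract import mock

    X, y, t = mock.generate_binary_test_data(10000, '2014', '2016')
    X_train, X_tests, y_train, y_tests, t_train, t_tests = \
        time_aware_train_test_split(
            X, y, t, train_size=12, test_size=1, granularity='month')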
107 | 108 | """ 109 | # Get partitioned indexes 110 | train, tests = time_aware_indexes(t, train_size, test_size, 111 | granularity, start_date) 112 | 113 | # Partition predictors and labels 114 | X_actual, y_actual, t_actual = X[train], y[train], t[train] 115 | 116 | X_tests = [X[index_set] for index_set in tests] 117 | y_tests = [y[index_set] for index_set in tests] 118 | t_tests = [t[index_set] for index_set in tests] 119 | 120 | return X_actual, X_tests, y_actual, y_tests, t_actual, t_tests 121 | 122 | 123 | def time_aware_indexes(t, train_size, test_size, granularity, start_date=None): 124 | """Return a list of indexes that partition the list t by time. 125 | 126 | Sorts the list of dates t before dividing into training and testing 127 | partitions, ensuring a 'history-aware' split in the ensuing classification 128 | task. 129 | 130 | 131 | Args: 132 | t (np.ndarray): Array of timestamp tags. 133 | train_size (int): The training window size W (in τ). 134 | test_size (int): The testing window size Δ (in τ). 135 | granularity (str): The unit of time τ, used to denote the window size. 136 | Acceptable values are 'year|quarter|month|week|day'. 137 | start_date (date): The date to begin partioning from (eg. to align with 138 | the start of the year). 139 | 140 | Returns: 141 | (list, list): 142 | Indexing for the training partition. 143 | List of indexings for the testing partitions. 144 | 145 | """ 146 | # Order the dates as well as their original positions 147 | with_indexes = zip(t, range(len(t))) 148 | ordered = sorted(with_indexes, key=operator.itemgetter(0)) 149 | 150 | # Split out the dates from the indexes 151 | dates = [tup[0] for tup in ordered] 152 | indexes = [tup[1] for tup in ordered] 153 | 154 | # Get earliest date 155 | start_date = utils.resolve_date(start_date) if start_date else ordered[0][0] 156 | 157 | # Slice out training partition 158 | boundary = start_date + get_relative_delta(train_size, granularity) 159 | to_idx = bisect.bisect_left(dates, boundary) 160 | train = indexes[:to_idx] 161 | 162 | tests = [] 163 | # Slice out testing partitions 164 | while to_idx < len(indexes): 165 | boundary += get_relative_delta(test_size, granularity) 166 | from_idx = to_idx 167 | to_idx = bisect.bisect_left(dates, boundary) 168 | tests.append(indexes[from_idx:to_idx]) 169 | 170 | return train, tests 171 | 172 | 173 | def time_aware_partition(t, proportion): 174 | """Partition an array of dates based on the given proportion. 175 | 176 | The set of timestamps will be bisected with the left bisection sized by 177 | the given proportion. 178 | 179 | Args: 180 | t: An array of datetimes. 181 | proportion: The proportion by which to split the array. 182 | 183 | Returns: 184 | tuple: The two bisections of the array. 185 | """ 186 | # Order the dates as well as their original positions 187 | indexes = np.argsort(t) 188 | 189 | # Divide ordered set in two 190 | boundary = int(proportion * len(indexes)) 191 | 192 | return indexes[:boundary], indexes[boundary:] 193 | 194 | 195 | def temporal_slice(X, y, t): 196 | raise NotImplementedError 197 | 198 | 199 | def get_relative_delta(offset, granularity): 200 | """Get delta of size 'granularity'. 201 | 202 | Args: 203 | offset: The number of time units to offset by. 204 | granularity: The unit of time to offset by, expects one of 205 | 'year', 'quarter', 'month', 'week', 'day'. 206 | 207 | Returns: 208 | The timedelta equivalent to offset * granularity. 
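For example (singular and plural granularities are both accepted):

    get_relative_delta(2, 'quarter')   # relativedelta(months=+6)
    get_relative_delta(3, 'weeks')     # relativedelta(weeks=+3)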
209 | 210 | """ 211 | # Make allowances for year(s), quarter(s), month(s), week(s), day(s) 212 | granularity = granularity[:-1] if granularity[-1] == 's' else granularity 213 | try: 214 | return { 215 | 'year': relativedelta(years=offset), 216 | 'quarter': relativedelta(months=3 * offset), 217 | 'month': relativedelta(months=offset), 218 | 'week': relativedelta(weeks=offset), 219 | 'day': relativedelta(days=offset), 220 | }[granularity] 221 | except KeyError: 222 | raise ValueError('granularity not recognised, try: ' 223 | 'year|quarter|month|week|day') 224 | -------------------------------------------------------------------------------- /tesseract/rejection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | rejection.py 5 | ~~~~~~~~~~~~ 6 | 7 | # TODO | Add module description 8 | 9 | """ 10 | import logging 11 | import os 12 | 13 | import numpy as np 14 | from sklearn.model_selection import KFold, cross_val_predict 15 | from tqdm import tqdm 16 | 17 | from tesseract import utils 18 | from tesseract.evaluation import TrackingStage 19 | from tesseract.temporal import time_aware_partition 20 | 21 | 22 | class Rejector(TrackingStage): 23 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 24 | super().__init__(schedule, tracking, interaction) 25 | self.kept_history = [] 26 | self.rejection_history = [] 27 | 28 | def reject_wrapper(self, clf, X_train, y_train, t_train, X_test, 29 | y_test, t_test, previously_kept, previously_rejected): 30 | # Pass parameters straight through to reject implementation 31 | kept, rejected = self.reject(clf, X_train, y_train, t_train, 32 | X_test, y_test, t_test, previously_kept, 33 | previously_rejected) 34 | 35 | if self.tracking: 36 | self.kept_history.append(kept) 37 | self.rejection_history.append(rejected) 38 | 39 | # Merge results with those of previous rejectors 40 | kept = self.merge_results(previously_kept, kept) 41 | rejected = self.merge_results(previously_rejected, rejected) 42 | 43 | return np.array(kept), np.array(rejected) 44 | 45 | def reject(self, clf, X_train, y_train, t_train, 46 | X_test, y_test, t_test, previously_kept, previously_rejected): 47 | raise NotImplementedError('Rejector must be subclassed') 48 | 49 | 50 | class ThresholdRejector(Rejector): 51 | def __init__(self, operator, thresholds, point_score='credibility', 52 | schedule=1, tracking=True, interaction='intersection'): 53 | super().__init__(schedule, tracking, interaction) 54 | 55 | self._single_threshold_ops = ('<', 'lesser', 56 | '>', 'greater') 57 | self._double_threshold_ops = ('<>', 'outside', 58 | '><', 'between') 59 | 60 | self._valid_operators = (self._single_threshold_ops + 61 | self._double_threshold_ops) 62 | 63 | self._valid_point_scores = ('credibility', 'confidence') 64 | 65 | self._check_params( 66 | operator, thresholds, point_score, tracking, interaction) 67 | 68 | self.point_score = point_score 69 | self.thresholds = thresholds 70 | self.operator = operator 71 | 72 | if hasattr(thresholds, '__len__'): 73 | self.threshold = max(thresholds) 74 | self.lower_threshold = min(thresholds) 75 | else: 76 | self.threshold = thresholds 77 | self.lower_threshold = None 78 | 79 | def reject(self, clf, X_train, y_train, t_train, 80 | X_test, y_test, t_test, previously_kept, previously_rejected): 81 | get_score = utils.select_prediction_function(clf) 82 | 83 | y_scores = get_score(X_test) 84 | 85 | # Resolve arrays where the scoring function outputs per-class scores 86 | 87 
| if hasattr(y_scores[0], '__len__'): 88 | if self.point_score == 'credibility': 89 | # credibility = the highest score 90 | y_scores = np.array([max(v) for v in y_scores]) 91 | elif self.point_score == 'confidence': 92 | # confidence = the highest score minus the next highest 93 | y_scores = np.array([max(v) - np.partition(v, -2)[-2] 94 | for v in y_scores]) 95 | 96 | if self.operator in ('<', 'lesser'): 97 | rejected = np.where(y_scores < self.threshold)[0] 98 | 99 | elif self.operator in ('>', 'greater'): 100 | rejected = np.where(y_scores > self.threshold)[0] 101 | 102 | elif self.operator in ('<>', 'outside'): 103 | rejected = np.where(np.logical_or(y_scores < self.lower_threshold, 104 | y_scores > self.threshold))[0] 105 | elif self.operator in ('><', 'between'): 106 | rejected = np.where(np.logical_and(y_scores > self.lower_threshold, 107 | y_scores < self.threshold))[0] 108 | else: 109 | raise ValueError('Unrecognised comparator for rejection') 110 | # Add indexes that didn't pass to list of quarantined samples 111 | 112 | kept = np.setxor1d(rejected, np.arange(len(y_scores))) 113 | 114 | return kept, rejected 115 | 116 | def _check_params(self, operator, thresholds, 117 | point_score, tracking, interaction): 118 | 119 | if hasattr(thresholds, '__len__') and len(thresholds) > 2: 120 | raise ValueError( 121 | 'ThresholdRejector will only accept a ' 122 | 'maximum of 2 thresholds (one upper, one lower)') 123 | 124 | if operator not in self._valid_operators: 125 | raise ValueError( 126 | 'Threshold comparison operator must be one of the ' 127 | 'following: {}'.format(self._valid_operators)) 128 | 129 | if point_score not in self._valid_point_scores: 130 | raise ValueError( 131 | 'Point scores must be one of the ' 132 | 'following: {}'.format(self._valid_point_scores)) 133 | 134 | if (operator in self._double_threshold_ops and 135 | not hasattr(thresholds, '__len__')): 136 | raise ValueError('"{}" expects two thresholds'.format(operator)) 137 | 138 | if (operator in self._single_threshold_ops and 139 | hasattr(thresholds, '__len__')): 140 | raise ValueError('"{}" expects a single threshold'.format(operator)) 141 | 142 | 143 | def quartiles(alpha_assessment_results, subkey='incorrect'): 144 | """Considering an alpha assessment, return the quartiles from the results. 145 | 146 | In well-separated alpha assessment results, quartiles can be useful for 147 | finding a good threshold (below which, predictions are discarded). 148 | 149 | Typically thresholds are Q3 of incorrect predictions and Q1 of correct 150 | predictions. 151 | 152 | Args: 153 | alpha_assessment_results: The results to derive quartiles from. 154 | subkey: 'correct' or 'incorrect'. 155 | 156 | Returns: 157 | tuple: The quartiles as they relate to the negative and positive class. 158 | 159 | """ 160 | percentiles = [0, 25, 50, 75, 100] 161 | negative = alpha_assessment_results['negative_predictions'][subkey] 162 | positive = alpha_assessment_results['positive_predictions'][subkey] 163 | neg_quartiles = [np.percentile(negative, p) for p in percentiles] 164 | pos_quartiles = [np.percentile(positive, p) for p in percentiles] 165 | return neg_quartiles, pos_quartiles 166 | 167 | 168 | def alpha_assessment(clf, X, y, folds=10): 169 | """Perform an alpha assessment on the given classifier and data. 170 | 171 | An alpha assessment is an assessment used in conformal evaluation to 172 | visually discern how separable the classifier's correct and incorrect 173 | prediction scores are. 
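A minimal sketch of how the assessment might be combined with quartiles() below (the classifier and mock data are illustrative, not from the original examples):

    from sklearn.svm import LinearSVC
    from tesseract import mock

    X, y, t = mock.generate_binary_test_data(2000, '2014')
    assessment = alpha_assessment(LinearSVC(), X, y, folds=10)

    # Quartiles of the correct/incorrect score distributions can then inform
    # the thresholds passed to a ThresholdRejector
    neg_q, pos_q = quartiles(assessment, subkey='incorrect')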
174 | 175 | Highly separable scores allow the user to control a threshold below which 176 | they can designate predictions as being low-confidence, unreliable or even 177 | rejected. In the domain of malware classification, the rate at which a 178 | greater proportion of samples appear _below_ the threshold is indicative 179 | of the rate at which concept drift is occuring. 180 | 181 | A formal description and thorough evaluation of its uses is given in the 182 | Transcend paper by Jordaney et. al [USENIX 2017]: 183 | https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-jordaney.pdf 184 | 185 | Args: 186 | clf: The classifier to use to perform the assessment. 187 | X: An array of predictors. 188 | y: An array of output labels aligned with X. 189 | folds: The number of folds to perform during the K-fold. 190 | 191 | Returns: 192 | 193 | """ 194 | if hasattr(clf, 'predict_proba'): 195 | f = 'predict_proba' 196 | elif hasattr(clf, 'decision_function'): 197 | f = 'decision_function' 198 | else: 199 | raise TypeError( 200 | 'Unsure how to handle scoring with ' 201 | 'classifier of type {}.'.format(clf.__class__)) 202 | 203 | # random_state was set to 22 however removed due to ValueError since shuffle=False 204 | cv = KFold(n_splits=folds, shuffle=False) 205 | y_pred = cross_val_predict(clf, X, y, cv=cv) 206 | y_score = cross_val_predict(clf, X, y, cv=cv, method=f) 207 | 208 | negative = np.where(y_pred == 0)[0] 209 | positive = np.where(y_pred == 1)[0] 210 | correct = np.where(y_pred == y)[0] 211 | incorrect = np.where(y_pred != y)[0] 212 | 213 | return { 214 | 'negative_predictions': { 215 | 'correct': y_score[np.intersect1d(negative, correct)], 216 | 'incorrect': y_score[np.intersect1d(negative, incorrect)]}, 217 | 'positive_predictions': { 218 | 'correct': y_score[np.intersect1d(positive, correct)], 219 | 'incorrect': y_score[np.intersect1d(positive, incorrect)]} 220 | } 221 | -------------------------------------------------------------------------------- /tesseract/plot_utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import ujson as json 4 | from datetime import datetime 5 | 6 | import __main__ as main 7 | import numpy as np 8 | import os 9 | import seaborn as sns 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.feature_extraction import DictVectorizer 12 | from sklearn.svm import LinearSVC 13 | 14 | from tesseract import temporal, spatial 15 | 16 | line_kwargs = {'linewidth': 1, 'markersize': 5} 17 | 18 | force = False 19 | 20 | 21 | # x_tick_size = 12 22 | # y_tick_size = 14 23 | # ax_label_size = 18 24 | # fig_title_size = 20 25 | 26 | def set_style(): 27 | sns.set_context('paper') 28 | sns.set(font='serif') 29 | 30 | sns.set('paper', font='serif', style='ticks', rc={ 31 | 'font.family': 'serif', 32 | 'legend.fontsize': 'medium', 33 | 'xtick.labelsize': 'medium', 34 | 'ytick.labelsize': 'medium', 35 | 'axes.labelsize': 'x-large', 36 | 'axes.titlesize': 'x-large', 37 | 'axes.labelpad': 6.0, 38 | 'figure.titlesize': 'x-large', 39 | 'text.usetex': True, 40 | 'text.latex.unicode': True, 41 | 'figure.figsize': (7.2, 4.45), 42 | 'figure.dpi': 1200, 43 | 'savefig.dpi': 1200 44 | }) 45 | 46 | 47 | def get_dataset(approach): 48 | return {'drebin': 'drebin-parrot-v2-down', 49 | 'mamadroid': 'mamadroidPackages-parrot-v2-down'}[approach] 50 | 51 | 52 | def get_classifier(approach, balance=False): 53 | kwargs = {'class_weights': 'balanced'} if balance else {} 54 | if 
approach == 'drebin': 55 | return LinearSVC(**kwargs) 56 | if approach == 'mamadroid': 57 | return RandomForestClassifier(n_estimators=101, max_depth=64, 58 | n_jobs=-1, **kwargs) 59 | raise ValueError 60 | 61 | 62 | def load_features(feature_set): 63 | fname = '../../features/{}-features'.format(feature_set) 64 | logging.info('Loading features...') 65 | with open('{}-X.json'.format(fname), 'rt') as f: 66 | X = json.load(f) 67 | [o.pop('sha256') for o in X] 68 | 69 | with open('{}-Y.json'.format(fname), 'rt') as f: 70 | y = json.load(f) 71 | y = [o[0] for o in y] 72 | 73 | with open('{}-meta.json'.format(fname), 'rt') as f: 74 | t = json.load(f) 75 | t = [o['dex_date'] for o in t] 76 | t = [datetime.strptime(o, '%Y-%m-%dT%H:%M:%S') for o in t] 77 | 78 | return X, y, t 79 | 80 | 81 | def load_meta(feature_set): 82 | logging.info('Loading meta...') 83 | with open('../../features/{}-features-meta.json'.format(feature_set), 84 | 'rt') as f: 85 | return json.load(f) 86 | 87 | 88 | def enforce_ratios(X, y, t): 89 | train, tests = temporal.time_aware_indexes(t, 0, 1, 'month', '2014') 90 | assert len(tests) == 36 91 | 92 | downsampled = None 93 | print('{:^6} {:^6} {:^6} {:^6}'.format('MW', 'GW', 'TOT', '%MW')) 94 | 95 | for period_idxs in tests: 96 | period_idxs = np.array(period_idxs) 97 | y_period = y[period_idxs] 98 | 99 | # IF DOWNSAMPLING 100 | selected_idxs = spatial.downsample_to_rate(y_period) 101 | selected = period_idxs[selected_idxs] 102 | 103 | # ELSE 104 | # selected = period_idxs 105 | 106 | labels = y[selected] 107 | tot = len(labels) 108 | p = sum(labels) 109 | n = tot - sum(labels) 110 | print('{:>6} {:>6} {:>6} {:>6.1f}%'.format(p, n, tot, 100 * p / tot)) 111 | 112 | if downsampled is None: 113 | downsampled = selected 114 | else: 115 | downsampled = np.hstack((downsampled, selected)) 116 | 117 | labels = y[downsampled] 118 | tot = len(labels) 119 | p = sum(labels) 120 | n = tot - sum(labels) 121 | print('Overall') 122 | print('{:>6} {:>6} {:>6} {:>6.1f}%'.format(p, n, tot, 100 * p / tot)) 123 | 124 | return downsampled 125 | 126 | 127 | def vectorize(X, y, t): 128 | """Transform input data into appropriate forms for an sklearn classifier. 129 | 130 | Args: 131 | X (list): A list of dictionaries of input features for each sample. 132 | y (list): A list of ground truths for the data. 133 | t (list): A list of datetimes for the data. 
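A minimal sketch, assuming the JSON feature files expected by load_features() are available locally (the dataset name is illustrative):

    X, y, t = load_features('drebin-parrot-v2-down')
    X, y, t = vectorize(X, y, t)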
134 | 135 | """ 136 | logging.info('Vectorizing features...') 137 | vec = DictVectorizer() 138 | X = vec.fit_transform(X) 139 | y = np.asarray(y) 140 | t = np.asarray(t) 141 | return X, y, t 142 | 143 | 144 | def style_axes(axes, periods=10): 145 | for i, ax in enumerate(axes): 146 | # Labels 147 | ax.set_xlabel('Testing period (month)') # , fontsize=ax_label_size) 148 | # ax.set_ylabel('Score') # , fontsize=ax_label_size) 149 | ax.set_ylabel('') 150 | 151 | # Ticks 152 | ax.set_xticks(range(1, periods + 1)) 153 | ax.set_yticks(np.arange(0, 1.1, 0.1)) 154 | 155 | labels = [str(x + 1) if x % 3 == 0 else '' for x in range(periods + 1)] 156 | ax.set_xticklabels(labels) 157 | 158 | ax.tick_params(axis='x', which='major') # , labelsize=x_tick_size) 159 | ax.tick_params(axis='y', which='major') # , labelsize=y_tick_size) 160 | 161 | ax.yaxis.grid(b=True, which='major', color='lightgrey', linestyle='-') 162 | 163 | # Axe limits 164 | ax.set_xlim(0, periods) 165 | ax.set_ylim(0, 1) 166 | 167 | sns.despine(ax=ax, top=True, right=True, bottom=False, left=False) 168 | 169 | 170 | def plot_f1(ax, data, alpha=1, neg=False, label=None, color='dodgerblue', 171 | marker='o'): 172 | if label is None: 173 | label = 'F1 (gw)' if neg else 'F1 (mw)' 174 | color = '#BCDEFE' if neg else color 175 | series = data['f1_n'] if neg else data['f1'] 176 | ax.plot(data.index, series, label=label, alpha=alpha, marker=marker, 177 | c=color, markeredgewidth=1, **line_kwargs) 178 | 179 | 180 | def plot_roc(ax, data, alpha=1, label=None, color='dodgerblue', 181 | marker='o'): 182 | if label is None: 183 | label = 'AUC ROC' 184 | series = data['auc_roc'] 185 | ax.plot(data.index, series, label=label, alpha=alpha, marker=marker, 186 | c=color, markeredgewidth=1, **line_kwargs) 187 | 188 | 189 | def plot_f1_col(ax, data, alpha=1, neg=False, label=None, color='dodgerblue', 190 | marker='o'): 191 | if label is None: 192 | label = 'F1 (gw)' if neg else 'F1 (mw)' 193 | series = data['f1_n'] if neg else data['f1'] 194 | ax.plot(data.index, series, label=label, alpha=alpha, marker=marker, 195 | c=color, markeredgewidth=1, **line_kwargs) 196 | 197 | 198 | def plot_recall(ax, data, alpha=1, neg=False, color='red', marker='^'): 199 | color = '#FDB2B3' if neg else color 200 | label = 'Recall (gw)' if neg else 'Recall (mw)' 201 | series = data['recall_n'] if neg else data['recall'] 202 | ax.plot(data.index, series, label=label, alpha=alpha, 203 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 204 | 205 | 206 | def plot_precision(ax, data, alpha=1, neg=False, color='orange', marker='s'): 207 | color = '#FEE2B5' if neg else color 208 | label = 'Precision (gw)' if neg else 'Precision (mw)' 209 | series = data['precision_n'] if neg else data['precision'] 210 | ax.plot(data.index, series, label=label, alpha=alpha, 211 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 212 | 213 | 214 | def fill_under_f1(ax, data, alpha=1, neg=False): 215 | label = 'F1 (gw)' if neg else 'F1 (mw)' 216 | series = data['f1_n'] if neg else data['f1'] 217 | ax.fill_between(data.index, series, 218 | label='AUT({}, 24 months)'.format(label), 219 | alpha=alpha, facecolor='none', hatch='//', 220 | edgecolor='#BCDEFE', rasterized=True) 221 | 222 | 223 | def plot_old_f1(ax, data, alpha=1, neg=False, label=None, 224 | color='#C0C0C0', marker=''): 225 | if label is None: 226 | label = 'F1 (gw)' if neg else 'F1 (mw)' 227 | series = data['f1_n'] if neg else data['f1'] 228 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 229 | 
marker=marker, c=color, markeredgewidth=1, linewidth=2) 230 | 231 | 232 | def plot_old_metric(ax, data, metric, alpha=1, neg=False, label=None, 233 | color='#C0C0C0', marker=''): 234 | if label is None: 235 | label = metric + ' (gw)' if neg else metric + ' (mw)' 236 | label = label.title() 237 | series = data[metric + '_n'] if neg else data[metric] 238 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 239 | marker=marker, c=color, markeredgewidth=1, linewidth=2) 240 | 241 | 242 | def plot_cv_mean(ax, data, alpha=1): 243 | ax.axhline(y=float(data), linestyle='--', linewidth=1, c='red', 244 | alpha=alpha, label='F1 (10-fold CV)') 245 | 246 | 247 | def plot_x_intercept(ax, data, label='', c='limegreen', alpha=1, linewidth=1): 248 | ax.axvline(x=float(data), linestyle='--', linewidth=linewidth, c=c, 249 | alpha=alpha, label=label) 250 | 251 | 252 | def plot_prf(ax, results, alpha=1, neg=False): 253 | plot_recall(ax, results, alpha, neg) 254 | plot_precision(ax, results, alpha, neg) 255 | plot_f1(ax, results, alpha, neg) 256 | 257 | 258 | def add_legend(ax, loc='lower left'): 259 | lines = ax.get_lines() 260 | legend = ax.legend(frameon=True, handles=lines, loc=loc, prop={'size': 10}) 261 | legend.get_frame().set_facecolor('#FFFFFF') 262 | legend.get_frame().set_linewidth(0) 263 | return legend 264 | 265 | 266 | def set_title_sc(ax, text): 267 | text = text.replace('%', '\\%') # Make TeX-safe 268 | ax.set_title('\\textsf{{\\textsc{{{}}}}}'.format(text)) 269 | 270 | 271 | def plotname(): 272 | return os.path.splitext(os.path.basename(main.__file__))[0] 273 | 274 | 275 | def save_images(plt, plot_name=None): 276 | plt.tight_layout() 277 | plot_name = plotname() if plot_name is None else plot_name 278 | plt.savefig('./images/png/{}.png'.format(plot_name)) 279 | plt.savefig('./images/pdf/{}.pdf'.format(plot_name)) 280 | plt.savefig('./images/eps/{}.eps'.format(plot_name)) 281 | 282 | 283 | def parse_args(): 284 | global force 285 | p = argparse.ArgumentParser() 286 | p.add_argument('-f', '--force', action='store_true', help='Rerun all data') 287 | args = p.parse_args() 288 | force = args.force 289 | return args 290 | 291 | 292 | parse_args() 293 | -------------------------------------------------------------------------------- /tesseract/viz.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import seaborn as sns 7 | from collections import defaultdict 8 | 9 | # TODO | Remove pandas dependency 10 | 11 | line_kwargs = {'linewidth': 1, 'markersize': 3} 12 | 13 | 14 | # x_tick_size = 12 15 | # y_tick_size = 14 16 | # ax_label_size = 18 17 | # fig_title_size = 20 18 | 19 | def plot_decay(results, fill=True, titles=None, means=None, reject=False): 20 | # ------------------------------------------ # 21 | # Plotting prologue # 22 | # ------------------------------------------ # 23 | 24 | results = [results] if isinstance(results, dict) else results 25 | titles = titles if titles else [''] * len(results) 26 | means = means if means else [None] * len(results) 27 | 28 | # FIXME | This is all a bit of a naff hack from before the redesign, 29 | # FIXME | when there was a dependency on Pandas, remove as soon as possible 30 | 31 | for i in range(len(results)): 32 | # del results[i]['auc_roc'] # Otherwise hampers the DataFrame conversion 33 | print(len(results[i]['f1'])) 34 | # results[i]['f1_b'], results[i]['f1_r'], results[i]['reject_total_perc'] = [], 
[], [] 35 | # for j in range(len(results[i]['transcend'])): 36 | # results[i]['f1_b'].append(results[i]['transcend'][j]['f1_b']) 37 | # results[i]['f1_r'].append(results[i]['transcend'][j]['f1_r']) 38 | # results[i]['reject_total_perc'].append(results[i]['transcend'][j]['reject_total_perc']) 39 | # del results[i]['transcend'] 40 | results[i] = pd.DataFrame(dict(results[i]), 41 | index=range(1, len(results[i]['f1']) + 1)) 42 | 43 | # End of naffness 44 | 45 | set_style() 46 | fig, axes = plt.subplots(1, len(results)) 47 | 48 | axes = axes if hasattr(axes, '__iter__') else (axes,) 49 | 50 | # ------------------------------------------ # 51 | # Subplots # 52 | # ------------------------------------------ # 53 | 54 | for res, ax, title, mean in zip(results, axes, titles, means): 55 | # plot_prf(ax, res, 0.3, neg=True) 56 | plot_prf(ax, res) 57 | if mean is not None: 58 | plot_cv_mean(ax, mean) 59 | if fill: 60 | fill_under_f1(ax, res) 61 | if reject: 62 | plot_baseline_f1(ax, res) 63 | plot_rej_f1(ax, res) 64 | plot_rejected(ax, res) 65 | ax.set_title(title) 66 | 67 | # Legend 68 | add_legend(axes[0]) 69 | 70 | # ------------------------------------------ # 71 | # Plotting epilogue # 72 | # ------------------------------------------ # 73 | 74 | style_axes(axes, len(results[0]['f1'])) 75 | fig.set_size_inches(6 * len(results), 4) 76 | plt.tight_layout() 77 | 78 | return plt 79 | 80 | 81 | def plot_decay1(results, fill=True, titles=None, means=None, reject=False): 82 | # ------------------------------------------ # 83 | # Plotting prologue # 84 | # ------------------------------------------ # 85 | 86 | results = [results] if isinstance(results, dict) else results 87 | titles = titles if titles else [''] * len(results) 88 | means = means if means else [None] * len(results) 89 | 90 | # FIXME | This is all a bit of a naff hack from before the redesign, 91 | # FIXME | when there was a dependency on Pandas, remove as soon as possible 92 | data = defaultdict(lambda: []) 93 | for result in results: 94 | for i in result: 95 | data[i].append(result[i]) 96 | results = [pd.DataFrame(dict(data), index=range(1, len(data['f1_b']) + 1))] 97 | 98 | # End of naffness 99 | 100 | set_style() 101 | fig, axes = plt.subplots(1, len(results)) 102 | 103 | axes = axes if hasattr(axes, '__iter__') else (axes,) 104 | 105 | # ------------------------------------------ # 106 | # Subplots # 107 | # ------------------------------------------ # 108 | 109 | for res, ax, title, mean in zip(results, axes, titles, means): 110 | # plot_prf(ax, res, 0.3, neg=True) 111 | plot_prf(ax, res) 112 | if mean is not None: 113 | plot_cv_mean(ax, mean) 114 | if fill: 115 | fill_under_f1(ax, res) 116 | if reject: 117 | plot_baseline_f1(ax, res) 118 | plot_rej_f1(ax, res) 119 | plot_rejected(ax, res) 120 | ax.set_title(title) 121 | 122 | # Legend 123 | add_legend(axes[0]) 124 | 125 | # ------------------------------------------ # 126 | # Plotting epilogue # 127 | # ------------------------------------------ # 128 | 129 | style_axes(axes, len(results[0]['f1_b'])) 130 | fig.set_size_inches(6 * len(results), 4) 131 | plt.tight_layout() 132 | 133 | return plt 134 | 135 | 136 | def set_style(): 137 | sns.set_context('paper') 138 | sns.set(font='serif') 139 | 140 | sns.set('paper', font='serif', style='ticks', rc={ 141 | 'font.family': 'serif', 142 | 'legend.fontsize': 'medium', 143 | 'xtick.labelsize': 'medium', 144 | 'ytick.labelsize': 'medium', 145 | 'axes.labelsize': 'x-large', 146 | 'axes.titlesize': 'x-large', 147 | 'axes.labelpad': 6.0, 148 | 
'figure.titlesize': 'x-large', 149 | 'text.usetex': True, 150 | 'figure.figsize': (3.6, 4.45), 151 | 'figure.dpi': 1200, 152 | 'savefig.dpi': 1200 153 | }) 154 | 155 | 156 | def style_axes(axes, periods, granularity='Month'): 157 | for i, ax in enumerate(axes): 158 | # Labels 159 | ax.set_xlabel(f'Testing period ({granularity})') # , fontsize=ax_label_size) 160 | # ax.set_ylabel('Score') # , fontsize=ax_label_size) 161 | ax.set_ylabel('') 162 | 163 | # Ticks 164 | ax.set_xticks(range(1, periods + 1)) 165 | ax.set_yticks(np.arange(0, 1.1, 0.1)) 166 | 167 | if periods > 12: 168 | labels = [str(x + 1) if x % 3 == 0 169 | else '' for x in range(periods)] 170 | else: 171 | labels = [str(x + 1) for x in range(periods)] 172 | 173 | ax.set_xticklabels(labels) 174 | 175 | ax.tick_params(axis='x', which='major') # , labelsize=x_tick_size) 176 | ax.tick_params(axis='y', which='major') # , labelsize=y_tick_size) 177 | 178 | ax.yaxis.grid(visible=True, which='major', color='lightgrey', linestyle='-') 179 | 180 | # Axe limits 181 | ax.set_xlim(0.8, periods) 182 | ax.set_ylim(0, 1) 183 | 184 | sns.despine(ax=ax, top=True, right=True, bottom=False, left=False) 185 | 186 | 187 | def plot_baseline_f1(ax, data, alpha=1.0, color='gray', linestyle='--'): 188 | label = 'F1 (no rejection)' 189 | series = data['f1_b'] 190 | ax.plot(data.index + 1, series, label=label, alpha=alpha, linestyle=linestyle, 191 | c=color, markeredgewidth=1, **line_kwargs) 192 | 193 | 194 | def plot_rej_f1(ax, data, alpha=1.0, color='red', marker='o'): 195 | label = 'F1 (rejection)' 196 | series = data['f1_r'] 197 | ax.plot(data.index + 1, series, label=label, alpha=alpha, marker=marker, 198 | c=color, markeredgewidth=1, **line_kwargs) 199 | 200 | 201 | def plot_rejected(ax, data, alpha=0.6, color='#C0C0C0'): 202 | series = data['reject_total_perc'] 203 | ax.bar(data.index + 1, series, width=0.7, color=color, alpha=alpha) 204 | 205 | 206 | def plot_f1(ax, data, alpha=1.0, neg=False, label=None, color='dodgerblue', 207 | marker='o'): 208 | if label is None: 209 | label = 'F1 (gw)' if neg else 'F1 (mw)' 210 | 211 | if neg: 212 | if color=='dodgerblue': 213 | color = '#BCDEFE' 214 | 215 | series = data['f1_n'] if neg else data['f1'] 216 | ax.plot(data.index + 1, series, label=label, alpha=alpha, marker=marker, 217 | c=color, markeredgewidth=1, **line_kwargs) 218 | 219 | 220 | def plot_recall(ax, data, alpha=1.0, neg=False, color='red', marker='^'): 221 | label = 'Recall (gw)' if neg else 'Recall (mw)' 222 | color = '#FDB2B3' if neg else color 223 | series = data['recall_n'] if neg else data['recall'] 224 | ax.plot(data.index + 1, series, label=label, alpha=alpha, 225 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 226 | 227 | 228 | def plot_precision(ax, data, alpha=1.0, neg=False, color='orange', marker='s'): 229 | label = 'Precision (gw)' if neg else 'Precision (mw)' 230 | color = '#FEE2B5' if neg else color 231 | series = data['precision_n'] if neg else data['precision'] 232 | ax.plot(data.index + 1, series, label=label, alpha=alpha, 233 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 234 | 235 | 236 | def fill_under_f1(ax, data, alpha=1, neg=False): 237 | label = 'F1 (gw)' if neg else 'F1 (mw)' 238 | series = data['f1_n'] if neg else data['f1'] 239 | ax.fill_between(data.index + 1, series, 240 | label='AUT({}, 24 months)'.format(label), 241 | alpha=alpha, facecolor='none', hatch='//', 242 | edgecolor='#BCDEFE', rasterized=True) 243 | 244 | 245 | def plot_cv_mean(ax, data, alpha=1): 246 | 
ax.axhline(y=float(data), linestyle='--', linewidth=1, c='red', 247 | alpha=alpha, label='F1 (10-fold CV)') 248 | 249 | 250 | def plot_origin(ax, data, alpha=1): 251 | ax.axhline(y=float(data), linestyle='-.', linewidth=1, c='black', 252 | alpha=alpha, label='F1 (original paper)') 253 | 254 | 255 | def plot_prf(ax, results, alpha=1.0, neg=False): 256 | plot_f1(ax, results, alpha, neg) 257 | plot_recall(ax, results, alpha, neg) 258 | plot_precision(ax, results, alpha, neg) 259 | 260 | 261 | 262 | def add_legend(ax, loc='lower left'): 263 | lines = ax.get_lines() 264 | legend = ax.legend(frameon=True, handles=lines, loc=loc, prop={'size': 8}, # Reduced font size 265 | borderpad=0.5, # Padding inside the legend box 266 | labelspacing=0.5, # Vertical spacing between legend items 267 | handlelength=1, # Length of the legend handles 268 | handletextpad=0.5) # Spacing between handle and text 269 | legend.get_frame().set_facecolor('#FFFFFF') 270 | legend.get_frame().set_linewidth(0) 271 | return legend 272 | 273 | 274 | def save_images(plt, path, plot_name): 275 | plt.tight_layout() 276 | plt.savefig(os.path.join(path, './png/{}.png'.format(plot_name))) 277 | plt.savefig(os.path.join(path, './pdf/{}.pdf'.format(plot_name))) 278 | plt.savefig(os.path.join(path, './eps/{}.eps'.format(plot_name))) 279 | 280 | 281 | def plot_old_f1(ax, data, alpha=1, neg=False, label=None, 282 | color='#C0C0C0', marker=''): 283 | if label is None: 284 | label = 'F1 (gw)' if neg else 'F1 (mw)' 285 | series = data['f1_n'] if neg else data['f1'] 286 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 287 | marker=marker, c=color, markeredgewidth=1, linewidth=2) 288 | 289 | 290 | def plot_old_metric(ax, data, metric, alpha=1, neg=False, label=None, 291 | color='#C0C0C0', marker=''): 292 | if label is None: 293 | label = metric + ' (gw)' if neg else metric + ' (mw)' 294 | label = label.title() 295 | series = data[metric + '_n'] if neg else data[metric] 296 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 297 | marker=marker, c=color, markeredgewidth=1, linewidth=2) 298 | 299 | 300 | def set_title_sc(ax, text): 301 | text = text.replace('%', '\\%') # Make TeX-safe 302 | ax.set_title('\\textsf{{\\textsc{{{}}}}}'.format(text)) 303 | 304 | 305 | def main(): 306 | import pickle as pkl 307 | 308 | results = pkl.load( 309 | open('/Users/mark/Documents/Git/transcend-release/timeseries_cred_conf/ice_p_val_results.p', 'rb')) 310 | plot = plot_decay1(results, reject=True, titles=['ICE default']) 311 | plot.savefig("/Users/mark/Desktop/Tesseract-journal/ICE.pdf") 312 | 313 | 314 | if __name__ == '__main__': 315 | main() 316 | -------------------------------------------------------------------------------- /tesseract/spatial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | spatial.py 5 | ~~~~~~~~~~ 6 | 7 | A module for working with the class balance of a dataset. Ensuring the class 8 | distribution of the testing data is similar to what will be encountered in a 9 | real deployment is imperative to sound evaluations -- particularly in the 10 | security domains. 11 | 12 | Unlike the testing set, the training set is entirely under the operator's 13 | control and class balance can be manipulated in order to over or underrepresent 14 | the positive class in order to achieve greater recall at the expense of 15 | precision (or vice-versa) during the operational phase. 
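A minimal sketch of checking constraint C3 on a set of labels and enforcing a target positive rate if it is violated (the rates are illustrative):

    from tesseract import mock, spatial

    X, y, t = mock.generate_binary_test_data(10000, '2014')
    if not spatial.assert_class_distribution(y, positive_rate=0.1, variance=0.02):
        X, y, t = spatial.downsample_set(
            X, y, t, min_pos_rate=0.08, max_pos_rate=0.12)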
16 | 17 | """ 18 | import copy 19 | import random 20 | 21 | import numpy as np 22 | 23 | import tesseract.metrics as metrics 24 | import tesseract.utils as utils 25 | 26 | 27 | def assert_class_distribution(y, positive_rate, variance): 28 | """Helper function to verify the rate of the positive class across y (C3). 29 | 30 | The testing distribution must reflect the real-world class balance observed 31 | in real-life, otherwise results can be highly inflated (or deflated) with 32 | respect to realistic performance. This function will verify that this 33 | constraint is being respected. 34 | 35 | Args: 36 | y: An array of output class labels y 37 | positive_rate: The acceptable rate for the positive class. 38 | variance: The acceptable deviation (+/-) for the positive rate. 39 | 40 | Returns: 41 | True if the rate of the positive class is acceptable. 42 | 43 | """ 44 | current = np.sum(y) / len(y) 45 | diff = np.abs(current - positive_rate) 46 | return diff <= variance 47 | 48 | 49 | def search_optimal_train_ratio(clf, X_train, y_train, t_train, 50 | proper_train_size, validation_size, granularity, 51 | start_tr_rate=None, end_tr_rate=0.6, step=0.05, 52 | test_noise=0.00, metric='f1'): 53 | """Find the optimal training ratio in order to maximise the given metric. 54 | 55 | This function performs a grid search between start_tr_rate and end_tr_rate, 56 | aiming to maximise the value of the given metric (f1|precision|recall), 57 | while reporting the error rates accumulated at each stage of the algorithm. 58 | 59 | In order to try and pick a training ratio that will be robust to 60 | fluctuations in the testing distribution, it's possible to specify a value 61 | for 'test_noise'. The average-best training ratio across a range of values 62 | between the tr_rates +/- noise will be reported at each stage of the 63 | algorithm. 64 | 65 | This function will be performed by taking an 'actual' training set and 66 | dividing it into a 'proper' training and a 'validation' set. For example, 67 | 12 months of data might be split into 8 months and 4 months. The 4 months 68 | validation aim to simulate the distribution of objects expected after the 69 | known 12 months so that the chosen training ratio will still be effective. 70 | 71 | Note that validation size refers to a single testing period, so to use 4 72 | months in the above example, a value of 1 for validation_size and 'month' 73 | for granularity will divide the remaining objects after the initial 8 74 | selected for training into 1 month chunks to use for validation. 75 | 76 | Args: 77 | clf: The classifier to use during the search. 78 | X_train: The array of predictors to use. 79 | y_train: The array of output labels to use. 80 | t_train: The array of aligned datetimes for X (and therefore y). 81 | proper_train_size: The size of the set to train with. 82 | validation_size: The size of a _single_ validation period. 83 | granularity: The granularity of the testing period (year|month|week|day) 84 | start_tr_rate: The start train rate (typically the natural distribution). 85 | end_tr_rate: The end train date to test (typically 0.5). 86 | step: The learning rate of the grid search. 87 | test_noise: How much noise in the testing ratio to account for. 88 | metric: The metric to maximise (f1|precision|recall). 89 | 90 | Returns: 91 | A dictionary of scores and errors for each tested training ratio. 
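A minimal usage sketch, assuming X_train, y_train, t_train are the training side of a time-aware split (the classifier and window sizes are illustrative):

    from sklearn.svm import LinearSVC

    rates = search_optimal_train_ratio(
        LinearSVC(), X_train, y_train, t_train,
        proper_train_size=8, validation_size=1, granularity='month')
    best_phi = rates['phis'][np.argmax(rates['auts'])]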
92 | 93 | """ 94 | import tesseract.temporal as temporal 95 | import tesseract.evaluation as evaluation 96 | 97 | # Split again to get training and validation sets for finding K 98 | splits = temporal.time_aware_train_test_split( 99 | X_train, y_train, t_train, train_size=proper_train_size, 100 | test_size=validation_size, granularity=granularity) 101 | 102 | aut_list, error_list, fn_list, fp_list, total_list = [], [], [], [], [] 103 | 104 | natural_rate = np.mean([sum(y_val) / len(y_val) for y_val in splits[3]]) 105 | 106 | if start_tr_rate is None: 107 | # Start one step below the natural rate of malware 108 | start_tr_rate = max( 109 | (round(float(natural_rate) / step) * step) - step, 0) 110 | 111 | tr_proportions = np.arange(start_tr_rate, end_tr_rate + step, step) 112 | 113 | mid = np.round(natural_rate, 2) 114 | if test_noise == 0: 115 | te_proportions = (mid,) 116 | else: 117 | te_proportions = np.arange(mid - test_noise, mid + test_noise, 0.01) 118 | 119 | for m in tr_proportions: 120 | X_train_proper, _, \ 121 | y_train_proper, _, \ 122 | t_train_proper, _ = copy.deepcopy(splits) 123 | 124 | # Downsample training to match percentage of malware n 125 | train_idxs = downsample_to_rate(y_train_proper, m) 126 | 127 | X_train = X_train_proper[train_idxs] 128 | y_train = y_train_proper[train_idxs] 129 | t_train = t_train_proper[train_idxs] 130 | 131 | # Alter ratio of malware in testing periods 132 | errors, auts, total = [], [], [] 133 | fps, fns = [], [] 134 | for n in te_proportions: 135 | 136 | _, X_validations, \ 137 | _, y_validations, \ 138 | __, t_validations = copy.deepcopy(splits) 139 | 140 | for i, _ in enumerate(y_validations): 141 | val_idxs = downsample_to_rate(y_validations[i], n) 142 | X_validations[i] = X_validations[i][val_idxs] 143 | y_validations[i] = y_validations[i][val_idxs] 144 | t_validations[i] = t_validations[i][val_idxs] 145 | 146 | # Compute results 147 | results = evaluation.fit_predict_update(clf, X_train, X_validations, 148 | y_train, y_validations, 149 | t_train, t_validations) 150 | 151 | fps.append(np.sum(results['fp'])) 152 | fns.append(np.sum(results['fn'])) 153 | total.append(np.sum(results['p']) + np.sum(results['n'])) 154 | errors.append(metrics.error_rate(results, metric)) 155 | auts.append(metrics.aut(results, metric)) 156 | 157 | # print(m, np.mean(total), np.mean(errors), np.mean(auts)) 158 | error_list.append(np.mean(errors)) 159 | aut_list.append(np.mean(auts)) 160 | fp_list.append(np.mean(fps)) 161 | fn_list.append(np.mean(fns)) 162 | total_list.append(np.mean(total)) 163 | 164 | return { 165 | 'errors': error_list, 166 | 'auts': aut_list, 167 | 'phis': tr_proportions, 168 | 'fn': fn_list, 169 | 'fp': fp_list, 170 | 'total': total_list 171 | } 172 | 173 | 174 | def find_optimal_train_ratio(clf, X_train, y_train, t_train, 175 | proper_train_size, validation_size, granularity, 176 | start_tr_rate=None, end_tr_rate=0.6, step=0.05, 177 | test_noise=0.00, metric='f1', acceptable_errors=0): 178 | """Given an acceptable threshold for errors, find the optimal train ratio. 179 | 180 | NOTE: The output of the search function that this wraps has undergone quite 181 | a few tweaks in terms of input and output, and at least until the full 182 | release of the library, this implementation should be considered a 183 | prototype (mileage may vary!). 184 | 185 | Args; 186 | clf: The classifier to use during the search. 187 | X_train: The array of predictors to use. 188 | y_train: The array of output labels to use. 
189 | t_train: The array of aligned datetimes for X (and therefore y). 190 | proper_train_size: The size of the set to train with. 191 | validation_size: The size of a _single_ validation period. 192 | granularity: The granularity of the testing period (year|month|week|day) 193 | start_tr_rate: The start train rate (typically the natural distribution). 194 | end_tr_rate: The end train date to test (typically 0.5). 195 | step: The learning rate of the grid search. 196 | test_noise: How much noise in the testing ratio to account for. 197 | metric: The metric to maximise (f1|precision|recall). 198 | acceptable_errors: The threshold of acceptable errors. 199 | 200 | Returns: 201 | tuple: The optimal discovered ratio, it's AUT and error rate. 202 | 203 | """ 204 | rates = search_optimal_train_ratio( 205 | clf, X_train, y_train, t_train, proper_train_size, 206 | validation_size, granularity, start_tr_rate, end_tr_rate, 207 | step, test_noise, metric) 208 | 209 | phis, auts, errors = rates['phis'], rates['auts'], rates['errors'] 210 | 211 | for i in np.argsort(auts)[::-1]: 212 | if errors[i] <= acceptable_errors: 213 | return phis[i], auts[i], errors[i] 214 | 215 | print('Warning: No training rate found that allows acceptable error rate') 216 | return None 217 | 218 | 219 | def downsample_set(X, y, t, min_pos_rate, max_pos_rate=None, 220 | noise_deviation=0.0, fixed_size=False): 221 | """Enforce a class distribution by downsampling. 222 | 223 | Args: 224 | X: The array of predictors to use. 225 | y: The array of output labels to use. 226 | t: The array of aligned datetimes for X (and therefore y). 227 | min_pos_rate: The minimum proportion of the positive class acceptable. 228 | max_pos_rate: The maximum proportion of the positive class acceptable. 229 | noise_deviation: Addition of noise either side of the given proportions. 230 | fixed_size: Whether to fix the total size of X to the size of the 231 | minimum class. 232 | 233 | Returns: 234 | tuple: A resized X, y and t 235 | """ 236 | new_idxs = downsample_to_rate(y, min_pos_rate, max_pos_rate, 237 | noise_deviation, fixed_size) 238 | return X[new_idxs], y[new_idxs], t[new_idxs] 239 | 240 | 241 | def downsample_to_rate(y, min_pos_rate, max_pos_rate=None, 242 | noise_deviation=0.0, fixed_size=False): 243 | """Enforce a class distribution by downsampling. 244 | 245 | Args: 246 | y: The array of output labels to use. 247 | min_pos_rate: The minimum proportion of the positive class acceptable. 248 | max_pos_rate: The maximum proportion of the positive class acceptable. 249 | noise_deviation: Addition of noise either side of the given proportions. 250 | fixed_size: Whether to fix the total size of X to the size of the 251 | minimum class. 252 | 253 | Returns: 254 | An array of selected indexes. 
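For example, to enforce roughly a 10% positive rate within a single test period (the values are illustrative):

    idxs = downsample_to_rate(y_test, min_pos_rate=0.1)
    X_test, y_test, t_test = X_test[idxs], y_test[idxs], t_test[idxs]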
255 | 256 | """ 257 | # random.seed(33) 258 | if max_pos_rate is None: 259 | max_pos_rate = min_pos_rate 260 | 261 | min_pos_rate = utils.resolve_percentage(min_pos_rate) 262 | max_pos_rate = utils.resolve_percentage(max_pos_rate) 263 | 264 | if not (0 <= min_pos_rate <= 1 or 0 <= max_pos_rate <= 1): 265 | raise ValueError( 266 | 'Please supply a proportion in the interval [0, 1]') 267 | 268 | n_pos, n_neg = np.sum(y), np.sum(y == 0) 269 | 270 | # Fix the training set size while downsampling to minority class size 271 | if fixed_size: 272 | n_tot = min(n_pos, n_neg) 273 | else: 274 | n_tot = n_pos + n_neg 275 | 276 | current_pos_perc = float(n_pos) / float(n_tot) 277 | 278 | if current_pos_perc < min_pos_rate: 279 | pos_perc = min_pos_rate 280 | elif current_pos_perc > max_pos_rate: 281 | pos_perc = max_pos_rate 282 | else: # min_pos <= current_pos_perc <= max_pos: 283 | neg_indexes = np.where(y == 0)[0] 284 | pos_indexes = np.where(y == 1)[0] 285 | return np.hstack((neg_indexes, pos_indexes)) 286 | 287 | pos_perc += np.random.normal(0, noise_deviation) 288 | 289 | # print("Starting downsampling {:.1f}% malware function: n_gw = {:,} ; n_mw = {:,} ; n_tot = {:,}".format(perc_mw*100, n_gw, n_mw, n_tot)) 290 | 291 | can_downsample_pos = True 292 | can_downsample_neg = True 293 | 294 | # First, try downsampling goodware 295 | if fixed_size: 296 | n_neg_to_choose = int((1 - pos_perc) * n_tot) 297 | else: 298 | n_neg_to_choose = int( 299 | (float(1 - pos_perc) / float(pos_perc)) * n_pos) 300 | 301 | if n_neg_to_choose > n_neg: 302 | n_neg_to_choose = n_neg 303 | can_downsample_neg = False 304 | # print("Failed to downsample goodware, since: n_gw_to_pick ({}) > n_gw ({})".format(n_gw_to_pick, n_gw)) 305 | 306 | # updating the value n_tot after downsampling the goodware 307 | 308 | if fixed_size: 309 | n_pos_to_choose = int(pos_perc * n_tot) 310 | else: 311 | n_pos_to_choose = int( 312 | (float(pos_perc) / float(1 - pos_perc)) * n_neg) 313 | 314 | if n_pos_to_choose > n_pos: 315 | can_downsample_pos = False 316 | # print("Cannot oversample malware to {:.1f}% of {:,}!".format(perc_mw*100, n_tot)) 317 | 318 | # elif n_mw_to_pick < n_pos: 319 | # print("Downsampled malware to {:.1f}% (n_mw = {:,}, n_mw_to_pick = {:,})".format(perc_mw*100, n_mw, n_mw_to_pick)) 320 | 321 | # import IPython; IPython.embed(); exit() 322 | 323 | # print("After downsampling: n_gw = {:,} ; n_mw = {:,} ; n_tot = {:,}".format(n_gw_to_pick, n_mw_to_pick, n_gw_to_pick+n_mw_to_pick)) 324 | 325 | neg_indexes = np.where(y == 0)[0] 326 | pos_indexes = np.where(y == 1)[0] 327 | 328 | neg_idx_subsample, pos_idx_subsample = neg_indexes, pos_indexes 329 | 330 | # Downsample goodware 331 | if can_downsample_neg: 332 | neg_idx_subsample = random.sample(list(neg_indexes), 333 | n_neg_to_choose) 334 | 335 | if can_downsample_pos: 336 | pos_idx_subsample = random.sample(list(pos_indexes), 337 | n_pos_to_choose) 338 | 339 | if not (can_downsample_neg or can_downsample_pos): 340 | raise Exception("Downsampling failed") 341 | 342 | sampled = np.hstack((np.array(neg_idx_subsample), 343 | np.array(pos_idx_subsample))) 344 | 345 | return np.array(sampled, dtype=int) 346 | -------------------------------------------------------------------------------- /tesseract/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | metrics.py 5 | ~~~~~~~~~~ 6 | 7 | A set of measurement tools to aid users designing time-aware experiments. 
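A minimal sketch of the typical flow, assuming `results` was produced by evaluation.fit_predict_update (the metric names are illustrative):

    from tesseract import metrics

    metrics.print_metrics(results)
    print(metrics.aut(results, 'f1'))          # robustness over the full timeline
    print(metrics.error_rate(results, 'f1'))   # overall misclassification rate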
8 | 9 | """ 10 | from collections import defaultdict 11 | 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn import metrics as skmetrics 16 | from sklearn.linear_model import LinearRegression 17 | from sklearn.metrics import confusion_matrix 18 | from sklearn.preprocessing import LabelEncoder 19 | 20 | from tesseract import utils, temporal 21 | 22 | 23 | def t_slope(metric): 24 | """Compute the slope with respect to the given metric. 25 | 26 | Args: 27 | metric: The metric to operate with respect to. 28 | 29 | Returns: 30 | float: A measure of the trend in the given time range 31 | 32 | """ 33 | reg = LinearRegression() 34 | reg.fit(np.arange(1, len(metric)+1).reshape(-1, 1), np.array(metric).reshape(-1, 1)) 35 | 36 | return reg.coef_[0][0] 37 | 38 | 39 | def aut(results, metric=None, s_idn=None, e_ind=None): 40 | """Compute the AUT with respect to the given metric. 41 | 42 | Note that for results spanning a _single_ time period, AUT = 0 as this is 43 | not considered a time-aware evaluation. 44 | 45 | Args: 46 | results: The set of time-aware results to operate over. 47 | metric: The metric to operate with respect to granularity. 48 | s_idn: The index to start aut evaluation, default is beginning of results. 49 | e_ind: The index to end aut evaluation, default is end of results. 50 | Returns: 51 | float: A measure of robustness for the applied model over the time 52 | spanning the results. 53 | 54 | """ 55 | if isinstance(results, dict) or isinstance(results, pd.DataFrame): 56 | results = results[metric][s_idn:e_ind] 57 | 58 | if len(results) <= 1: 59 | return 0 60 | 61 | return np.trapz(results) / (len(results) - 1) 62 | 63 | 64 | def aut_with_observation_window(results, metric=None, window=None): 65 | """Compute the AUT with respect to the given metric broken down by a window size. 66 | 67 | Note that for results spanning a _single_ time period, AUT = 0 as this is 68 | not considered a time-aware evaluation. 69 | 70 | Args: 71 | results: The set of time-aware results to operate over. 72 | metric: The metric to operate with respect to granularity. 73 | window: The size of window to break aut down into. Eval period mod window must be 0 74 | Returns: 75 | list: A list of aut measures for the applied model over the time 76 | spanning the results, split into the window size. 77 | 78 | """ 79 | return [aut(results, metric, s_idn=w*window, e_ind=(1+w)*window) for w in range(len(results[metric])//window)] 80 | 81 | 82 | def aut_with_granularity(results, granularity, metric=None): 83 | """Compute the AUT with respect to the given metric. 84 | 85 | Note that for results spanning a _single_ time period, AUT = 0 as this is 86 | not considered a time-aware evaluation. 87 | 88 | Args: 89 | granularity: 90 | results: The set of time-aware results to operate over. 91 | metric: The metric to operate with respect to. 92 | 93 | Returns: 94 | float: A measure of robustness for the applied model over the time 95 | spanning the results. 
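A minimal sketch, assuming the results dictionary also carries the raw per-period outputs under 'y_tests', 'y_preds' and 't_tests' (which this function expects):

    # AUT(F1) recomputed at weekly granularity
    weekly = aut_with_granularity(results, granularity='week', metric='f1')

    # AUT(F1) over consecutive 6-period observation windows
    # (the number of test periods should be divisible by the window size)
    windows = aut_with_observation_window(results, metric='f1', window=6)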
96 | 97 | """ 98 | metric = {"f1": skmetrics.f1_score, 99 | "accuracy": skmetrics.accuracy_score, 100 | "precision": skmetrics.precision_score, 101 | "recall": skmetrics.recall_score, 102 | }[metric] 103 | 104 | if len(results["t_tests"]) <= 1: 105 | return 0 106 | 107 | y_tests = results["y_tests"] 108 | y_preds = results["y_preds"] 109 | t_tests = results["t_tests"] 110 | 111 | results = aut_granularity_split(y_tests, y_preds, t_tests, granularity, metric) 112 | 113 | return np.trapz(results) / (len(results) - 1) 114 | 115 | 116 | def aut_granularity_split(y_tests, y_preds, t_tests, granularity, metric, test_size=1): 117 | results = [] 118 | y_tests = np.concatenate(y_tests, axis=None) 119 | y_preds = np.concatenate(y_preds, axis=None) 120 | t_tests = np.concatenate(t_tests, axis=None) 121 | 122 | _, idxes = temporal.time_aware_indexes(t_tests, 0, test_size, granularity) 123 | for idx in idxes: 124 | y_test = y_tests[idx] 125 | y_pred = y_preds[idx] 126 | result = metric(y_test, y_pred) 127 | results.append(result) 128 | 129 | return results 130 | 131 | 132 | def error_rate(results, metric='f1'): 133 | """Return the error rate formulation as it relates to the given metric. 134 | 135 | Args: 136 | results: The set of time-aware evaluation results to operate over. 137 | metric: The metric to operate with respect to (f1|precision|recall). 138 | 139 | Returns: 140 | float: The rate representing error for the given metric. 141 | 142 | """ 143 | return { 144 | 'f1': errors(results) / (np.sum(results['p']) + np.sum(results['n'])), 145 | 'precision': np.sum(results['fn']) / ( 146 | np.sum(results['tp']) + np.sum(results['fn'])), 147 | 'recall': np.sum(results['fp']) / ( 148 | np.sum(results['tn']) + np.sum(results['fp'])), 149 | }[metric] 150 | 151 | 152 | def errors(results): 153 | """Return the total misclassifications in the results.""" 154 | return np.sum(results['fn']) + np.sum(results['fp']) 155 | 156 | 157 | def plot_alpha_assessment(alpha_assessment_results, outfile=None): 158 | fig = plt.figure() 159 | ax = fig.add_subplot(111) 160 | ax.boxplot((alpha_assessment_results['negative_predictions']['correct'], 161 | alpha_assessment_results['negative_predictions']['incorrect'], 162 | alpha_assessment_results['positive_predictions']['correct'], 163 | alpha_assessment_results['positive_predictions']['incorrect'])) 164 | ax.set_xticklabels(('Neg C', 'Neg IC', 'Pos C', 'Pos IC')) 165 | 166 | if outfile: 167 | plt.savefig(outfile) 168 | else: 169 | plt.show() 170 | 171 | return fig, ax 172 | 173 | 174 | # def plot_results(results, outfile=None, fields=None, title='Scores over time', 175 | # quiet=False): 176 | # if not quiet: 177 | # logging.info(results) 178 | # if outfile: 179 | # results.to_csv(os.path.splitext(outfile)[0] + '.csv') 180 | # 181 | # if fields is None: 182 | # fields = ['f1', 'precision', 'recall', 183 | # 'f1_n', 'precision_n', 'recall_n'] 184 | # 185 | # colors = ('#F2385A', '#F5A503', '#4AD9D9', 186 | # '#FF9999', '#FFDD99', '#AAEEEE') 187 | # ax = results[fields].plot(linestyle='--', marker='o', color=colors) 188 | # 189 | # plt.title(title) 190 | # ax.set_xlabel('Testing round') 191 | # ax.set_ylabel('Score') 192 | # ax.set_ylim([0, 1]) 193 | # ax.set_yticks(np.arange(0, 1.1, 0.1)) 194 | # ax.set_xticks(results.index) 195 | # ax.grid('on', which='major', linestyle=':', axis='y') 196 | # plt.tight_layout() 197 | # 198 | # if outfile: 199 | # plt.savefig(outfile) 200 | # else: 201 | # plt.show() 202 | # 203 | # return ax 204 | 205 | 206 | # def plot_by_time(y, t, 
granularity='month', type='line', outfile=None): 207 | # df = pd.DataFrame(y, columns=['positive'], index=t) 208 | # df['negative'] = [1 ^ x for x in df['positive']] 209 | # 210 | # try: 211 | # offset_alias = { 212 | # 'year': '1Y', 213 | # 'quarter': '1Q', 214 | # 'month': '1M', 215 | # 'week': '1W', 216 | # 'day': '1D' 217 | # }[granularity] 218 | # except KeyError: 219 | # # Allow a specific offset alias to be passed in 220 | # offset_alias = granularity 221 | # 222 | # df = df.resample(offset_alias).sum() 223 | # 224 | # colors = ('#cc0000', '#66b3ff') 225 | # plot_fn = df.plot.bar if type == 'bar' else df.plot 226 | # ax = plot_fn(color=colors, marker='o', linestyle='--') 227 | # 228 | # plt.title('Frequency of class membership by {}'.format(granularity)) 229 | # ax.set_xlabel('{}(s)'.format(granularity)) 230 | # ax.set_ylabel('Frequency') 231 | # ax.grid('on', which='major', linestyle=':', axis='y') 232 | # plt.tight_layout() 233 | # 234 | # if outfile: 235 | # plt.savefig(outfile) 236 | # else: 237 | # plt.show() 238 | # 239 | # return ax 240 | 241 | 242 | def summarize(y): 243 | positive = sum(y) 244 | negative = len(y) - positive 245 | print('Class counts:') 246 | print('-' * 20) 247 | print('negative: {}'.format(negative)) 248 | print('positive: {}'.format(positive)) 249 | print('\nTotal objects:') 250 | print('-' * 20) 251 | print('{} ({:.04}% positive)'.format(len(y), positive / len(y) * 100)) 252 | 253 | 254 | def get_train_info(X_train, y_train, t_train, existing=None): 255 | # Ensure results are a defaultdict(list) 256 | 257 | results = defaultdict(list, existing) if existing else defaultdict(list) 258 | 259 | # Ensure label array is a numpy array 260 | 261 | y_train = np.array(y_train) 262 | 263 | train_pos = np.sum(y_train) 264 | 265 | results['train_pos'].append(train_pos) 266 | results['train_neg'].append(len(y_train) - train_pos) 267 | results['train_tot'].append(len(y_train)) 268 | 269 | return results 270 | 271 | 272 | def calculate_metrics(y_true, y_pred, existing=None, 273 | raw_scores=None, periods=1): 274 | periods = len(y_pred) if periods == -1 else periods 275 | 276 | if periods > 1: 277 | for y_t, y_p in zip(y_true, y_pred): 278 | existing = calculate_metrics(y_t, y_p, existing, raw_scores) 279 | return existing 280 | 281 | # Ensure results are a defaultdict(list) 282 | 283 | results = defaultdict(list, existing) if existing else defaultdict(list) 284 | 285 | # Ensure both label vectors are Numpy arrays 286 | 287 | y_true = np.array(y_true) 288 | y_pred = np.array(y_pred) 289 | 290 | # Heuristic to check if input are raw scores 291 | 292 | y_raw = None 293 | if (raw_scores or 294 | (raw_scores is None and 295 | utils.check_for_raw_scores(y_pred))): 296 | y_raw = y_pred 297 | 298 | # Convert output scores or categorical labels to integer labels 299 | 300 | y_pred = utils.resolve_categorical(y_pred) 301 | y_true = utils.resolve_categorical(y_true) 302 | 303 | # Ensure labels are encoded as integer labels 304 | 305 | if isinstance(y_pred[0], str): 306 | if isinstance(y_true[0], str): 307 | try: 308 | y_pred = np.array(y_pred, dtype='int32') 309 | y_true = np.array(y_true, dtype='int32') 310 | except ValueError: 311 | enc = LabelEncoder().fit(y_true) 312 | y_true = enc.transform(y_true) 313 | y_pred = enc.transform(y_pred) 314 | else: 315 | try: 316 | y_pred = np.array(y_pred, dtype='int32') 317 | except ValueError: 318 | y_pred = LabelEncoder().fit_transform(y_pred) 319 | 320 | assert len(set(y_true)) <= 2 and len(set(y_pred)) <= 2 321 | 322 | # Update total 
positive and negative predictions 323 | 324 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=(0, 1)).ravel() 325 | p = tp + fn 326 | n = tn + fp 327 | 328 | results['tp'].append(tp) 329 | results['fp'].append(fp) 330 | results['tn'].append(tn) 331 | results['fn'].append(fn) 332 | 333 | results['p'].append(p) 334 | results['n'].append(n) 335 | results['tot'].append(p + n) 336 | 337 | # Update cumulative totals 338 | 339 | results['tp_cumu'].append(np.sum(results['tp'])) 340 | results['fp_cumu'].append(np.sum(results['fp'])) 341 | results['tn_cumu'].append(np.sum(results['tn'])) 342 | results['fn_cumu'].append(np.sum(results['fn'])) 343 | 344 | results['p_cumu'].append(np.sum(results['p'])) 345 | results['n_cumu'].append(np.sum(results['n'])) 346 | results['tot_cumu'].append(np.sum(results['tot'])) 347 | 348 | # Update true/false positive/negative rates 349 | 350 | if p == 0: 351 | results['tpr'].append(np.nan) 352 | results['fnr'].append(np.nan) 353 | else: 354 | results['tpr'].append(tp / p) 355 | results['fnr'].append(fn / p) 356 | 357 | if n == 0: 358 | results['fpr'].append(np.nan) 359 | results['tnr'].append(np.nan) 360 | else: 361 | results['fpr'].append(fp / n) 362 | results['tnr'].append(tn / n) 363 | 364 | # Calculate AUC-ROC if raw scores have been supplied 365 | 366 | if y_raw is not None: 367 | 368 | # Some classifiers output with a score/prob for each class, this 369 | # simply includes only the score/prob of the predicted class as 370 | # skmetrics.roc_auc_score expects both inputs to be the same shape 371 | if y_raw.shape != y_true.shape: 372 | y_scores = np.array([np.max(v) for v in y_raw]) 373 | else: 374 | y_scores = y_raw 375 | 376 | try: 377 | results['auc_roc'].append(skmetrics.roc_auc_score(y_true, y_scores)) 378 | except ValueError as e: 379 | print(e) 380 | results['auc_roc'].append(np.nan) 381 | 382 | # Calculate precision, recall and F1 wrt positive and negative classes 383 | 384 | results['precision'].append( 385 | skmetrics.precision_score(y_true, y_pred, pos_label=1)) 386 | results['recall'].append( 387 | skmetrics.recall_score(y_true, y_pred, pos_label=1)) 388 | results['f1'].append(skmetrics.f1_score(y_true, y_pred, pos_label=1)) 389 | 390 | results['precision_n'].append( 391 | skmetrics.precision_score(y_true, y_pred, pos_label=0)) 392 | results['recall_n'].append( 393 | skmetrics.recall_score(y_true, y_pred, pos_label=0)) 394 | results['f1_n'].append(skmetrics.f1_score(y_true, y_pred, pos_label=0)) 395 | 396 | return results 397 | 398 | 399 | def print_metrics(results, keys=None, header=True): 400 | if keys is None: 401 | keys = [ 402 | ('Actual pos', 'p'), 403 | ('Actual neg', 'n'), 404 | ('Total', 'tot'), 405 | ('hline', 'hline'), 406 | ('TPR', 'tpr'), 407 | ('FPR', 'fpr'), 408 | ('TNR', 'tnr'), 409 | ('FNR', 'fnr'), 410 | ('AUC ROC', 'auc_roc'), 411 | ('hline', 'hline'), 412 | ('Precision', 'precision'), 413 | ('Recall', 'recall'), 414 | ('F1', 'f1'), 415 | ('hline', 'hline')] 416 | else: 417 | if isinstance(keys[0], str): 418 | keys = [(k.title(), k) for k in keys] 419 | 420 | periods = max(len(v) for v in results.values()) 421 | 422 | def print_hline(): 423 | print(('-' * 12) + '+' + ('-' * 7 * periods)) 424 | 425 | if header: 426 | header = '{:12}| '.format('Test period') 427 | header += ''.join(['{:^7}'.format(i) for i in range(1, periods + 1)]) 428 | print_hline() 429 | print(header) 430 | print_hline() 431 | 432 | for label, key in keys: 433 | if label == 'hline': 434 | print_hline() 435 | 436 | elif results[key]: 437 | row = 
'{:12}|'.format(label) 438 | for result in results[key]: 439 | if isinstance(result, float): 440 | row += '{:>7.3f}'.format(result) 441 | else: 442 | row += '{:>7}'.format(result) 443 | print(row) 444 | 445 | else: 446 | pass # Silently skip missing keys 447 | 448 | # def cumulative(results, metric): 449 | # if metric not in ('f1', 'precision', 'recall', 450 | # 'f1_n', 'precision_n', 'recall_n'): 451 | # return np.cumsum(results[metric]) 452 | # 453 | # tps = np.cumsum(results['tp']) 454 | # tns = np.cumsum(results['tn']) 455 | # fps = np.cumsum(results['fp']) 456 | # fns = np.cumsum(results['fn']) 457 | # 458 | # precision = tps / (tps + fps) 459 | # recall = tps / (tps + fns) 460 | # f1 = 2 * precision * recall / (precision + recall) 461 | # 462 | # if metric == 'f1': 463 | # return f1 464 | # if metric == 'precision': 465 | # return precision 466 | # if metric == 'recall': 467 | # return recall 468 | # 469 | # precision_n = tns / (tns + fns) 470 | # recall_n = tns / (tns + fps) 471 | # f1_n = 2 * precision_n * recall_n / (precision_n + recall_n) 472 | # 473 | # if metric == 'f1_n': 474 | # return f1_n 475 | # if metric == 'precision_n': 476 | # return precision_n 477 | # if metric == 'recall_n': 478 | # return recall_n 479 | -------------------------------------------------------------------------------- /tesseract/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | evaluation.py 5 | ~~~~~~~~~~~~~ 6 | 7 | 8 | 9 | """ 10 | import multiprocessing as mp 11 | 12 | import numpy as np 13 | import scipy.sparse 14 | from sklearn.model_selection import train_test_split 15 | from tqdm import tqdm 16 | 17 | from tesseract import utils as utils, metrics as metrics, temporal 18 | from tesseract.transcendent import * 19 | 20 | 21 | class Stage: 22 | """Parent class representing stage of the time-aware evaluation cycle. 23 | 24 | The time-aware evaluation cycle is divided into stages, offering the 25 | ability for the system designer to interact with the classification 26 | process. The stages can generally be thought of as the following: 27 | 28 | * Rebalancing: Alterations can be made to the training set composition. 29 | * Training: The classifier is fit to the training data. 30 | * Prediction: Labels are predicted by the classifier. 31 | * Rejection: Low-quality predictions can be discarded/quarantined. 32 | * Selection: Test objects can be selected and added to the training. 33 | 34 | The rebalancing, prediction and selection stages can all be implemented by 35 | subclassing Stage or its children. 36 | 37 | Subclasses of Stage can be coupled together with Stages of the same type, 38 | for example, tesseract.evaluation.fit_predict_update accepts lists of 39 | Rejectors which will be activated in order during the rejection 'stage' of 40 | the evaluation cycle. To determine whether a Stage is activated during that 41 | cycle, it contains a schedule. 42 | 43 | A schedule is simply a list of booleans, the length of the total periods 44 | expected during that cycle; the Stage is active if the index of the 45 | schedule for that period is True. Some special values exist which will be 46 | resolved to valid schedules: 47 | 48 | * 'first': Activate on the first cycle only. 49 | * 'last': Activate on the last cycle only. 50 | * 1: Activate every cycle. 51 | * 0: Never activate. 
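
    For example (a minimal sketch using the base class directly; in practice
    the schedule is resolved automatically by fit_predict_update):

        >>> stage = Stage(schedule='first')
        >>> stage.resolve_schedule(3)
        >>> stage.schedule
        [True, False, False]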
52 | 53 | These settings don't require the total number of test periods to be known 54 | in advance, the schedule will be resolved once fit_predict_update has been 55 | called, by checking the X_tests parameter. 56 | 57 | Attributes: 58 | schedule (list): A list of booleans indicating when the Stage should be 59 | active during the evaluation cycle. 60 | 61 | """ 62 | 63 | def __init__(self, schedule=1): 64 | self.schedule = schedule 65 | 66 | def resolve_schedule(self, total_periods): 67 | """Produces a valid schedule for the total periods specified. 68 | 69 | A schedule is a list of booleans, the length of the total periods 70 | expected during that cycle; the Stage is active if the index of the 71 | schedule for that period is True. 72 | 73 | Some special values exist which will be resolved to valid schedules: 74 | 75 | * 'first': Activate on the first cycle only. 76 | * 'last': Activate on the last cycle only. 77 | * 1: Activate every cycle. 78 | * 0: Never activate. 79 | 80 | """ 81 | if self.schedule == 'first': 82 | self.schedule = [True] + [False] * (total_periods - 1) 83 | elif self.schedule == 'last': 84 | self.schedule = [False] * (total_periods - 1) + [True] 85 | elif self.schedule in (1, '1'): 86 | self.schedule = [True] * total_periods 87 | elif self.schedule in (0, '0'): 88 | self.schedule = [False] * total_periods 89 | elif hasattr(self.schedule, '__iter__'): 90 | self.schedule = [int(x) == 0 for x in self.schedule] 91 | else: 92 | raise ValueError('Schedule `{}` cannot be understood.'.format( 93 | self.schedule)) 94 | 95 | 96 | class TrackingStage(Stage): 97 | """ 98 | 99 | """ 100 | 101 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 102 | super().__init__(schedule=schedule) 103 | 104 | self._interactions = ('intersection', 'union', 'sym_diff', 'ignore') 105 | 106 | self.tracking = tracking 107 | self.interaction = interaction 108 | 109 | if interaction not in self._interactions: 110 | raise ValueError('Interaction mode must be one of {}'.format( 111 | self._interactions)) 112 | 113 | def merge_results(self, past, present): 114 | # Case for first test period in a cycle 115 | # (distinct from when past is an empty array) 116 | if past is None: 117 | return present 118 | 119 | if self.interaction == 'union': 120 | return np.union1d(past, present) 121 | elif self.interaction == 'intersection': 122 | return np.intersect1d(past, present) 123 | elif self.interaction == 'sym_diff': 124 | return np.setxor1d(past, present) 125 | 126 | 127 | def fit_predict_update(clf, X_train, X_tests, 128 | y_train, y_tests, t_train, t_tests, 129 | fit_function=None, predict_function=None, 130 | rebalancers=(), rejectors=(), selectors=()): 131 | """Sliding window classification of a timestamp partitioned dataset. 132 | 133 | This function assumes that the dataset has been partitioned into 134 | historically coherent training and testing sets such that all objects in 135 | the training set are historically anterior to all objects in the testing 136 | sets, and in each testing set i, all objects in the set are historically 137 | anterior to all objects in testing set i + 1. 138 | 139 | The set of testing objects X_tests is split into a series of rolling 140 | testing windows (as are the corresponding y_tests). Each round of 141 | prediction is performed on the next test partition in the series. 
142 | 143 | This arrangement is depicted here with the parameters: 144 | 145 | * Training dataset size: 6 months 146 | * Testing dataset size: 2 months 147 | * Date range of the dataset: 12 months (Jan - Dec) 148 | 149 | Months tagged ■ are included in the training dataset. 150 | Months tagged □ are included in the testing dataset. 151 | Months tagged ▣ are included in training dataset but the results from the 152 | previous round of testing are concatenated to the latest results. 153 | 154 | Rolling testing 155 | --------------- 156 | 157 | Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 158 | 1 ■ ■ ■ ■ ■ ■ □ □ 159 | 2 ■ ■ ■ ■ ■ ■ □ □ 160 | 3 ■ ■ ■ ■ ■ ■ □ □ 161 | 162 | Example: 163 | >>> from sklearn.svm import LinearSVC 164 | >>> from tesseract import mock, temporal, evaluation 165 | >>> 166 | >>> X, y, t = mock.generate_binary_test_data(10000, '2000') 167 | >>> 168 | >>> splits = temporal.time_aware_train_test_split( 169 | >>> X, y, t, train_size=6, test_size=2, granularity='month') 170 | >>> 171 | >>> clf = LinearSVC() 172 | >>> 173 | >>> results = evaluation.fit_predict_update(clf, *splits) 174 | 175 | For comparison, here's the same set of parameters combined with 176 | a FullRetrainingSelector to achieve incremental retraining at each 177 | testing period: 178 | 179 | Rolling testing, incremental retraining 180 | --------------------------------------- 181 | 182 | Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 183 | 1 ■ ■ ■ ■ ■ ■ □ □ 184 | 2 ■ ■ ■ ■ ■ ■ ■ ■ □ □ 185 | 3 ■ ■ ■ ■ ■ ■ ■ ■ ■ ■ □ □ 186 | 187 | Example: 188 | >>> from tesseract.selection import FullRetrainingSelector 189 | >>> 190 | >>> results = evaluation.fit_predict_update( 191 | >>> clf, *splits, selectors=[FullRetrainingSelector()]) 192 | selectors=[ActiveLearningSelector()] 193 | 194 | The time-aware evaluation cycle is divided into stages, offering the 195 | ability for the system designer to interact with the classification 196 | process. The stages can generally be thought of as the following: 197 | 198 | * Rebalancing: Alterations can be made to the training set composition. 199 | * Training: The classifier is fit to the training data. 200 | * Prediction: Labels are predicted by the classifier. 201 | * Rejection: Low-quality predictions can be discarded/quarantined. 202 | * Selection: Test objects can be selected and added to the training. 203 | 204 | This cycle repeats for each testing period. The rebalancing, prediction 205 | and selection stages are each triggered by passing in lists of Rebalancer, 206 | Rejector or Selector objects respectively. These are then invoked 207 | (in order) at the appropriate stages in the training phase. Stages can be 208 | switched on and off for certain testing periods by passing them a 209 | schedule and the way they interact with previous stages of the same type 210 | can also be controlled. 211 | 212 | Fitting will use the fit() method of the classifier while prediction will 213 | try to resolve the most appropriate one for the classifier (either to 214 | produce output labels or raw scores). This behaviour can be overridden by 215 | passing a function to fit_function or predict_function. 
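
    For example (a hedged sketch, assuming a classifier that implements
    predict_proba, such as RandomForestClassifier; the 0.7 threshold is purely
    illustrative), a custom prediction function can apply a stricter decision
    threshold while still returning integer labels:

        >>> results = evaluation.fit_predict_update(
        >>>     clf, *splits,
        >>>     predict_function=lambda X: (
        >>>         clf.predict_proba(X)[:, 1] >= 0.7).astype(int))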
216 | 217 | The form of these functions must maintain the following contract: 218 | 219 | * fit_function(X_train, y_train) 220 | * y_pred = predict_function(X_test) 221 | 222 | Note, there are plans to improve the rudimentary predict-function-detection 223 | and to perhaps replace the fit_function and predict_function parameters 224 | with Fitter and Predictor objects which would allow for greater control. 225 | 226 | Args: 227 | clf: A scikit-learn or Keras classifier with fit and predict methods. 228 | X_train (np.ndarray): Training partition of predictors X. 229 | X_tests (list): List of testing partitions of predictors X. 230 | y_train (np.ndarray): Training partition of output variables y. 231 | y_tests (list): List of testing partitions of predictors y. 232 | t_train (np.ndarray): Training partition of datetimes for X. 233 | t_tests (list): List of testing partitions of datetimes for X. 234 | fit_function (function): The function to use to fit clf. 235 | predict_function (function): The function to predict with. 236 | rebalancers (list): A list of rebalancers to alter the training set. 237 | rejectors (list): A list of rejectors to reject poor predictions. 238 | selectors (list): A list of selectors to pick test items to train with. 239 | 240 | Returns: 241 | dict: Performance metrics for each round of predictions, including 242 | precision, recall, F1 score, AUC ROC, TPR, TNR, FPR, FNR, TP, FP, 243 | TN, FN, actual positive and actual negative counts. 244 | 245 | See Also: 246 | tesseract.temporal.time_aware_train_test_split 247 | tesseract.evaluation.Stage 248 | tesseract.selection.Selector 249 | tesseract.rejection.Rejector 250 | tesseract.rebalancing.Rebalancer 251 | 252 | """ 253 | fit_function = clf.fit if fit_function is None else fit_function 254 | predict_function = (utils.select_prediction_function(clf, labels_only=True) 255 | if predict_function is None else predict_function) 256 | 257 | for stage in tuple(rebalancers) + tuple(rejectors) + tuple(selectors): 258 | stage.resolve_schedule(len(X_tests)) 259 | 260 | results = {} 261 | selected_indexes = None 262 | for i, (X_test, y_test, t_test) in tqdm(enumerate( 263 | zip(X_tests, y_tests, t_tests))): 264 | 265 | # --------------------------------------------------------------- # 266 | # Make alterations to the dataset before testing (optional) # 267 | # --------------------------------------------------------------- # 268 | 269 | for rebalancer in rebalancers: 270 | if not rebalancer.schedule[i]: 271 | continue 272 | 273 | X_train, y_train, t_train = rebalancer.alter( 274 | clf, X_train, y_train, t_train, X_test, y_test, t_test) 275 | 276 | # --------------------------------------------------------------- # 277 | # (Re)fit and predict # 278 | # --------------------------------------------------------------- # 279 | 280 | results = metrics.get_train_info( 281 | X_train, y_train, t_train, existing=results) 282 | 283 | if selected_indexes is not None or i == 0: 284 | fit_function(X_train, y_train) 285 | 286 | y_pred = predict_function(X_test) 287 | 288 | # --------------------------------------------------------------- # 289 | # Discard/quarantine observations (optional) # 290 | # --------------------------------------------------------------- # 291 | 292 | kept_indexes, rejected_indexes = None, None 293 | for rejector in rejectors: 294 | if not rejector.schedule[i]: 295 | continue 296 | 297 | kept_indexes, rejected_indexes = rejector.reject_wrapper( 298 | clf, X_train, y_train, t_train, 299 | X_test, y_test, t_test, 300 | 
kept_indexes, rejected_indexes) 301 | 302 | # cause bug that X_test doesn't change 303 | if kept_indexes is not None: 304 | y_test = y_test[kept_indexes] 305 | y_pred = y_pred[kept_indexes] 306 | t_test = t_test[kept_indexes] 307 | 308 | results['rejected'].append(rejected_indexes.size) 309 | else: 310 | results['rejected'].append(0) 311 | 312 | # --------------------------------------------------------------- # 313 | # Calculate performance # 314 | # --------------------------------------------------------------- # 315 | 316 | results = metrics.calculate_metrics( 317 | y_test, y_pred, existing=results) 318 | 319 | # --------------------------------------------------------------- # 320 | # Select test observations for retraining (optional) # 321 | # --------------------------------------------------------------- # 322 | 323 | selected_indexes = None 324 | for selector in selectors: 325 | if not selector.schedule[i]: 326 | continue 327 | 328 | selected_indexes = selector.query_wrapper( 329 | clf, X_train, y_train, t_train, 330 | X_test, y_test, t_test, selected_indexes) 331 | 332 | if selected_indexes is not None: 333 | # Select observations for training using chosen indices 334 | X_selected = X_test[selected_indexes] 335 | y_selected = y_test[selected_indexes] 336 | t_selected = t_test[selected_indexes] 337 | 338 | # Update training model with N selected points 339 | X_train = scipy.sparse.vstack((X_train, X_selected)) 340 | y_train = np.hstack((y_train, y_selected)) 341 | t_train = np.hstack((t_train, t_selected)) 342 | 343 | results['selected'].append(selected_indexes.size) 344 | else: 345 | results['selected'].append(0) 346 | 347 | if 'y_preds' not in results: 348 | results['y_tests'] = [y_test] 349 | results['y_preds'] = [y_pred] 350 | results['t_tests'] = [t_test] 351 | else: 352 | results['y_tests'].append(y_test) 353 | results['y_preds'].append(y_pred) 354 | results['t_tests'].append(t_test) 355 | 356 | return results 357 | 358 | 359 | def predict(clf, X_tests, decision_threshold=None, 360 | labels_only=False, predict_function=None, nproc=1): 361 | """Standalone prediction of a set of test periods. 362 | 363 | Takes a set of historically aware test periods and performs prediction 364 | across them. This can be useful when there is no need for the interactive 365 | stages of a prediction as in that case the process can be performed in 366 | parallel. 367 | 368 | Example: 369 | >>> from sklearn.ensemble import RandomForestClassifier 370 | >>> from tesseract import mock, temporal, evaluation, metrics 371 | >>> 372 | >>> X, y, t = mock.generate_binary_test_data(10000, '2000') 373 | >>> 374 | >>> splits = temporal.time_aware_train_test_split( 375 | >>> X, y, t, train_size=6, test_size=2, granularity='month') 376 | >>> 377 | >>> X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 378 | >>> 379 | >>> clf = RandomForestClassifier(n_estimators=101, max_depth=64) 380 | >>> clf.fit(X_train, y_train) 381 | >>> 382 | >>> y_preds = evaluation.predict(clf, X_tests, nproc=4) 383 | >>> results = metrics.calculate_metrics(y_tests, y_preds, periods=-1) 384 | >>> metrics.print_metrics(results) 385 | 386 | Args: 387 | clf: A scikit-learn or Keras classifier with fit and predict methods. 388 | X_tests (list): List of testing partitions of predictors X. 389 | decision_threshold (float): Calibrate prediction function by 390 | supplying a threshold over which scores are labelled positive. 391 | This is intended for classifiers that output probabilities only. 
392 | labels_only (bool): Prefer a labelling prediction function over one 393 | that outputs raw scores. 394 | predict_function (function): A custom function to predict with. 395 | nproc (int): The number of processors to use. 396 | 397 | Returns: 398 | list: A list of np.array objects containing the classification results 399 | for each test period in X_tests. 400 | 401 | """ 402 | predict_function = ( 403 | utils.select_prediction_function(clf, labels_only=labels_only) if 404 | predict_function is None else predict_function) 405 | 406 | # `nproc = -1` becomes `nproc = mp.cpu_count() + (- 1)`, etc 407 | nproc = mp.cpu_count() + nproc if nproc < 0 else nproc 408 | 409 | # Predictions have no dependencies in this context, we can parallelize them 410 | if nproc > 1: 411 | with mp.Pool(nproc) as p: 412 | y_preds = list(tqdm( 413 | p.imap(predict_function, X_tests), total=len(X_tests))) 414 | 415 | # Avoid invoking parallelism and associated overhead for a single CPU 416 | else: 417 | y_preds = [] 418 | for X_test in tqdm(X_tests): 419 | y_pred = predict_function(X_test) 420 | y_preds.append(y_pred) 421 | 422 | # TODO | Move to an "apply_decision_threshold" function to better test 423 | # TODO | and include the option in fit_predict_update (probas only). 424 | if decision_threshold: 425 | for i, y_pred in enumerate(y_preds): 426 | if y_pred.ndim > 1: 427 | y_scores = np.array([np.max(v) for v in y_pred]) 428 | else: 429 | y_scores = y_pred 430 | y_preds[i] = np.array(y_scores > decision_threshold, dtype=int) 431 | 432 | return y_preds 433 | 434 | 435 | -------------------------------------------------------------------------------- /tesseract/transcendent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import statistics 4 | import numpy as np 5 | from tqdm import tqdm 6 | import pickle as pkl 7 | from sklearn import metrics as mtcs 8 | import multiprocessing as mp 9 | from itertools import repeat 10 | 11 | 12 | def sort_by_predicted_label( 13 | scores, predicted_labels, groundtruth_labels, consider='correct'): 14 | """Sort scores into lists of their respected predicted classes. 15 | 16 | Divide a set of scores into 'predicted positive' and 'predicted 17 | negative' results. Optionally consider only correct or incorrect 18 | predictions. `scores`, `predicted_labels`, and `groundtruth_labels` 19 | should be aligned (one per observation). 20 | 21 | Example: 22 | >>> s = np.array([0.8, 0.7, 0.6, 0.9]) 23 | >>> y_pred = np.array([1, 1, 0, 0]) 24 | >>> y_true = np.array([1, 0, 1, 0]) 25 | >>> sort_by_predicted_label(s, y_pred, y_true, 'correct') 26 | (array([0.8]), array([0.9])) 27 | >>> sort_by_predicted_label(s, y_pred, y_true, 'incorrect') 28 | (array([0.7]), array([0.6])) 29 | >>> sort_by_predicted_label(s, y_pred, y_true, 'all') 30 | (array([0.8, 0.7]), array([0.6, 0.9])) 31 | 32 | Args: 33 | scores (np.ndarray): Predicted scores to be sorted. 34 | predicted_labels (np.ndarray): The prediction outcome for each object. 35 | groundtruth_labels (np.ndarray): The groundtruth label for each object. 36 | consider (str): ['correct'|'incorrect'|'all']. Whether to consider only 37 | correct predictions, incorrect predictions, or not to distinguish 38 | between them. 39 | 40 | Returns: 41 | (np.ndarray, np.ndarray): Tuple of sorted scores (malware, goodware). 
42 | 43 | """ 44 | 45 | def predicted(i, k): 46 | return predicted_labels[i] == k 47 | 48 | def correct(i, k): 49 | return predicted(i, k) and (groundtruth_labels[i] == k) 50 | 51 | def incorrect(i, k): 52 | return predicted(i, k) and (groundtruth_labels[i] == (k ^ 1)) 53 | 54 | if consider == 'correct': 55 | select = correct 56 | elif consider == 'incorrect': 57 | select = incorrect 58 | elif consider == 'all': 59 | select = predicted 60 | else: 61 | raise ValueError('Unknown thresholding criteria!') 62 | 63 | scores_mw = [scores[i] for i in range(len(scores)) if select(i, 1)] 64 | scores_gw = [scores[i] for i in range(len(scores)) if select(i, 0)] 65 | 66 | return np.array(scores_mw), np.array(scores_gw) 67 | 68 | 69 | def apply_threshold(binary_thresholds, test_scores, y_test): 70 | """Returns a 'keep mask' describing which elements to include. 71 | 72 | Elements that fall above the threshold (and should be kept) have 73 | their indexes marked TRUE. 74 | 75 | Elements that fall below the threshold (and should be rejected) have 76 | their indexes marked FALSE. 77 | 78 | `binary_thresholds` expects a dictionary keyed by 'cred' and/or 'conf', 79 | with sub-dictionaries containing the thresholds for the mw and gw classes. 80 | 81 | Note that the keys of `binary_thresholds` determine _which_ thresholding 82 | criteria will be enforced. That is, if only a 'cred' dictionary is supplied 83 | thresholding will be enforced on cred-only and the same for 'conf'. 84 | Supplying cred and conf dictionaries will enforce the 'cred+conf' 85 | thresholding criteria (all thresholds will be applied). 86 | 87 | `test_scores` expects a dictionary in much the same way, with at least the 88 | same keys as `binary_thresholds` ('cred' and/or 'conf' at the top level). 89 | 90 | Example: 91 | >>> thresholds = {'cred': {'mw': 0.4, 'gw': 0.6}, 92 | ... 'conf': {'mw': 0.5, 'gw': 0.8}} 93 | >>> scores = {'cred': [0.4, 0.2, 0.7, 0.8, 0.6], 94 | ... 'conf': [0.6, 0.8, 0.3, 0.2, 0.4]} 95 | >>> y = np.array([1, 1, 1, 0, 0]) 96 | >>> apply_threshold(thresholds, scores, y) 97 | array([ True, False, False, False, False]) 98 | 99 | Args: 100 | binary_thresholds(dict): The threshold to apply. 101 | test_scores (dict): The test scores to apply the threshold to. 102 | y_test (np.ndarray): The set of predictions to decide which 'per-class' 103 | threshold to use. Depending on the stage of conformal evaluation, 104 | this could be either the predicted or ground truth labels. 105 | 106 | Returns: 107 | np.ndarray: Boolean mask to use on the elements (1 = kept, 0 = reject). 
108 | 109 | """ 110 | # Assert preconditions 111 | assert (set(binary_thresholds.keys()) in 112 | [{'cred'}, {'conf'}, {'cred', 'conf'}]) 113 | 114 | for key in binary_thresholds.keys(): 115 | assert key in test_scores.keys() 116 | assert set(binary_thresholds[key].keys()) == {'mw', 'gw'} 117 | 118 | def get_class_threshold(criteria, k): 119 | return (binary_thresholds[criteria]['mw'] if k == 1 120 | else binary_thresholds[criteria]['gw']) 121 | 122 | keep_mask = [] 123 | for i, y_prediction in enumerate(y_test): 124 | 125 | cred_threshold, conf_threshold = 0, 0 126 | current_cred, current_conf = 0, 0 127 | 128 | if 'cred' in binary_thresholds: 129 | key = 'cred' 130 | current_cred = test_scores[key][i] 131 | cred_threshold = get_class_threshold(key, y_prediction) 132 | 133 | if 'conf' in binary_thresholds: 134 | key = 'conf' 135 | current_conf = test_scores[key][i] 136 | conf_threshold = get_class_threshold(key, y_prediction) 137 | 138 | keep_mask.append( 139 | (current_cred >= cred_threshold) and 140 | (current_conf >= conf_threshold)) 141 | 142 | return np.array(keep_mask, dtype=bool) 143 | 144 | 145 | def get_performance_with_rejection(y_true, y_pred, keep_mask, full=True): 146 | """Get test results, rejecting predictions based on a given keep mask. 147 | 148 | Args: 149 | y_true (np.ndarray): The groundtruth label for each object. 150 | y_pred (np.ndarray): The set of predictions to decide which 'per-class' 151 | threshold to use. Depending on the stage of conformal evaluation, 152 | this could be either the predicted or ground truth labels. 153 | keep_mask (np.ndarray): A boolean mask describing which elements to 154 | keep (True) or reject (False). 155 | full (bool): True if full statistics are required, False otherwise. 156 | False is computationally less expensive. 157 | 158 | Returns: 159 | dict: A dictionary of results for baseline, kept, and rejected metrics. 
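
    Example (a minimal sketch with hand-made labels, predictions and keep
    mask):

        >>> y_true = np.array([1, 1, 0, 0])
        >>> y_pred = np.array([1, 0, 0, 1])
        >>> keep_mask = np.array([True, True, True, False])
        >>> d = get_performance_with_rejection(y_true, y_pred, keep_mask, full=False)
        >>> float(d['kept_total_perc'])  # 3 of the 4 predictions are kept
        0.75
        >>> int(d['reject_pos'])  # 1 of the 2 positive predictions is rejected
        1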
160 | 161 | """ 162 | y_true = np.array(y_true) 163 | y_pred = np.array(y_pred) 164 | 165 | d = {} 166 | 167 | total_neg = len(y_pred) - sum(y_pred) 168 | total_pos = sum(y_pred) 169 | 170 | kept_total_perc = sum(keep_mask) / len(keep_mask) 171 | reject_total_perc = sum(~keep_mask) / len(keep_mask) 172 | 173 | kept_neg = len(y_pred[keep_mask]) - sum(y_pred[keep_mask]) 174 | kept_pos = sum(y_pred[keep_mask]) 175 | 176 | reject_neg = total_neg - kept_neg 177 | reject_pos = total_pos - kept_pos 178 | 179 | kept_neg_perc = (kept_neg / total_neg) 180 | kept_pos_perc = (kept_pos / total_pos) 181 | 182 | reject_neg_perc = 1 - kept_neg_perc 183 | reject_pos_perc = 1 - kept_pos_perc 184 | 185 | reject_neg_total = reject_neg / len(y_pred) 186 | reject_pos_total = reject_pos / len(y_pred) 187 | 188 | d.update({'total_neg': total_neg, 189 | 'total_pos': total_pos, 190 | 'kept_total_perc': kept_total_perc, 191 | 'reject_total_perc': reject_total_perc, 192 | 'kept_neg': kept_neg, 'kept_pos': kept_pos, 193 | 'reject_neg': reject_neg, 'reject_pos': reject_pos, 194 | 'kept_neg_perc': kept_neg_perc, 195 | 'kept_pos_perc': kept_pos_perc, 196 | 'reject_neg_perc': reject_neg_perc, 197 | 'reject_pos_perc': reject_pos_perc, 198 | 'reject_neg_total': reject_neg_total, 199 | 'reject_pos_total': reject_pos_total}) 200 | 201 | f1_b = mtcs.f1_score(y_true, y_pred) 202 | f1_k = mtcs.f1_score(y_true[keep_mask], 203 | y_pred[keep_mask]) 204 | f1_r = mtcs.f1_score(y_true[~keep_mask], 205 | y_pred[~keep_mask]) 206 | 207 | d.update({'f1_b': f1_b, 'f1_k': f1_k, 'f1_r': f1_r}) 208 | 209 | precision_b = mtcs.precision_score(y_true, y_pred) 210 | 211 | precision_k = mtcs.precision_score(y_true[keep_mask], 212 | y_pred[keep_mask]) 213 | precision_r = mtcs.precision_score(y_true[~keep_mask], 214 | y_pred[~keep_mask]) 215 | d.update({'precision_b': precision_b, 216 | 'precision_k': precision_k, 217 | 'precision_r': precision_r}) 218 | 219 | recall_b = mtcs.recall_score(y_true, y_pred) 220 | 221 | recall_k = mtcs.recall_score(y_true[keep_mask], 222 | y_pred[keep_mask]) 223 | recall_r = mtcs.recall_score(y_true[~keep_mask], 224 | y_pred[~keep_mask]) 225 | d.update({'recall_b': recall_b, 'recall_k': recall_k, 'recall_r': recall_r}) 226 | 227 | if full: 228 | cf_baseline = mtcs.confusion_matrix(y_true, y_pred) 229 | 230 | cf_keep = mtcs.confusion_matrix(y_true[keep_mask], 231 | y_pred[keep_mask]) 232 | cf_reject = mtcs.confusion_matrix(y_true[~keep_mask], 233 | y_pred[~keep_mask]) 234 | try: 235 | tn_b, fp_b, fn_b, tp_b = cf_baseline.ravel() 236 | tn_k, fp_k, fn_k, tp_k = cf_keep.ravel() 237 | tn_r, fp_r, fn_r, tp_r = cf_reject.ravel() 238 | except Exception as e: 239 | print(f'Transcendent met a problem: {e}') 240 | 241 | return d 242 | 243 | d.update({ 244 | 'tn_b': tn_b, 'fp_b': fp_b, 'fn_b': fn_b, 'tp_b': tp_b, 245 | 'tn_k': tn_k, 'fp_k': fp_k, 'fn_k': fn_k, 'tp_k': tp_k, 246 | 'tn_r': tn_r, 'fp_r': fp_r, 'fn_r': fn_r, 'tp_r': tp_r 247 | }) 248 | 249 | d['tpr_b'] = tp_b / (tp_b + fn_b) 250 | d['tpr_k'] = tp_k / (tp_k + fn_k) 251 | d['tpr_r'] = tp_r / (tp_r + fn_r) 252 | 253 | d['fpr_b'] = fp_b / (fp_b + tn_b) 254 | d['fpr_k'] = fp_k / (fp_k + tn_k) 255 | d['fpr_r'] = fp_r / (fp_r + tn_r) 256 | 257 | return d 258 | 259 | 260 | def test_with_rejection( 261 | binary_thresholds, test_scores, groundtruth_labels, predicted_labels, full=True): 262 | """Get test results, rejecting predictions based on a given threshold. 
263 | 264 | `binary_thresholds` expects a dictionary keyed by 'cred' and/or 'conf', 265 | with sub-dictionaries containing the thresholds for the mw and gw classes. 266 | 267 | Note that the keys of `binary_thresholds` determine _which_ thresholding 268 | criteria will be enforced. That is, if only a 'cred' dictionary is supplied 269 | thresholding will be enforced on cred-only and the same for 'conf'. 270 | Supplying cred and conf dictionaries will enforce the 'cred+conf' 271 | thresholding criteria (all thresholds will be applied). 272 | 273 | `test_scores` expects a dictionary in much the same way, with at least the 274 | same keys as `binary_thresholds` ('cred' and/or 'conf' at the top level). 275 | 276 | See Also: 277 | - `apply_threshold` 278 | - `get_performance_with_rejection` 279 | 280 | Args: 281 | binary_thresholds (dict): The threshold to apply. 282 | test_scores (dict): The test scores to apply the threshold to. 283 | groundtruth_labels (np.ndarray): The groundtruth label for each object. 284 | predicted_labels (np.ndarray): The set of predictions to decide which 285 | 'per-class' threshold to use. Depending on the stage of conformal 286 | evaluation, this could be either the predicted or ground truth 287 | labels. 288 | full (boolean): Optimization flag which dictates how much data to return, 289 | default is True. False gives a lot more performance but removes a lot 290 | of metrics. 291 | 292 | Returns: 293 | dict: A dictionary of results for baseline, kept, and rejected metrics. 294 | 295 | """ 296 | keep_mask = apply_threshold( 297 | binary_thresholds=binary_thresholds, 298 | test_scores=test_scores, 299 | y_test=predicted_labels) 300 | 301 | results = get_performance_with_rejection( 302 | y_true=groundtruth_labels, 303 | y_pred=predicted_labels, 304 | keep_mask=keep_mask, 305 | full=full) 306 | 307 | return results 308 | 309 | 310 | def random_threshold(scores, predicted_labels): 311 | """Produce random thresholds over the given scores. 312 | 313 | Args: 314 | scores (dict): The test scores on which to produce a threshold. 315 | predicted_labels (np.ndarray): The set of predictions to decide which 316 | 'per-class' threshold to use. 317 | 318 | Returns: 319 | dict: Set of thresholds for malware ('gw') and goodware ('gw') classes. 320 | 321 | """ 322 | scores_mw, scores_gw = sort_by_predicted_label( 323 | scores, predicted_labels, np.array([]), 'all') 324 | mw_threshold = np.random.uniform(min(scores_mw), max(scores_mw)) 325 | gw_threshold = np.random.uniform(min(scores_gw), max(scores_gw)) 326 | return {'mw': mw_threshold, 'gw': gw_threshold} 327 | 328 | 329 | def format_opts(metrics, results): 330 | """Helper function for formatting the results of a list of metrics.""" 331 | return ('{}: {:.4f} | ' * len(metrics)).format( 332 | *[item for sublist in 333 | zip(metrics, [results[k] for k in metrics]) for 334 | item in sublist]) 335 | 336 | 337 | def find_random_search_thresholds( 338 | scores, predicted_labels, groundtruth_labels, 339 | max_metrics='f1_k,kept_total_perc', min_metrics='f1_r', 340 | ceiling=0.25, max_samples=100, objective_func=None): 341 | """Perform a random grid search to find the best thresholds on `scores`. 342 | 343 | `scores` expects a dictionary keyed by 'cred' and/or 'conf', 344 | with sub-dictionaries containing the thresholds for the mw and gw classes. 345 | 346 | Note that the keys of `scores` determine _which_ thresholding criteria will 347 | be enforced. 
That is, if only a 'cred' dictionary is supplied, thresholding 348 | will be enforced on cred-only and the same for 'conf'. Supplying cred and 349 | conf dictionaries will enforce the 'cred+conf' thresholding criteria (all 350 | thresholds will be applied). 351 | 352 | `max_metrics` and `min_metrics` describe the metrics that should be 353 | maximised or minimised if the default objective function is being used 354 | (a harmonic mean, selected with `objective_func=None`). It expects either 355 | a list of possible metrics, or a string or comma separated metrics. 356 | 357 | For example, both of the following are acceptable: 358 | 359 | > max_metrics = ['f1_k', 'kept_total_perc'] 360 | > max_metrics = 'f1_k,kept_total_perc' 361 | 362 | `ceiling` describes the constraints of the optimization function. If any of 363 | the selected metrics exceed the value given then the thresholds chosen are 364 | discarded. `ceiling` expects a dictionary of metrics and maximum acceptable 365 | values. Alternatively, arguments can be given in string form as comma- 366 | separated key:value pairs, e.g., 'key1:value1,key2:value2,key3:value3'. 367 | Finally, if a float is provided, it's interpreted as being the maximum 368 | acceptable value for the total number of rejected predictions. 369 | 370 | To summarise, all of the following are equivalent: 371 | 372 | > ceiling = {'total_reject_perc': 0.25} 373 | > ceiling = 'total_reject_perc:0.25' 374 | > ceiling = 0.25 375 | 376 | For a list of possible metrics, see the keys in the dict produced by 377 | `get_performance_with_rejection()`. Note that the default objective 378 | function assumes that the provided metrics are in the interval [0,1]. 379 | 380 | `objective_func` is the objective function to maximise during the random 381 | search. By default (`objective_func=None`), it will maximise the harmonic 382 | mean of the given `max_metrics` and 1 - each of the given `min_metrics`. 383 | 384 | A custom objective function can be provided which should expect a result 385 | dictionary of metrics just like the dictionary produced by 386 | `get_performance_with_rejection()`. 387 | 388 | See Also: 389 | - `get_performance_with_rejection` 390 | 391 | Args: 392 | scores (dict): The test scores on which to perform the random search. 393 | predicted_labels (np.ndarray): The set of predictions to decide which 394 | 'per-class' threshold to use. 395 | groundtruth_labels (np.ndarray): The groundtruth label for each object. 396 | max_metrics: The metrics that should be maximised. 397 | min_metrics: The metrics that should be minimised. 398 | ceiling: Can be passed an empty dict if you don't want to enforce any 399 | constraint in this way. 400 | max_samples (int): The maximum number of random threshold combinations 401 | to try before settling for the best performance up to that point. 402 | objective_func (function): The objective function to maximise. 403 | 404 | Returns: 405 | dict: Set of thresholds for malware ('gw') and goodware ('gw') classes. 406 | 407 | """ 408 | 409 | # Resolve possible formats for `max_metrics` and `min_metrics`. 410 | def resolve_opt_list(x): 411 | return x.split(',') if isinstance(x, str) else x 412 | 413 | min_metrics = resolve_opt_list(min_metrics) 414 | max_metrics = resolve_opt_list(max_metrics) 415 | 416 | # Resolve possible formats of `ceiling`. 
417 | ceiling = {} if ceiling is None else ceiling 418 | 419 | ceiling = ({'total_reject_perc': ceiling} 420 | if isinstance(ceiling, (int, float)) else ceiling) 421 | 422 | if isinstance(ceiling, str): 423 | pairs = ceiling.split(',') 424 | pairs = [x.split(':') for x in pairs] 425 | ceiling = {k: float(v) for k, v in pairs} 426 | 427 | # Resolve objective function to use during the optimization. 428 | def harm_mean(d): 429 | maximise = [d[m] for m in max_metrics] 430 | maximise.extend([1 - d[m] for m in min_metrics]) 431 | return statistics.harmonic_mean(maximise) 432 | 433 | objective_func = harm_mean if objective_func is None else objective_func 434 | 435 | best_outcome, n_samples = 0, 0 436 | best_thresholds, best_results = {}, {} 437 | 438 | while True: 439 | # Choose and package random thresholds 440 | thresholds = {} 441 | if 'cred' in scores: 442 | cred_thresholds = random_threshold(scores['cred'], predicted_labels) 443 | thresholds['cred'] = cred_thresholds 444 | if 'conf' in scores: 445 | conf_thresholds = random_threshold(scores['conf'], predicted_labels) 446 | thresholds['conf'] = conf_thresholds 447 | 448 | # Test with chosen thresholds 449 | results = test_with_rejection( 450 | thresholds, scores, groundtruth_labels, predicted_labels, full=True) 451 | 452 | # Check if any results exceed given constraints (e.g. too many rejects) 453 | unacceptable = [results[k] > v for k, v in ceiling.items()] 454 | if any(unacceptable): 455 | continue 456 | 457 | # 'Score' current thresholds with objective function 458 | outcome = objective_func(results) 459 | 460 | # If current thresholds are better, save new best outcomes 461 | if outcome > best_outcome: 462 | best_outcome = outcome 463 | best_thresholds = thresholds 464 | best_results = results 465 | 466 | logging.info('New best: [{:.4f}] @ {} || Max: {}Min: {}'.format( 467 | outcome, thresholds, 468 | format_opts(max_metrics, results), 469 | format_opts(min_metrics, results))) 470 | # report_results(results) 471 | logging.warning('{} combinations sampled so far!'.format(n_samples)) 472 | 473 | # If the maximum number of thresholds have been sampled, abort search 474 | if max_samples is not None and n_samples >= max_samples: 475 | logging.warning( 476 | 'Max samples reached ({}) - search aborted'.format(max_samples)) 477 | logging.info('Settling for: [{}] @ {} || Max: {}Min: {}'.format( 478 | best_outcome, best_thresholds, 479 | format_opts(max_metrics, best_results), 480 | format_opts(min_metrics, best_results))) 481 | # report_results(results) 482 | 483 | return best_thresholds 484 | 485 | n_samples += 1 486 | 487 | 488 | def package_cred_conf(cred_values, conf_values, criteria): 489 | package = {} 490 | if 'cred' in criteria: 491 | package['cred'] = cred_values 492 | if 'conf' in criteria: 493 | package['conf'] = conf_values 494 | 495 | return package 496 | 497 | 498 | def compute_single_cred_p_value( 499 | train_ncms, groundtruth_train, single_test_ncm, single_y_test): 500 | """Compute a single credibility p-value. 501 | 502 | Credibility p-values describe how 'conformal' a point is with respect to 503 | the other objects of that class. They're computed as the proportion of 504 | points with greater NCMs (the number of points _less conforming_ than the 505 | reference point) over the total number of points. 506 | 507 | Intuitively, a point predicted as malware which is the further away from 508 | the decision boundary than any other point will have the highest p-value 509 | out of all other malware points. 
It will have the smallest NCM (as it is 510 | the least _non-conforming_) and thus no other points will have a greater 511 | NCM and it will have a credibility p-value of 1. 512 | 513 | Args: 514 | train_ncms (np.ndarray): An array of training NCMs to compare the 515 | reference point against. 516 | groundtruth_train (np.ndarray): An array of ground truths corresponding 517 | to `train_ncms`. 518 | single_test_ncm (float): A single reference point to compute the 519 | p-value of. 520 | single_y_test (int): Either the ground truth (calibration) or predicted 521 | label (testing) of `single_test_ncm`. 522 | 523 | See Also: 524 | - `compute_p_values_cred_and_conf` 525 | - `compute_single_conf_p_value` 526 | 527 | Returns: 528 | float: The p-value for `single_test_ncm` w.r.t. `train_ncms`. 529 | 530 | """ 531 | assert len(set(groundtruth_train)) == 2 # binary classification tasks only 532 | 533 | how_many_are_greater_than_single_test_ncm = 0 534 | 535 | for ncm, groundtruth in zip(train_ncms, groundtruth_train): 536 | if groundtruth == single_y_test and ncm >= single_test_ncm: 537 | how_many_are_greater_than_single_test_ncm += 1 538 | 539 | single_cred_p_value = (how_many_are_greater_than_single_test_ncm / 540 | sum(1 for y in groundtruth_train if 541 | y == single_y_test)) 542 | return single_cred_p_value 543 | 544 | 545 | def compute_single_conf_p_value( 546 | train_ncms, groundtruth_train, single_test_ncm, single_y_test): 547 | """Compute a single confidence p-value. 548 | 549 | The confidence p-value is computed similarly to the credibility p-value, 550 | except it aims to capture the confidence that the classifier has that the 551 | point _doesn't_ belong to the opposite class. 552 | 553 | To achieve this we assume that point has the label of the second highest 554 | scoring class---in binary classification, simply the opposite class---and 555 | compute the credibility p-value with respect to other points of that class. 556 | The confidence p-value is (1 - this value). 557 | 558 | Note that in transductive conformal evaluation, the entire classifier 559 | should be retrained with the reference point given the label of the 560 | opposite class. Usually, this is computationally prohibitive, and so this 561 | approximation assumes that the decision boundary undergoes only minimal 562 | changes when the label of a single point is flipped. 563 | 564 | See Also: 565 | - `compute_p_values_cred_and_conf` 566 | - `compute_single_cred_p_value` 567 | 568 | Args: 569 | train_ncms (np.ndarray): An array of training NCMs to compare the 570 | reference point against. 571 | groundtruth_train (np.ndarray): An array of ground truths corresponding 572 | to `train_ncms`. 573 | single_test_ncm (float): A single reference point to compute the 574 | p-value of. 575 | single_y_test (int): Either the ground truth (calibration) or predicted 576 | label (testing) of `single_test_ncm`. 577 | 578 | Returns: 579 | float: The p-value for `single_test_ncm` w.r.t. `train_ncms`. 
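
    Example (a small hand-worked illustration; the NCM values are made up).
    For a point predicted as malware (label 1) the opposite class is 0, the
    test NCM 0.3 is flipped to -0.3, and one of the two goodware calibration
    NCMs (0.5) is greater than or equal to -0.3, so the opposite-class
    credibility is 0.5 and the confidence p-value is 1 - 0.5 = 0.5:

        >>> train_ncms = [-0.8, 0.5, 0.9, -0.2]
        >>> y_train = [0, 0, 1, 1]
        >>> compute_single_conf_p_value(train_ncms, y_train, 0.3, 1)
        0.5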
580 | 581 | """ 582 | assert len(set(groundtruth_train)) == 2 # binary classification tasks only 583 | 584 | # 'Cast' NCMs to NCMs with respect to the opposite class (binary only) 585 | # train_ncms_opposite_class = -1 * np.array(train_ncms) 586 | single_y_test_opposite_class = 0 if single_y_test == 1 else 1 587 | single_test_ncm_opposite_class = -1 * single_test_ncm 588 | 589 | how_many_are_greater_than_single_test_ncm = 0 590 | 591 | for ncm, groundtruth in zip(train_ncms, groundtruth_train): 592 | if (groundtruth == single_y_test_opposite_class 593 | and ncm >= single_test_ncm_opposite_class): 594 | how_many_are_greater_than_single_test_ncm += 1 595 | 596 | single_cred_p_value_opposite_class = ( 597 | how_many_are_greater_than_single_test_ncm / 598 | sum(1 for y in groundtruth_train if 599 | y == single_y_test_opposite_class)) 600 | 601 | return 1 - single_cred_p_value_opposite_class # confidence p value 602 | 603 | 604 | def compute_p_values_cred_and_conf( 605 | train_ncms, groundtruth_train, test_ncms, y_test): 606 | """Helper function to compute p-values across an entire array.""" 607 | cred = [compute_single_cred_p_value(train_ncms=train_ncms, 608 | groundtruth_train=groundtruth_train, 609 | single_test_ncm=ncm, 610 | single_y_test=y) 611 | for ncm, y in tqdm( 612 | zip(test_ncms, y_test), total=len(y_test), desc='cred pvals', position=0, leave=True)] 613 | # conf = [compute_single_conf_p_value(train_ncms=train_ncms, 614 | # groundtruth_train=groundtruth_train, 615 | # single_test_ncm=ncm, 616 | # single_y_test=y) 617 | # for ncm, y in tqdm( 618 | # zip(test_ncms, y_test), total=len(y_test), desc='conf pvals', position=0, leave=True)] 619 | 620 | return {'cred': cred} 621 | # , 'conf': conf 622 | 623 | 624 | def get_svm_ncms(decision_function, X_in, y_in): 625 | """Helper functions to get NCMs across an entire pair of X,y arrays. """ 626 | return [get_single_svm_ncm(decision_function, x, y) for x, y in 627 | tqdm(zip(X_in, y_in), total=len(y_in), desc='svm ncms', position=0, leave=True)] 628 | 629 | 630 | def get_single_svm_ncm(decision_function, single_x, single_y): 631 | """Collect a non-conformity measure from the classifier for `single_x`. 632 | 633 | A note about SVM ncms: In binary classification with a linear SVM, the 634 | output score is the distance from the hyperplane with respect to the 635 | positive class. If the score is negative, the prediction is class 0, if 636 | positive, it's class 1 (in sklearn technically it will be clf.class_[0] and 637 | clf.class_[1] respectively). To perform thresholding with conformal 638 | evaluator, we need the distance from the hyperplane with respect to *both* 639 | classes, so we simply flip the sign to get the 'reflection' for the other 640 | class. 641 | 642 | Args: 643 | clf (sklearn.svm.SVC): The classifier to use for the NCMs. 644 | single_x (np.ndarray): An single feature vector to get the NCM for. 645 | single_y (int): The ground truth corresponding to feature vector 646 | `single_x`. 647 | 648 | Returns: 649 | float: The NCM for the given `single_x`. 650 | 651 | """ 652 | decision = decision_function(single_x) 653 | 654 | # If y (ground truth in calibration, prediction in testing) is malware 655 | # then flip the sign to ensure the most conforming point is most minimal. 656 | # decision = -abs(decision) 657 | # mal;1 -> 0 658 | if single_y == 1: 659 | return -decision 660 | elif single_y == 0: 661 | return decision 662 | raise Exception('Unknown class? 
Only binary decisions supported.') 663 | 664 | 665 | def cache_data(model, data_path): 666 | """Cache data (trained model, computed p-values, etc). 667 | 668 | Args: 669 | model: The data to save. 670 | data_path: (str) To avoid mix-ups, and to allow safe caching of models 671 | produced during calibration, it's advised to keep this location 672 | 'fold-specific'. 673 | 674 | See Also: 675 | - `load_cached_data` 676 | 677 | """ 678 | 679 | model_folder_path = os.path.dirname(data_path) 680 | 681 | if not os.path.exists(model_folder_path): 682 | os.makedirs(model_folder_path) 683 | 684 | logging.info('Saving data to {}...'.format(data_path)) 685 | with open(data_path, 'wb') as f: 686 | pkl.dump(model, f) 687 | logging.debug('Done.') 688 | 689 | 690 | def load_cached_data(data_path): 691 | """Load cached data (trained model, computed p-values, etc). 692 | 693 | Args: 694 | data_path: (str) To avoid mix-ups, and to allow safe caching of models 695 | produced during calibration, it's advised to keep this location 696 | 'fold-specific'. 697 | 698 | See Also: 699 | - `cache_data` 700 | 701 | Returns: 702 | The previously cached data. 703 | 704 | """ 705 | logging.info('Loading data from {}...'.format(data_path)) 706 | with open(data_path, 'rb') as f: 707 | model = pkl.load(f) 708 | logging.debug('Done.') 709 | return model 710 | 711 | 712 | def train_calibration_ice(clf, X_proper_train, X_cal, 713 | y_proper_train, y_cal, 714 | fold_index): 715 | """Train calibration set (for a single fold). 716 | 717 | Quite a bit of information is needed here for the later p-value 718 | computation and probability comparison. The returned dictionary has 719 | the following structure: 720 | 721 | 'cred_p_val_cal_fold' --> # Calibration credibility p values 722 | 'conf_p_val_cal_fold' --> # Calibration confidence p values 723 | 'ncms_cal_fold' --> # Calibration NCMs 724 | 'pred_cal_fold' --> # Calibration predictions 725 | 'groundtruth_cal_fold' --> # Calibration groundtruth 726 | 'probas_cal_fold' --> # Calibration probabilities 727 | 'pred_proba_cal_fold' --> # Calibration predictions 728 | 729 | Args: 730 | X_proper_train (np.ndarray): Features for the 'proper training 731 | set' partition. 732 | X_cal (np.ndarray): Features for a single calibration set 733 | partition. 734 | y_proper_train (np.ndarray): Ground truths for the 'proper 735 | training set' partition. 736 | y_cal (np.ndarray): Ground truths for a single calibration set 737 | partition. 738 | fold_index: An index to identify the current fold (used for caching). 739 | 740 | Returns: 741 | dict: Fold results, structure as in the docstring above. 
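
    Example (a minimal sketch; `X_prop`, `X_cal`, `y_prop` and `y_cal` are
    assumed to come from an earlier split of the training data):

        >>> from sklearn.svm import LinearSVC
        >>> fold = train_calibration_ice(
        >>>     LinearSVC(), X_prop, X_cal, y_prop, y_cal, fold_index=0)
        >>> cal_scores = {'cred': fold['cred_p_val_cal']}

    The resulting calibration p-values can then be passed to
    `find_random_search_thresholds` as its `scores` argument.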
742 | 743 | """ 744 | # Train model with proper training 745 | clf.fit(X_proper_train, y_proper_train) 746 | 747 | # Get ncms for proper training fold 748 | logging.debug('Getting training ncms for fold {}...'.format(fold_index)) 749 | groundtruth_proper_train_fold = y_proper_train 750 | 751 | # Get ncms for calibration fold 752 | 753 | logging.debug('Getting calibration ncms for fold {}...'.format(fold_index)) 754 | pred_cal_fold = clf.predict(X_cal) 755 | groundtruth_cal_fold = y_cal 756 | 757 | # Compute p values for calibration fold 758 | 759 | logging.debug('Computing cal p values for fold {}...'.format(fold_index)) 760 | ncms_cal_fold = get_svm_ncms(clf.decision_function, X_cal, y_cal) 761 | # data.cache_data(ncms_cal_fold, saved_ncms_name) 762 | 763 | # saved_pvals_name = 'p_vals_{}_cal_fold_{}.p'.format(alg_name, fold_index) 764 | # saved_pvals_name = os.path.join(saved_data_folder, saved_pvals_name) 765 | # 766 | # if os.path.exists(saved_pvals_name): 767 | # p_val_cal_fold_dict = data.load_cached_data(saved_pvals_name) 768 | # else: 769 | # # TODO | Doublecheck implications of duplicating the reference 770 | # # TODO | point in the 'train_ncms' 771 | p_val_cal_fold_dict = compute_p_values_cred_and_conf( 772 | train_ncms=ncms_cal_fold, 773 | groundtruth_train=groundtruth_cal_fold, 774 | test_ncms=ncms_cal_fold, 775 | y_test=groundtruth_cal_fold) 776 | # data.cache_data(p_val_cal_fold_dict, saved_pvals_name) 777 | 778 | # Compute values for calibration probabilities 779 | # logging.debug('Computing cal probas for fold {}...'.format(fold_index)) 780 | # probas_cal_fold, pred_proba_cal_fold = get_svm_probs(clf, X_cal) 781 | 782 | return { 783 | # Calibration credibility p values 784 | 'cred_p_val_cal': p_val_cal_fold_dict['cred'], 785 | # Calibration confidence p values 786 | # 'conf_p_val_cal': p_val_cal_fold_dict['conf'], 787 | 'ncms_cal': ncms_cal_fold, # Calibration NCMs 788 | 'pred_cal': pred_cal_fold, # Calibration predictions 789 | 'groundtruth_cal': groundtruth_cal_fold, # Calibration groundtruth 790 | # 'probas_cal': probas_cal_fold, # Calibration probabilities 791 | # 'pred_proba_cal': pred_proba_cal_fold, # Calibration predictions 792 | } 793 | 794 | 795 | def train_calibration_ice_withmodel( 796 | X_proper_train, X_cal, 797 | y_proper_train, y_cal, alg_name, fold_index, saved_data_folder, model_name): 798 | """Train calibration set (for a single fold). 799 | 800 | Quite a bit of information is needed here for the later p-value 801 | computation and probability comparison. The returned dictionary has 802 | the following structure: 803 | 804 | 'cred_p_val_cal_fold' --> # Calibration credibility p values 805 | 'conf_p_val_cal_fold' --> # Calibration confidence p values 806 | 'ncms_cal_fold' --> # Calibration NCMs 807 | 'pred_cal_fold' --> # Calibration predictions 808 | 'groundtruth_cal_fold' --> # Calibration groundtruth 809 | 'probas_cal_fold' --> # Calibration probabilities 810 | 'pred_proba_cal_fold' --> # Calibration predictions 811 | 812 | Args: 813 | X_proper_train (np.ndarray): Features for the 'proper training 814 | set' partition. 815 | X_cal (np.ndarray): Features for a single calibration set 816 | partition. 817 | y_proper_train (np.ndarray): Ground truths for the 'proper 818 | training set' partition. 819 | y_cal (np.ndarray): Ground truths for a single calibration set 820 | partition. 821 | fold_index: An index to identify the current fold (used for caching). 822 | 823 | Returns: 824 | dict: Fold results, structure as in the docstring above. 
825 | 
826 |     """
827 |     # Load the previously trained model for this fold
828 |     model_name = os.path.join(saved_data_folder, model_name)
829 |     svm = load_cached_data(model_name)
830 | 
831 |     # Get ncms for calibration fold
832 |     logging.debug('Getting calibration ncms for fold {}...'.format(fold_index))
833 |     pred_cal_fold = svm.predict(X_cal)
834 |     groundtruth_cal_fold = y_cal
835 | 
836 |     # Compute p values for calibration fold
837 | 
838 |     logging.debug('Computing cal p values for fold {}...'.format(fold_index))
839 | 
840 |     ncms_cal_fold = get_svm_ncms(svm, X_cal, y_cal)
841 |     p_val_cal_fold_dict = compute_p_values_cred_and_conf(
842 |         train_ncms=ncms_cal_fold,
843 |         groundtruth_train=groundtruth_cal_fold,
844 |         test_ncms=ncms_cal_fold,
845 |         y_test=groundtruth_cal_fold)
846 | 
847 |     return {
848 |         # Calibration credibility p values
849 |         'cred_p_val_cal': p_val_cal_fold_dict['cred'],
850 |         # Calibration confidence p values
851 |         # 'conf_p_val_cal': p_val_cal_fold_dict['conf'],
852 |         'ncms_cal': ncms_cal_fold,  # Calibration NCMs
853 |         'pred_cal': pred_cal_fold,  # Calibration predictions
854 |         'groundtruth_cal': groundtruth_cal_fold,  # Calibration groundtruth
855 |         # 'probas_cal': probas_cal_fold,  # Calibration probabilities
856 |         # 'pred_proba_cal': pred_proba_cal_fold,  # Calibration predictions
857 |         'model': svm
858 |     }
859 | 
860 | 
861 | def test_with_rejection_keep_masks(
862 |         binary_thresholds, test_scores, groundtruth_labels, predicted_labels, full=True):
863 |     keep_mask = apply_threshold(
864 |         binary_thresholds=binary_thresholds,
865 |         test_scores=test_scores,
866 |         y_test=predicted_labels)
867 | 
868 |     results = get_performance_with_rejection(
869 |         y_true=groundtruth_labels,
870 |         y_pred=predicted_labels,
871 |         keep_mask=keep_mask,
872 |         full=full)
873 | 
874 |     return results, keep_mask
875 | 
876 | 
877 | def report_results(d, quiet=False):
878 |     """Produce a textual report based on the given results.
879 | 
880 |     Args:
881 |         d (dict): Results for baseline, kept, and rejected metrics.
882 |         quiet (bool): If True, suppress printing the results to stdout.
883 | 
884 |     Returns:
885 |         str: A textual report of the results.
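    Example (illustrative sketch, not part of the original module; assumes
    `thresholds` and `scores` were obtained earlier, e.g. from
    `find_random_search_thresholds_with_constraints_discrete`):

        >>> d = test_with_rejection(
        ...     thresholds, scores, groundtruth_labels, predicted_labels)
        >>> report = report_results(d, quiet=True)
        >>> print(report)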
886 | 
887 |     """
888 |     report_str = ''
889 | 
890 |     def print_and_extend(report_line):
891 |         nonlocal report_str
892 |         if not quiet:
893 |             print(report_line)
894 |         report_str += report_line + '\n'
895 | 
896 |     s = '% kept elements: {:.1f}, % rejected elements: {:.1f}'.format(
897 |         d['kept_total_perc'] * 100, d['reject_total_perc'] * 100)
898 |     print_and_extend(s)
899 | 
900 |     s = '% benign rejected elements: {:.1f}, % malware rejected elements: {:.1f}'.format(
901 |         d['reject_neg_total'] * 100, d['reject_pos_total'] * 100)
902 |     print_and_extend(s)
903 | 
904 |     s = '% benign kept: {:.1f}, % benign rejected: {:.1f}'.format(
905 |         d['kept_neg_perc'] * 100, d['reject_neg_perc'] * 100)
906 | 
907 |     print_and_extend(s)
908 | 
909 |     s = '% malware kept: {:.1f}, % malware rejected: {:.1f}'.format(
910 |         d['kept_pos_perc'] * 100, d['reject_pos_perc'] * 100)
911 | 
912 |     print_and_extend(s)
913 | 
914 |     s = ('F1 baseline: {:>12.2f} | '
915 |          'F1 keep: {:>12.2f} | '
916 |          'F1 reject: {:>12.2f}').format(
917 |         d['f1_b'], d['f1_k'], d['f1_r'])
918 | 
919 |     print_and_extend(s)
920 | 
921 |     s = ('Pr baseline: {:>12.2f} | '
922 |          'Pr keep: {:>12.2f} | '
923 |          'Pr reject: {:>12.2f}'.format(
924 |              d['precision_b'], d['precision_k'], d['precision_r']))
925 | 
926 |     print_and_extend(s)
927 | 
928 |     s = ('Rec baseline: {:>12.2f} | '
929 |          'Rec keep: {:>12.2f} | '
930 |          'Rec reject: {:>12.2f}'.format(
931 |              d['recall_b'], d['recall_k'], d['recall_r']))
932 | 
933 |     print_and_extend(s)
934 | 
935 |     s = ('TP baseline: {:>12.2f} | '
936 |          'TP keep: {:>12.2f} | '
937 |          'TP reject: {:>12.2f}'.format(d['tp_b'], d['tp_k'], d['tp_r']))
938 |     print_and_extend(s)
939 | 
940 |     s = ('FP baseline: {:>12.2f} | '
941 |          'FP keep: {:>12.2f} | '
942 |          'FP reject: {:>12.2f}'.format(d['fp_b'], d['fp_k'], d['fp_r']))
943 |     print_and_extend(s)
944 | 
945 |     s = ('TN baseline: {:>12.2f} | '
946 |          'TN keep: {:>12.2f} | '
947 |          'TN reject: {:>12.2f}'.format(d['tn_b'], d['tn_k'], d['tn_r']))
948 |     print_and_extend(s)
949 | 
950 |     s = ('FN baseline: {:>12.2f} | '
951 |          'FN keep: {:>12.2f} | '
952 |          'FN reject: {:>12.2f}'.format(d['fn_b'], d['fn_k'], d['fn_r']))
953 |     print_and_extend(s)
954 | 
955 |     s = ('TPR baseline: {:>12.2f} | '
956 |          'TPR keep: {:>12.2f} | '
957 |          'TPR reject: {:>12.2f}'.format(d['tpr_b'], d['tpr_k'], d['tpr_r']))
958 |     print_and_extend(s)
959 | 
960 |     s = ('FPR baseline: {:>12.2f} | '
961 |          'FPR keep: {:>12.2f} | '
962 |          'FPR reject: {:>12.2f}'.format(d['fpr_b'], d['fpr_k'], d['fpr_r']))
963 |     print_and_extend(s)
964 | 
965 |     return report_str
966 | 
967 | 
968 | def find_random_search_thresholds_with_constraints(
969 |         scores, predicted_labels, groundtruth_labels, maximise_vals,
970 |         constraint_vals, max_samples=100, quiet=False, ncpu=-1):
971 |     """Perform a random grid search to find the best thresholds on `scores` in
972 |     parallel.
973 | 
974 |     This method wraps `find_random_search_thresholds_with_constraints_discrete`
975 |     and parallelizes it. For a full description of the search itself, see the
976 |     documentation of the aforementioned method.
977 | 
978 |     See Also:
979 |         - `find_random_search_thresholds_with_constraints_discrete`
980 | 
981 |     Args:
982 |         scores (dict): The test scores on which to perform the random search.
983 |         predicted_labels (np.ndarray): The set of predictions to decide which
984 |             'per-class' threshold to use.
985 |         groundtruth_labels (np.ndarray): The groundtruth label for each object.
986 |         maximise_vals: The metrics that should be maximised.
987 |         constraint_vals: The metrics that are constrained.
988 |         max_samples (int): The maximum number of random threshold combinations
989 |             to try before settling for the best performance up to that point.
990 |         quiet (bool): If True, logging will be disabled.
991 |         ncpu (int): Number of CPUs to use. If negative, it is computed as
992 |             total_cpus + ncpu; if ncpu == 1 the search is not parallelized
993 |             (useful to avoid problems with nested parallelization).
994 | 
995 |     Returns:
996 |         dict: Set of thresholds for malware ('mw') and goodware ('gw') classes.
997 | 
998 |     """
999 | 
1000 |     ncpu = mp.cpu_count() + ncpu if ncpu < 0 else ncpu
1001 | 
1002 |     if ncpu == 1:
1003 |         results, thresholds = find_random_search_thresholds_with_constraints_discrete(
1004 |             scores, predicted_labels, groundtruth_labels, maximise_vals,
1005 |             constraint_vals, max_samples, quiet)
1006 | 
1007 |         return thresholds
1008 | 
1009 |     samples = [max_samples // ncpu for _ in range(ncpu)]
1010 | 
1011 |     with mp.Pool(processes=ncpu) as pool:
1012 |         results = pool.starmap(find_random_search_thresholds_with_constraints_discrete,
1013 |                                zip(repeat(scores), repeat(predicted_labels), repeat(groundtruth_labels),
1014 |                                    repeat(maximise_vals), repeat(constraint_vals), samples, repeat(quiet)))
1015 | 
1016 |     results_list = [res[0] for res in results]
1017 |     thresholds_list = [res[1] for res in results]
1018 | 
1019 |     def resolve_keyvals(s):
1020 |         if isinstance(s, str):
1021 |             pairs = s.split(',')
1022 |             pairs = [x.split(':') for x in pairs]
1023 |             return {k: float(v) for k, v in pairs}
1024 |         return s
1025 | 
1026 |     maximise_vals = resolve_keyvals(maximise_vals)
1027 |     constraint_vals = resolve_keyvals(constraint_vals)
1028 | 
1029 |     best_maximised = {k: 0 for k in maximise_vals}
1030 |     best_constrained = {k: 0 for k in constraint_vals}
1031 |     best_thresholds, best_result = {}, {}
1032 | 
1033 |     for result, thresholds in zip(results_list, thresholds_list):
1034 |         if any([result[k] > best_maximised[k] for k in maximise_vals]):
1035 |             best_maximised = {k: result[k] for k in maximise_vals}
1036 |             best_constrained = {k: result[k] for k in constraint_vals}
1037 |             best_thresholds = thresholds
1038 |             best_result = result
1039 | 
1040 |             if not quiet:
1041 |                 logging.info('New best: {} {} @ {} '.format(
1042 |                     format_opts(maximise_vals.keys(), result),
1043 |                     format_opts(constraint_vals.keys(), result),
1044 |                     best_thresholds))
1045 |                 report_results(best_result)
1046 | 
1047 |             continue
1048 | 
1049 |         if all([result[k] == best_maximised[k] for k in maximise_vals]):
1050 |             if all([result[k] >= best_constrained[k] for k in constraint_vals]):
1051 |                 best_maximised = {k: result[k] for k in maximise_vals}
1052 |                 best_constrained = {k: result[k] for k in constraint_vals}
1053 |                 best_thresholds = thresholds
1054 |                 best_result = result
1055 | 
1056 |                 if not quiet:
1057 |                     logging.info('New best: {} {} @ {} '.format(
1058 |                         format_opts(maximise_vals.keys(), result),
1059 |                         format_opts(constraint_vals.keys(), result),
1060 |                         best_thresholds))
1061 |                     report_results(best_result)
1062 | 
1063 |                 continue
1064 |     print(best_thresholds)
1065 |     return best_thresholds
1066 | 
1067 | 
1068 | def find_random_search_thresholds_with_constraints_discrete(
1069 |         scores, predicted_labels, groundtruth_labels, maximise_vals,
1070 |         constraint_vals, max_samples=100, quiet=False, stop_condition=3000):
1071 |     """Perform a random grid search to find the best thresholds on `scores`.
1072 | 
1073 |     `scores` expects a dictionary keyed by 'cred' and/or 'conf',
1074 |     with sub-dictionaries containing the scores for the mw and gw classes.
1075 | 
1076 |     Note that the keys of `scores` determine _which_ thresholding criteria will
1077 |     be enforced. That is, if only a 'cred' dictionary is supplied, thresholding
1078 |     will be enforced on cred-only and the same for 'conf'. Supplying cred and
1079 |     conf dictionaries will enforce the 'cred+conf' thresholding criteria (all
1080 |     thresholds will be applied).
1081 | 
1082 |     `maximise_vals` describes the metrics that should be maximised and their
1083 |     minimum acceptable values. It expects either a dictionary of metrics, or a
1084 |     string of comma-separated key:value pairs.
1085 | 
1086 |     `constraint_vals` describes the floors for metrics that a threshold must
1087 |     pass in order to be acceptable. The algorithm will also try to maximise
1088 |     these metrics if possible, although never at the expense of `maximise_vals`.
1089 | 
1090 |     Both `maximise_vals` and `constraint_vals` expect a dictionary of metrics
1091 |     and minimum acceptable values. Alternatively, arguments can be given in
1092 |     string form as comma-separated key:value pairs, for example,
1093 |     'key1:value1,key2:value2,key3:value3'.
1094 | 
1095 |     Concretely, any of the following are acceptable:
1096 | 
1097 |         > maximise_vals = {'f1_k': 0.95}
1098 |         > maximise_vals = 'f1_k:0.95'
1099 | 
1100 |         > constraint_vals = {'kept_pos_perc': 0.76, 'kept_neg_perc': 0.76}
1101 |         > constraint_vals = 'kept_pos_perc:0.76,kept_neg_perc:0.76'
1102 | 
1103 |     For a list of possible metrics, see the keys in the dict produced by
1104 |     `get_performance_with_rejection()`. Note that the default objective
1105 |     function assumes that the provided metrics are in the interval [0,1].
1106 | 
1107 |     See Also:
1108 |         - `get_performance_with_rejection`
1109 | 
1110 |     Args:
1111 |         scores (dict): The test scores on which to perform the random search.
1112 |         predicted_labels (np.ndarray): The set of predictions to decide which
1113 |             'per-class' threshold to use.
1114 |         groundtruth_labels (np.ndarray): The groundtruth label for each object.
1115 |         maximise_vals: The metrics that should be maximised.
1116 |         constraint_vals: The metrics that are constrained.
1117 |         max_samples (int): The maximum number of random threshold combinations
1118 |             to try before settling for the best performance up to that point.
1119 |         quiet (bool): If True, logging will be disabled.
1120 |         stop_condition (int): Number of consecutive unproductive samples after which the search stops early.
1121 |     Returns:
1122 |         tuple: The best result dict and the best thresholds dict for the malware ('mw') and goodware ('gw') classes.
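    Example (illustrative sketch, not part of the original module; the
    calibration arrays `cred_scores`, `pred_cal` and `y_cal` are assumed to
    come from an earlier calibration step such as `train_calibration_ice`):

        >>> best_result, best_thresholds = (
        ...     find_random_search_thresholds_with_constraints_discrete(
        ...         scores={'cred': cred_scores},
        ...         predicted_labels=pred_cal,
        ...         groundtruth_labels=y_cal,
        ...         maximise_vals='f1_k:0.95',
        ...         constraint_vals='kept_pos_perc:0.76,kept_neg_perc:0.76',
        ...         max_samples=100))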
1123 | 1124 | """ 1125 | 1126 | # as this method is called from multiprocessing, we want to make sure each 1127 | # process has a different seed 1128 | seed = 0 1129 | for l in os.urandom(10): seed += l 1130 | np.random.seed(seed) 1131 | 1132 | def resolve_keyvals(s): 1133 | if isinstance(s, str): 1134 | pairs = s.split(',') 1135 | pairs = [x.split(':') for x in pairs] 1136 | return {k: float(v) for k, v in pairs} 1137 | return s 1138 | 1139 | maximise_vals = resolve_keyvals(maximise_vals) 1140 | constraint_vals = resolve_keyvals(constraint_vals) 1141 | 1142 | best_maximised = {k: 0 for k in maximise_vals} 1143 | best_constrained = {k: 0 for k in constraint_vals} 1144 | best_thresholds, best_result = {}, {} 1145 | 1146 | logging.info('Searching for threshold on calibration data...') 1147 | 1148 | stop_counter = 0 1149 | 1150 | for _ in tqdm(range(max_samples)): 1151 | # Choose and package random thresholds 1152 | thresholds = {} 1153 | if 'cred' in scores: 1154 | cred_thresholds = random_threshold(scores['cred'], predicted_labels) 1155 | thresholds['cred'] = cred_thresholds 1156 | if 'conf' in scores: 1157 | conf_thresholds = random_threshold(scores['conf'], predicted_labels) 1158 | thresholds['conf'] = conf_thresholds 1159 | 1160 | # Test with chosen thresholds 1161 | result = test_with_rejection( 1162 | thresholds, scores, groundtruth_labels, predicted_labels) 1163 | 1164 | # Check if any results exceed given constraints (e.g. too many rejects) 1165 | if any([result[k] < constraint_vals[k] for k in constraint_vals]): 1166 | if stop_counter > stop_condition: 1167 | logging.info('Exceeded stop condition, terminating calibration search...') 1168 | break 1169 | 1170 | stop_counter += 1 1171 | continue 1172 | 1173 | if any([result[k] < best_maximised[k] for k in maximise_vals]): 1174 | if stop_counter > stop_condition: 1175 | logging.info('Exceeded stop condition, terminating calibration search...') 1176 | break 1177 | 1178 | stop_counter += 1 1179 | continue 1180 | 1181 | if any([result[k] > best_maximised[k] for k in maximise_vals]): 1182 | best_maximised = {k: result[k] for k in maximise_vals} 1183 | best_constrained = {k: result[k] for k in constraint_vals} 1184 | best_thresholds = thresholds 1185 | best_result = result 1186 | 1187 | if not quiet: 1188 | logging.info('New best: {} {} @ {} '.format( 1189 | format_opts(maximise_vals.keys(), result), 1190 | format_opts(constraint_vals.keys(), result), 1191 | best_thresholds)) 1192 | report_results(best_result) 1193 | 1194 | stop_counter = 0 1195 | continue 1196 | 1197 | if all([result[k] == best_maximised[k] for k in maximise_vals]): 1198 | if all([result[k] >= best_constrained[k] for k in constraint_vals]): 1199 | best_maximised = {k: result[k] for k in maximise_vals} 1200 | best_constrained = {k: result[k] for k in constraint_vals} 1201 | best_thresholds = thresholds 1202 | best_result = result 1203 | 1204 | if not quiet: 1205 | logging.info('New best: {} {} @ {} '.format( 1206 | format_opts(maximise_vals.keys(), result), 1207 | format_opts(constraint_vals.keys(), result), 1208 | best_thresholds)) 1209 | report_results(best_result) 1210 | 1211 | stop_counter = 0 1212 | continue 1213 | 1214 | if not bool(best_result): 1215 | best_result = result 1216 | 1217 | return (best_result, best_thresholds) 1218 | 1219 | 1220 | def get_svm_probs(clf, X_in): 1221 | """Get scores and predictions for comparison with probabilities. 
1222 | 
1223 |     Note that this function returns the predictions _and_ probabilities given
1224 |     by the classifier and that these predictions may differ from other
1225 |     outputs of the same classifier (such as `predict` or `decision_function`).
1226 |     This is due to Platt's scaling (and its implementation in scikit-learn) in
1227 |     which a 5-fold SVM is trained and used to score the observation
1228 |     (`predict_proba()` is actually the average of these 5 classifiers).
1229 | 
1230 |     The takeaway is to be sure that you're always using probability scores with
1231 |     probability predictions and not with the output of other SVC functions.
1232 | 
1233 |     Args:
1234 |         clf (sklearn.svm.SVC): The classifier to use for the probabilities.
1235 |         X_in (np.ndarray): An array of feature vectors to classify.
1236 | 
1237 |     Returns:
1238 |         (list, list): (Probability scores, probability labels) for `X_in`.
1239 | 
1240 |     """
1241 |     assert hasattr(clf, 'predict_proba')
1242 |     probability_results = clf.predict_proba(X_in)
1243 |     probas_cal_fold = [np.max(t) for t in probability_results]
1244 |     pred_proba_cal_fold = [np.argmax(t) for t in probability_results]
1245 |     return probas_cal_fold, pred_proba_cal_fold
1246 | 
--------------------------------------------------------------------------------