├── data ├── raw │ └── .gitkeep └── processed │ └── .gitkeep ├── test ├── __init__.py ├── test_viz.py ├── test_mock.py ├── test_rebalancers.py ├── test_temporal.py ├── test_evaluation.py ├── test_selectors.py └── test_rejectors.py ├── examples ├── __init__.py ├── rebalance.py ├── timeline-evaluation.py ├── active-learning.py ├── decay-plot.py ├── feature-reduct.py ├── reject.py ├── parallel-predict.py ├── constraints.py └── tesseract-plots.py ├── tesseract ├── __init__.py ├── rebalancing.py ├── mock.py ├── loader.py ├── utils.py ├── selection.py ├── temporal.py ├── rejection.py ├── plot_utils.py ├── viz.py ├── spatial.py ├── metrics.py ├── evaluation.py └── transcendent.py ├── .gitignore ├── DATASET-USESec19.md ├── setup.py ├── LICENSE ├── Makefile └── README.md /data/raw/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tesseract/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *$py.class 5 | *DS_Store 6 | Tesseract.egg-info/* 7 | # Pycharm information 8 | .idea/* 9 | /features/ 10 | build/* 11 | dist/*egg* 12 | 13 | -------------------------------------------------------------------------------- /examples/rebalance.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | 3 | from tesseract import temporal, mock, evaluation, metrics 4 | from tesseract.rebalancing import PositiveRateRebalancer 5 | 6 | 7 | def main(): 8 | X, y, t = mock.generate_binary_test_data(10000, '2000') 9 | 10 | splits = temporal.time_aware_train_test_split( 11 | X, y, t, train_size=6, test_size=1, granularity='month') 12 | 13 | clf = RandomForestClassifier() 14 | 15 | pr_rebalancer = PositiveRateRebalancer(0.7, max_pos_rate=0.8, schedule='first') 16 | results = evaluation.fit_predict_update( 17 | clf, *splits, rebalancers=[pr_rebalancer]) 18 | 19 | metrics.print_metrics(results) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /DATASET-USESec19.md: -------------------------------------------------------------------------------- 1 | # USENIX Security 2019 Dataset 2 | 3 | We provide the links to download the dataset used in the Tesseract paper [1]: 4 | 5 | - [AndroZoo apps hashes](https://www.dropbox.com/s/pw83zohwjk1yden/hashes.txt.gz)* 6 | - [Drebin feature space](https://www.dropbox.com/s/i7q8ysi5agi6n0f/drebin-features.tar.gz) 7 | - [MaMaDroid feature space](https://www.dropbox.com/s/wl23fjvjtj2ncsg/mamadroid-features.tar.gz) 8 | 9 | *The original 
Android apks can be downloaded through the [AndroZoo Official API](https://androzoo.uni.lu/api_doc). 10 | 11 | [1] Feargus Pendlebury*, Fabio Pierazzi*, Roberto Jordaney, Johannes Kinder, 12 | Lorenzo Cavallaro. "TESSERACT: Eliminating Experimental Bias in Malware Classification 13 | across Space and Time". USENIX Security Symposium, 2019. -------------------------------------------------------------------------------- /test/test_viz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_viz.py 5 | ~~~~~~~~~~~ 6 | 7 | Unit tests for testing viz.py. 8 | 9 | """ 10 | import unittest 11 | 12 | from tesseract import mock, metrics 13 | 14 | 15 | class TestViz(unittest.TestCase): 16 | def setUp(self): 17 | self.X, self.y, self.t = mock.generate_binary_test_data(10000, '2012') 18 | 19 | # def test_plot_by_time(self): 20 | # viz.plot_by_time(self.y, self.t, 'day', 'line') 21 | # viz.plot_by_time(self.y, self.t, 'week', 'line') 22 | # viz.plot_by_time(self.y, self.t, 'month', 'line') 23 | # viz.plot_by_time(self.y, self.t, 'month', 'bar') 24 | # viz.plot_by_time(self.y, self.t, 'quarter', 'bar') 25 | 26 | def test_summarize(self): 27 | metrics.summarize(self.y) 28 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | _dependencies = [ 4 | 'cycler==0.10.0', 5 | 'kiwisolver==1.0.1', 6 | 'matplotlib==3.5.2', 7 | 'numpy==1.22.4', 8 | 'pandas==1.4.2', 9 | 'pyparsing==3.0.9', 10 | 'python-dateutil==2.8.1', 11 | 'pytz==2022.1', 12 | 'scikit-learn>=1.1.1,<2.0.0', 13 | 'scipy==1.8.1', 14 | 'seaborn==0.9.0', 15 | 'six==1.11.0', 16 | 'tqdm==4.25.0'] 17 | 18 | setup( 19 | name='Tesseract', 20 | version='0.9', 21 | description='Tesseract: A library for performing ' 22 | 'time-aware classifications.', 23 | maintainer='Feargus Pendlebury', 24 | maintainer_email='Feargus.Pendlebury[at]rhul.ac.uk', 25 | url='', 26 | packages=['tesseract'], 27 | setup_requires=_dependencies, 28 | install_requires=_dependencies 29 | ) 30 | -------------------------------------------------------------------------------- /examples/timeline-evaluation.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVC 2 | 3 | from tesseract import evaluation, temporal, metrics, mock, viz 4 | 5 | 6 | def main(): 7 | # Generate dummy predictors, labels and timestamps from Gaussians 8 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 9 | 10 | # Partition dataset 11 | splits = temporal.time_aware_train_test_split( 12 | X, y, t, train_size=12, test_size=1, granularity='month') 13 | 14 | # Perform a timeline evaluation 15 | clf = LinearSVC() 16 | results = evaluation.fit_predict_update(clf, *splits) 17 | 18 | # View results 19 | metrics.print_metrics(results) 20 | 21 | # View AUT(F1, 24 months) as a measure of robustness over time 22 | print(metrics.aut(results, 'f1')) 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /examples/active-learning.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVC 2 | 3 | from tesseract import temporal, mock, evaluation, metrics 4 | from tesseract.selection import UncertaintySamplingSelector 5 | 6 | 7 | def main(): 8 | X, y, t = 
mock.generate_binary_test_data(10000, '2000') 9 | 10 | splits = temporal.time_aware_train_test_split( 11 | X, y, t, train_size=6, test_size=1, granularity='month') 12 | 13 | clf = LinearSVC() 14 | 15 | selector = UncertaintySamplingSelector('20%') 16 | results = evaluation.fit_predict_update(clf, *splits, selectors=[selector]) 17 | 18 | metrics.print_metrics(results) 19 | 20 | print('Number of test objects selected each period:') 21 | print(results['selected']) 22 | 23 | print('Array indices for selected objects from first test period:') 24 | print(selector.selection_history[0]) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /examples/decay-plot.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVC 2 | 3 | from tesseract import evaluation, temporal, metrics, mock, viz 4 | import os 5 | 6 | def main(): 7 | os.environ["PATH"] += os.pathsep + '/Library/TeX/texbin' 8 | 9 | # Generate dummy predictors, labels and timestamps from Gaussians 10 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 11 | 12 | # Partition dataset 13 | splits = temporal.time_aware_train_test_split( 14 | X, y, t, train_size=12, test_size=1, granularity='month') 15 | 16 | # Perform a timeline evaluation 17 | clf = LinearSVC() 18 | results = evaluation.fit_predict_update(clf, *splits) 19 | 20 | # View results 21 | metrics.print_metrics(results) 22 | 23 | # View AUT(F1, 24 months) as a measure of robustness over time 24 | print(metrics.aut_with_granularity(results, 'week', 'f1')) 25 | 26 | plt = viz.plot_decay(results) 27 | plt.show() 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/feature-reduct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from sklearn.svm import LinearSVC 5 | 6 | from tesseract import loader, temporal 7 | 8 | 9 | def main(): 10 | 11 | data_dir = 'DATA DIRECTORY GOES HERE' 12 | 13 | # Load features 14 | X, y, t, _ = loader.load_features(os.path.join(data_dir, 'raw', 'extended-features', 'extended-features')) 15 | 16 | # Split into training and testing sets 17 | X_train_full, X_tests_full, y_train, y_tests, t_train, t_tests = \ 18 | temporal.time_aware_train_test_split(X, y, t, train_size=12, test_size=1, granularity='month') 19 | 20 | # SelectKBest feature selection for a classifier 21 | clf = LinearSVC(dual="auto", max_iter=50000) 22 | clf.fit(X_train_full, y_train) 23 | 24 | select_index = loader.feature_reduce(clf=clf, dim=10000) 25 | 26 | with open('reduced-Indexes-10000.json', 'w') as fp: 27 | json.dump(select_index, fp, default=lambda x: int(x)) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/reject.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | 3 | from tesseract import temporal, mock, evaluation, metrics 4 | from tesseract.rejection import ThresholdRejector 5 | 6 | 7 | def main(): 8 | X, y, t = mock.generate_binary_test_data(10000, '2000') 9 | 10 | splits = temporal.time_aware_train_test_split( 11 | X, y, t, train_size=6, test_size=1, granularity='month') 12 | 13 | clf = RandomForestClassifier() 14 | 15 | rejector = ThresholdRejector('<', 0.9) 16 | 
results = evaluation.fit_predict_update(clf, *splits, rejectors=[rejector]) 17 | 18 | metrics.print_metrics(results) 19 | 20 | print('Number of rejected predictions each period:') 21 | print(results['rejected']) 22 | 23 | print('Array indices for rejected objects from first test period:') 24 | print(rejector.rejection_history[0]) 25 | 26 | print('Array indices for kept objects from first test period:') 27 | print(rejector.kept_history[0]) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/parallel-predict.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVC 2 | 3 | from tesseract import evaluation, temporal, metrics, mock 4 | 5 | 6 | def main(): 7 | # Generate dummy predictors, labels and timestamps from Gaussians 8 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 9 | 10 | # Partition dataset 11 | splits = temporal.time_aware_train_test_split( 12 | X, y, t, train_size=12, test_size=1, granularity='month') 13 | 14 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 15 | 16 | # Perform a timeline evaluation 17 | clf = SVC(kernel='linear', probability=True) 18 | clf.fit(X_train, y_train) 19 | 20 | y_preds = evaluation.predict(clf, X_tests, nproc=4) 21 | results = metrics.calculate_metrics(y_tests, y_preds, periods=-1) 22 | 23 | # View results 24 | metrics.print_metrics(results) 25 | 26 | # View AUT(F1, 24 months) as a measure of robustness over time 27 | print(metrics.aut(results, 'f1')) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /examples/constraints.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVC 2 | 3 | from tesseract import temporal, metrics, mock, spatial, evaluation 4 | 5 | 6 | # TODO | Note that constraint checks are not currently integrated into the 7 | # TODO | evaluation cycle fit_predict_update, so need to be checked manually 8 | 9 | def main(): 10 | # Generate dummy predictors, labels and timestamps from Gaussians 11 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 12 | 13 | # Partition dataset 14 | splits = temporal.time_aware_train_test_split( 15 | X, y, t, train_size=12, test_size=1, granularity='month') 16 | 17 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 18 | 19 | for y_test, t_test in zip(y_tests, t_tests): 20 | temporal.assert_positive_negative_temporal_consistency(y_test, t_test) 21 | temporal.assert_train_test_temporal_consistency(t_train, t_test) 22 | spatial.assert_class_distribution(y, 0.5, 0.1) 23 | 24 | # Perform a timeline evaluation 25 | clf = SVC(kernel='linear', probability=True) 26 | results = evaluation.fit_predict_update(clf, *splits) 27 | 28 | # View results 29 | metrics.print_metrics(results) 30 | 31 | # View AUT(F1, 24 months) as a measure of robustness over time 32 | print(metrics.aut(results, 'f1')) 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /tesseract/rebalancing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | rebalancing.py 5 | ~~~~~~~~~~~~~~ 6 | 7 | # TODO | Add module description 8 | 9 | """ 10 | 11 | import numpy as np 12 | 13 | from tesseract import spatial 14 | from tesseract.evaluation 
import Stage 15 | 16 | 17 | class Rebalancer(Stage): 18 | def alter_wrapper(self, clf, X_train, y_train, t_train, X_test, 19 | y_test, t_test): 20 | # Pass parameters straight through to rebalance implementation 21 | rebalanced = self.alter(clf, X_train, y_train, t_train, 22 | X_test, y_test, t_test) 23 | 24 | return np.array(rebalanced) 25 | 26 | def alter(self, clf, X_train, y_train, t_train, X_test, y_test, t_test): 27 | raise NotImplementedError('Rebalancer must be subclassed') 28 | 29 | 30 | class PositiveRateRebalancer(Rebalancer): 31 | def __init__(self, min_pos_rate, max_pos_rate=None, noise_deviation=0.0, 32 | fixed_size=False, schedule=1): 33 | super().__init__(schedule=schedule) 34 | self.min_pos_rate = min_pos_rate 35 | self.max_pos_rate = max_pos_rate 36 | self.noise_deviation = noise_deviation 37 | self.fixed_size = fixed_size 38 | 39 | def alter(self, clf, X_train, y_train, t_train, X_test, y_test, t_test): 40 | return spatial.downsample_set( 41 | X_train, y_train, t_train, self.min_pos_rate, 42 | self.max_pos_rate, self.noise_deviation, self.fixed_size) 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Royal Holloway, University of London. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | -------------------------------------------------------------------------------- /test/test_mock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_mock.py 5 | ~~~~~~~ 6 | 7 | Unit tests for testing mock.py. 
8 | 9 | """ 10 | import unittest 11 | 12 | from tesseract import mock 13 | 14 | 15 | class TestMock(unittest.TestCase): 16 | def test_generate_binary_test_data(self): 17 | X, y, t = mock.generate_binary_test_data(10000, '2016') 18 | self.assertEqual(len(X), len(y)) 19 | self.assertEqual(len(y), len(t)) 20 | 21 | def test_generate_time_data(self): 22 | expected = ['2012-09-22', '2012-11-26', '2012-09-11', '2012-07-15'] 23 | dates = mock.generate_time_data(4, '2012', random_state=22) 24 | actual = [d.strftime('%Y-%m-%d') for d in dates] 25 | self.assertEqual(expected, actual) 26 | 27 | expected = ['2012-11-07', '2011-12-20', '2015-11-08', '2011-09-13'] 28 | dates = mock.generate_time_data(4, '2010', '2016', random_state=22) 29 | actual = [d.strftime('%Y-%m-%d') for d in dates] 30 | self.assertEqual(expected, actual) 31 | 32 | years = (2010, 2011, 2012, 2013, 2014) 33 | dates = mock.generate_time_data(10000, '2010', '2014-12-31') 34 | for date in dates: 35 | self.assertIn(date.year, years) 36 | 37 | expected = {2010}, {1}, {1} 38 | dates = mock.generate_time_data(10000, '2010-01-01', '2010-01-02') 39 | actual = (set(d.year for d in dates), 40 | set(d.month for d in dates), 41 | set(d.day for d in dates)) 42 | self.assertEqual(expected, actual) 43 | -------------------------------------------------------------------------------- /test/test_rebalancers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_rebalancers.py 5 | ~~~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for testing rebalancers.py. 8 | 9 | """ 10 | 11 | import unittest 12 | 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.svm import SVC 15 | 16 | from tesseract import temporal, mock, metrics, evaluation 17 | from tesseract.rebalancing import PositiveRateRebalancer 18 | 19 | 20 | class TestRebalancers(unittest.TestCase): 21 | def setUp(self): 22 | # Test partitions of 1 year 23 | X, y, t = mock.generate_binary_test_data(10000, '2020') 24 | 25 | splits = temporal.time_aware_train_test_split( 26 | X, y, t, train_size=6, test_size=2, granularity='month') 27 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 28 | 29 | self.X_train = X_train 30 | self.y_train = y_train 31 | self.X_tests = X_tests 32 | self.y_tests = y_tests 33 | self.t_train = t_train 34 | self.t_tests = t_tests 35 | 36 | self.svm = SVC(kernel='linear', probability=False) 37 | self.svm.fit(X_train, y_train) 38 | 39 | self.rf = RandomForestClassifier(n_estimators=101, max_depth=64) 40 | self.rf.fit(X_train, y_train) 41 | 42 | def test_positive_rate_rebalancer(self): 43 | for clf in (self.svm, self.rf): 44 | pr_rebalancer = PositiveRateRebalancer(0.5) 45 | results = evaluation.fit_predict_update( 46 | clf, self.X_train, self.X_tests, 47 | self.y_train, self.y_tests, 48 | self.t_train, self.t_tests, 49 | rebalancers=[pr_rebalancer]) 50 | 51 | metrics.print_metrics(results) 52 | -------------------------------------------------------------------------------- /tesseract/mock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | mock.py 5 | ~~~~~~~ 6 | 7 | A module for generating test distributions for use with Tesseract. 
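A typical call, mirroring the usage throughout the examples/ and test/
modules (an illustrative sketch rather than a strict doctest):

    >>> X, y, t = generate_binary_test_data(10000, '2014', '2016')
    >>> len(X) == len(y) == len(t)
    True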
8 | 9 | """ 10 | 11 | from datetime import datetime 12 | 13 | import numpy as np 14 | from dateutil.relativedelta import relativedelta 15 | from sklearn.datasets import make_classification 16 | 17 | from tesseract.utils import resolve_date 18 | 19 | 20 | def generate_binary_test_data(n_samples, start, end=None, random_state=None): 21 | """Generate a test dataset suitable for binary classification. 22 | 23 | Args: 24 | n_samples (int): The number of examples to create between start and end. 25 | start (str): The start date of the range to generate examples within. 26 | end (str): The end date of the range to generate examples within. 27 | random_state (int): A random number seed. 28 | 29 | Returns: 30 | np.ndarray: Array of two-dimensional predictors X. 31 | np.ndarray: Array of output variables y. 32 | np.ndarray: Array of datetimes for each example. 33 | 34 | """ 35 | X, y = make_classification(n_samples, n_features=2, n_informative=2, n_redundant=0, class_sep=1.5, 36 | random_state=random_state) 37 | t = generate_time_data(n_samples, start, end) 38 | return X, y, t 39 | 40 | 41 | def generate_time_data(n_samples, start, end=None, random_state=None): 42 | """Randomly sample from the given date range. 43 | 44 | Args: 45 | n_samples (int): The number of dates to create between start and end. 46 | start (str): The start date of the range to sample from. 47 | end (str): The end date of the range to sample from. 48 | random_state (int): A random number seed. 49 | 50 | Returns: 51 | np.ndarray: Array of datetimes sampled within the given range. 52 | 53 | """ 54 | start = resolve_date(start) 55 | end = resolve_date(end) if end else datetime(start.year, 12, 31) 56 | 57 | np.random.seed(random_state) 58 | delta = int((end - start).total_seconds()) 59 | offsets = [np.random.randint(delta) for _ in range(n_samples)] 60 | return np.array([start + relativedelta(seconds=x) for x in offsets]) 61 | -------------------------------------------------------------------------------- /examples/tesseract-plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datetime 4 | import numpy as np 5 | from sklearn.svm import LinearSVC 6 | from sklearn.feature_extraction import DictVectorizer 7 | from tesseract import evaluation, temporal, metrics, mock, viz, loader 8 | 9 | os.environ["PATH"] += os.pathsep + '/Library/TeX/texbin' 10 | 11 | ## Loading features 12 | 13 | def load_dataset(dataset_path): 14 | print(f'Loading dataset from {dataset_path}') 15 | 16 | with open('{}-X-updated-reduced-10k.json'.format(dataset_path), 'r') as f: 17 | X = json.load(f) 18 | 19 | print('Loading labels...') 20 | with open('{}-y-updated.json'.format(dataset_path), 'rt') as f: 21 | y = json.load(f) 22 | 23 | print('Loading timestamps...') 24 | with open('{}-meta-updated.json'.format(dataset_path), 'rt') as f: 25 | T = json.load(f) 26 | T = [o['dex_date'] for o in T] 27 | T = np.array([datetime.datetime.strptime(o, '%Y-%m-%dT%H:%M:%S') if "T" in o 28 | else datetime.datetime.strptime(o, '%Y-%m-%d %H:%M:%S') for o in T]) 29 | 30 | # Convert to numpy array and get feature names 31 | vec = DictVectorizer() 32 | X = vec.fit_transform(X).astype("float32") 33 | y = np.asarray(y) 34 | feature_names = vec.get_feature_names_out() 35 | 36 | # Get time index of each sample for easy reference 37 | time_index = {} 38 | for i in range(len(T)): 39 | t = T[i] 40 | if t.year not in time_index: 41 | time_index[t.year] = {} 42 | if t.month not in time_index[t.year]: 43 | 
time_index[t.year][t.month] = [] 44 | time_index[t.year][t.month].append(i) 45 | 46 | return X, y, time_index, feature_names, T 47 | 48 | X, y, time_index, feature_names, T = load_dataset('../extended-features/extended-features') 49 | 50 | # Partition dataset 51 | splits = temporal.time_aware_train_test_split( 52 | X, y, T, train_size=12, test_size=1, granularity='month') 53 | 54 | # Perform a timeline evaluation 55 | clf = LinearSVC(C=1) 56 | results = evaluation.fit_predict_update(clf, *splits) 57 | 58 | 59 | # ################ 60 | # View Results 61 | # ################ 62 | from pylab import * 63 | 64 | pendleblue='#1f8fff' 65 | pendleyellow='#ffa600' 66 | 67 | # '#FF9999', '#FFDD99', '#AAEEEE' 68 | plot(results['precision'], marker='o', color=pendleyellow) 69 | plot(results['recall'], marker='o', color='red') 70 | plot(results['f1'], marker='o', color=pendleblue) 71 | legend(['Precision', 'Recall', 'F1']) 72 | xlim([0,23]) 73 | xlabel('Testing period (month)') 74 | ylabel('Performance') 75 | grid(axis = 'y') 76 | show() -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean data 2 | 3 | ################################################################################# 4 | # GLOBALS # 5 | ################################################################################# 6 | 7 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 8 | PROJECT_NAME = tesseract 9 | PYTHON_INTERPRETER = python3 10 | 11 | ################################################################################# 12 | # COMMANDS # 13 | ################################################################################# 14 | 15 | ## Make Dataset 16 | data: 17 | wget -O ./data/raw/drebin-features.tar.gz https://www.dropbox.com/s/i7q8ysi5agi6n0f/drebin-features.tar.gz 18 | tar -zxf ./data/raw/drebin-features.tar.gz --directory ./data/processed/ 19 | rm ./data/raw/drebin-features.tar.gz 20 | 21 | 22 | ## Delete all compiled Python files 23 | clean: 24 | find . -type f -name "*.py[co]" -delete 25 | find . 
-type d -name "__pycache__" -delete 26 | 27 | 28 | ################################################################################# 29 | # Self Documenting Commands # 30 | ################################################################################# 31 | 32 | .DEFAULT_GOAL := help 33 | 34 | # Inspired by 35 | # sed script explained: 36 | # /^##/: 37 | # * save line in hold space 38 | # * purge line 39 | # * Loop: 40 | # * append newline + line to hold space 41 | # * go to next line 42 | # * if line starts with doc comment, strip comment character off and loop 43 | # * remove target prerequisites 44 | # * append hold space (+ newline) to line 45 | # * replace newline plus comments by `---` 46 | # * print line 47 | # Separate expressions are necessary because labels cannot be delimited by 48 | # semicolon; see 49 | .PHONY: help 50 | help: 51 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 52 | @echo 53 | @sed -n -e "/^## / { \ 54 | h; \ 55 | s/.*//; \ 56 | :doc" \ 57 | -e "H; \ 58 | n; \ 59 | s/^## //; \ 60 | t doc" \ 61 | -e "s/:.*//; \ 62 | G; \ 63 | s/\\n## /---/; \ 64 | s/\\n/ /g; \ 65 | p; \ 66 | }" ${MAKEFILE_LIST} \ 67 | | LC_ALL='C' sort --ignore-case \ 68 | | awk -F '---' \ 69 | -v ncol=$$(tput cols) \ 70 | -v indent=19 \ 71 | -v col_on="$$(tput setaf 6)" \ 72 | -v col_off="$$(tput sgr0)" \ 73 | '{ \ 74 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 75 | n = split($$2, words, " "); \ 76 | line_length = ncol - indent; \ 77 | for (i = 1; i <= n; i++) { \ 78 | line_length -= length(words[i]) + 1; \ 79 | if (line_length <= 0) { \ 80 | line_length = ncol - indent - length(words[i]) - 1; \ 81 | printf "\n%*s ", -indent, " "; \ 82 | } \ 83 | printf "%s ", words[i]; \ 84 | } \ 85 | printf "\n"; \ 86 | }' \ 87 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 88 | 89 | -------------------------------------------------------------------------------- /tesseract/loader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy 9 | import numpy 10 | from sklearn.datasets import load_svmlight_file 11 | from sklearn.feature_extraction import DictVectorizer 12 | 13 | 14 | def load_features(fname, shas=False): 15 | """Load feature set. 16 | 17 | Args: 18 | fname (str): The common prefix for the dataset. 19 | (e.g., 'data/features/drebin' -> 'data/features/drebin-[X|Y|meta].json') 20 | 21 | shas (bool): Whether to include shas. In some versions of the dataset, 22 | shas were included to double-check alignment - these are _not_ features 23 | and _must_ be removed before training. 24 | 25 | Returns: 26 | Tuple[List[Dict], List, List]: The features, labels, and timestamps 27 | for the dataset. 
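    Example (illustrative; the path is a placeholder for wherever the
    extended-features JSON files live, and note that the implementation
    below also returns a per-month time index as a fourth value, as used
    in examples/feature-reduct.py):

        >>> X, y, t, time_index = load_features(
        ...     'data/raw/extended-features/extended-features')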
28 | 29 | """ 30 | time_index = {} 31 | 32 | feature_path = os.path.join(os.path.dirname(fname), 'extended-features-{}.json') 33 | 34 | with open(feature_path.format("X"), 'rb') as f: 35 | X = json.load(f) 36 | with open(feature_path.format("y"), 'r') as f: 37 | y = json.load(f) 38 | 39 | with open(feature_path.format("meta"), 'r') as f: 40 | T = json.load(f) 41 | T = [o['dex_date'] for o in T] 42 | T = numpy.array([datetime.strptime(o, '%Y-%m-%dT%H:%M:%S') if "T" in o 43 | else datetime.strptime(o, '%Y-%m-%d %H:%M:%S') for o in T]) 44 | 45 | vec = DictVectorizer() 46 | X = vec.fit_transform(X) 47 | y = numpy.asarray(y) 48 | 49 | for i in range(len(T)): 50 | t = T[i] 51 | if t.year not in time_index: 52 | time_index[t.year] = {} 53 | if t.month not in time_index[t.year]: 54 | time_index[t.year][t.month] = [] 55 | time_index[t.year][t.month].append(i) 56 | 57 | return X, y, T, time_index 58 | 59 | 60 | def load_range_dataset_w_benign(data_name, start_month, end_month, folder='data/'): 61 | if start_month != end_month: 62 | dataset_name = f'{start_month}to{end_month}' 63 | else: 64 | dataset_name = f'{start_month}' 65 | saved_data_file = os.path.join(folder, data_name, f'{dataset_name}_selected.npz') 66 | data = np.load(saved_data_file, allow_pickle=True) 67 | X_train, y_train = data['X_train'], data['y_train'] 68 | y_mal_family = data['y_mal_family'] 69 | return X_train, y_train, y_mal_family 70 | 71 | 72 | def feature_reduce(clf, dim): 73 | if hasattr(clf, 'coef_'): 74 | select_index = np.argpartition(abs(clf.coef_[0]), -dim)[-dim:] 75 | return select_index 76 | else: 77 | print('Wrong classifier') 78 | exit(-1) 79 | 80 | 81 | def load_dates(infile): 82 | """ 83 | Parses infile for any dates formatted as YYYY/MM/DD, at most one 84 | per line. Returns a list of datetime.date objects, in order of 85 | encounter. 86 | """ 87 | datere = re.compile(r'\d{4}/\d{2}/\d{2}') 88 | dates = [] 89 | for line in open(infile, 'r', encoding='utf-8'): 90 | match = re.search(datere, line) 91 | if match: 92 | dates.append(datetime(*(map(int, match.group().split('/'))))) 93 | return dates 94 | -------------------------------------------------------------------------------- /tesseract/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | utils.py 5 | ~~~~~~~~ 6 | 7 | A selection of useful helper functions used throughout the Tesseract library. 8 | 9 | """ 10 | 11 | import logging 12 | from datetime import datetime, date 13 | from functools import wraps 14 | from timeit import default_timer as timer 15 | 16 | import numpy as np 17 | 18 | 19 | def resolve_date(d): 20 | """Convert a str or date to an appropriate datetime. 21 | 22 | Strings should be of the format '%Y', '%Y-%m or '%Y-%m-%d', for example: 23 | '2012', '1994-02' or '1991-12-11'. Date objects with no time information 24 | will be rounded down to the midnight beginning that date. 25 | 26 | Args: 27 | d (Union[str, date]): The string or date to convert. 28 | 29 | Returns: 30 | datetime: The parsed datetime equivalent of d. 
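    Example (illustrative, using the string formats listed above):

        >>> resolve_date('2012')
        datetime.datetime(2012, 1, 1, 0, 0)
        >>> resolve_date('1994-02')
        datetime.datetime(1994, 2, 1, 0, 0)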
31 | """ 32 | if isinstance(d, datetime): 33 | return d 34 | 35 | if isinstance(d, date): 36 | return datetime.combine(d, datetime.min.time()) 37 | 38 | for fmt in ('%Y', '%Y-%m', '%Y-%m-%d'): 39 | try: 40 | return datetime.strptime(d, fmt) 41 | except ValueError: 42 | pass 43 | 44 | raise ValueError('date string format not recognized.') 45 | 46 | 47 | def check_for_raw_scores(y_pred): 48 | # Heuristic to check if input are raw scores 49 | if y_pred.ndim > 1: 50 | for v in y_pred: 51 | if ((np.linalg.norm(v, 0), 52 | np.linalg.norm(v), 2) != (1, 1)): 53 | return True 54 | return False 55 | 56 | 57 | def select_prediction_function(clf, scores_only=False, labels_only=False): 58 | if hasattr(clf, 'predict_proba') and not labels_only: 59 | prediction_function = clf.predict_proba 60 | elif hasattr(clf, 'decision_function') and not labels_only: 61 | prediction_function = clf.decision_function 62 | elif hasattr(clf, 'predict') and not scores_only: 63 | prediction_function = clf.predict 64 | else: 65 | raise TypeError( 66 | 'Unsure how to handle predictions with ' 67 | 'classifier of type {}.'.format(clf.__class__)) 68 | return prediction_function 69 | 70 | 71 | def resolve_categorical(y): 72 | return np.argmax(y, 1) if y.ndim > 1 else y 73 | 74 | 75 | def binary_labels(array, positive='malicious', negative='benign'): 76 | return [positive if x else negative for x in array] 77 | 78 | 79 | def parse_percentage(n): 80 | return float(n[:-1]) / 100 81 | 82 | 83 | def resolve_percentage(n): 84 | return parse_percentage(n) if isinstance(n, str) else n 85 | 86 | 87 | def seconds_to_time(seconds): 88 | """Return a nicely formatted time given the number of seconds.""" 89 | m, s = divmod(seconds, 60) 90 | h, m = divmod(m, 60) 91 | d, h = divmod(h, 24) 92 | return "%d days, %02d hours, %02d minutes, %02d seconds" % (d, h, m, s) 93 | 94 | 95 | def timing(f): 96 | @wraps(f) 97 | def wrap(*args, **kwargs): 98 | start = timer() 99 | result = f(*args, **kwargs) 100 | elapsed = seconds_to_time(timer() - start) 101 | logging.debug('{} took: {}'.format(f.__name__, elapsed)) 102 | return result 103 | 104 | return wrap 105 | -------------------------------------------------------------------------------- /test/test_temporal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_temporal.py 5 | ~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for temporal.py. 
8 | 9 | """ 10 | import random 11 | import unittest 12 | from datetime import datetime 13 | 14 | import numpy as np 15 | from sklearn.svm import LinearSVC 16 | 17 | from tesseract import temporal, mock, selection, evaluation 18 | 19 | 20 | class TestTemporal(unittest.TestCase): 21 | def test_train(self): 22 | # Test partitions of 1 year 23 | X, y, t = mock.generate_binary_test_data(10000, '2020') 24 | splits = temporal.time_aware_train_test_split( 25 | X, y, t, 6, 2, granularity='month', start_date='2020') 26 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 27 | 28 | results = evaluation.fit_predict_update(LinearSVC(), X_train, X_tests, 29 | y_train, y_tests, t_train, 30 | t_tests) 31 | print(results) 32 | results = evaluation.fit_predict_update( 33 | LinearSVC(), X_train, X_tests, y_train, y_tests, t_train, t_tests) 34 | print(results) 35 | 36 | def test_time_aware_indexes(self): 37 | # Test partitions of 1 year 38 | t = np.array([datetime(2020, x, 1) for x in range(1, 13)]) 39 | random.shuffle(t) 40 | train, tests = temporal.time_aware_indexes( 41 | t, 6, 2, granularity='month', start_date='2020') 42 | 43 | # Smoke tests 44 | self.assertEqual(6, len(train)) 45 | self.assertEqual(3, len(tests)) 46 | 47 | for test in tests: 48 | self.assertEqual(2, len(test)) 49 | 50 | # Check partition is complete and non-destructive 51 | recreated = train + [x for sub in tests for x in sub] 52 | self.assertEqual(len(recreated), len(t)) 53 | self.assertEqual(set(recreated), set(range(len(t)))) 54 | 55 | t_train = t[train] 56 | t_tests = [t[index_set] for index_set in tests] 57 | 58 | # Check partition is history-aware 59 | for m in t_train: 60 | for n in t_tests[0]: 61 | self.assertTrue(m < n) 62 | 63 | for i in range(0, len(t_tests) - 1): 64 | for m in t_tests[i]: 65 | for n in t_tests[i + 1]: 66 | self.assertTrue(m < n) 67 | 68 | def test_time_aware_train_test_split(self): 69 | # Test partitions of 1 year 70 | X, y, t = mock.generate_binary_test_data(10000, '2020') 71 | X_train, X_tests, y_train, y_tests, t_train, t_tests = \ 72 | temporal.time_aware_train_test_split( 73 | X, y, t, 6, 2, granularity='month', start_date='2020') 74 | 75 | # Smoke tests 76 | self.assertEqual(len(X_train), len(y_train)) 77 | self.assertEqual(len(X_tests), len(y_tests)) 78 | self.assertEqual(len(X_tests[0]), len(y_tests[0])) 79 | 80 | for i in range(len(X_tests)): 81 | self.assertEqual(len(X_tests[i]), len(y_tests[i])) 82 | 83 | def test_closest_to_hyperplane(self): 84 | narray = np.array([3, -1, 7, 2, 5, -4]) 85 | indexes = selection.closest_to_hyperplane(narray, 2) 86 | self.assertTrue(all([1, 3] == indexes)) 87 | self.assertTrue(all([-1, 2] == narray[indexes])) 88 | -------------------------------------------------------------------------------- /test/test_evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_evaluation.py 5 | ~~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests used to help redesign the typical Tesseract workflow. 
8 | """ 9 | 10 | import unittest 11 | 12 | from sklearn.svm import SVC 13 | 14 | from tesseract import temporal, mock, metrics, evaluation 15 | 16 | 17 | class TestWorkflow(unittest.TestCase): 18 | def setUp(self): 19 | # Test partitions of 1 year 20 | X, y, t = mock.generate_binary_test_data(10000, '2020') 21 | 22 | splits = temporal.time_aware_train_test_split( 23 | X, y, t, train_size=6, test_size=2, 24 | granularity='month', start_date='2020') 25 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 26 | 27 | self.X_train = X_train 28 | self.y_train = y_train 29 | self.X_tests = X_tests 30 | self.y_tests = y_tests 31 | self.t_train = t_train 32 | self.t_tests = t_tests 33 | 34 | self.clf = SVC(kernel='linear', probability=True) 35 | self.clf.fit(X_train, y_train) 36 | 37 | def test_use_case_1(self): 38 | # Predict each test period yourself and get individual results 39 | for i, (X_test, y_true) in enumerate(zip(self.X_tests, self.y_tests)): 40 | y_pred = self.clf.predict(X_test) 41 | 42 | print('Test period {}'.format(i)) 43 | results = metrics.calculate_metrics(y_true, y_pred) 44 | metrics.print_metrics(results, header=False) 45 | 46 | def test_use_case_2(self): 47 | # Keep a running data structure for results 48 | results = {} 49 | for i, (X_test, y_true) in enumerate(zip(self.X_tests, self.y_tests)): 50 | y_pred = self.clf.predict(X_test) 51 | 52 | results = metrics.calculate_metrics( 53 | y_true, y_pred, existing=results) 54 | metrics.print_metrics(results) 55 | 56 | def test_use_case_3(self): 57 | # Use a library method to run the entire prediction 58 | y_preds = evaluation.predict(self.clf, self.X_tests) 59 | results = metrics.calculate_metrics(self.y_tests, y_preds, periods=3) 60 | metrics.print_metrics(results) 61 | 62 | def test_use_case_4(self): 63 | # Parallelising computation of test periods 64 | y_preds = evaluation.predict(self.clf, self.X_tests, nproc=3) 65 | results = metrics.calculate_metrics(self.y_tests, y_preds, periods=-1) 66 | metrics.print_metrics(results) 67 | 68 | def test_use_case_5(self): 69 | # Forcing output to be labels rather than probabilities 70 | y_preds = evaluation.predict( 71 | self.clf, self.X_tests, labels_only=True) 72 | results = metrics.calculate_metrics(self.y_tests, y_preds, periods=-1) 73 | metrics.print_metrics(results) 74 | print(metrics.aut(results, 'f1')) 75 | print(metrics.aut(results['f1'])) 76 | 77 | def test_use_case_6(self): 78 | # Use full fit_predict_update to measure the performance 79 | results = evaluation.fit_predict_update( 80 | self.clf, self.X_train, self.X_tests, 81 | self.y_train, self.y_tests, 82 | self.t_train, self.t_tests) 83 | 84 | metrics.print_metrics(results) 85 | print(metrics.aut(results, 'f1')) 86 | print(metrics.aut(results['f1'])) 87 | -------------------------------------------------------------------------------- /test/test_selectors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_selectors.py 5 | ~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for testing selectors.py. 
8 | 9 | """ 10 | 11 | import unittest 12 | 13 | import numpy as np 14 | from sklearn.ensemble import RandomForestClassifier 15 | from sklearn.svm import LinearSVC 16 | 17 | from tesseract import temporal, mock, metrics, evaluation 18 | from tesseract.selection import FullRetrainingSelector, ActiveLearningSelector, \ 19 | UncertaintySamplingSelector 20 | 21 | 22 | class TestSelectors(unittest.TestCase): 23 | def setUp(self): 24 | # Test partitions of 1 year 25 | X, y, t = mock.generate_binary_test_data(10000, '2020') 26 | 27 | splits = temporal.time_aware_train_test_split( 28 | X, y, t, train_size=6, test_size=2, granularity='month') 29 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 30 | 31 | self.X_train = X_train 32 | self.y_train = y_train 33 | self.X_tests = X_tests 34 | self.y_tests = y_tests 35 | self.t_train = t_train 36 | self.t_tests = t_tests 37 | 38 | self.svm = LinearSVC() 39 | self.svm.fit(X_train, y_train) 40 | 41 | self.rf = RandomForestClassifier(n_estimators=101, max_depth=64) 42 | self.rf.fit(X_train, y_train) 43 | 44 | def test_full_retraining(self): 45 | for clf in (self.svm, self.rf): 46 | results = evaluation.fit_predict_update( 47 | clf, self.X_train, self.X_tests, 48 | self.y_train, self.y_tests, 49 | self.t_train, self.t_tests, 50 | selectors=[FullRetrainingSelector()]) 51 | 52 | metrics.print_metrics(results) 53 | 54 | for i in range(1, len(self.y_tests)): 55 | expected = results['train_tot'][i - 1] + results['tot'][i - 1] 56 | actual = results['train_tot'][i] 57 | 58 | self.assertEqual(expected, actual) 59 | 60 | def test_active_learning(self): 61 | def closest_to_hyperplane(*args): 62 | clf, X_test, n = args[0], args[4], args[-1] 63 | y_raw = clf.decision_function(X_test) 64 | absolute = np.abs(y_raw) 65 | indexes = np.argsort(absolute) 66 | return indexes[:n] 67 | 68 | results = evaluation.fit_predict_update( 69 | self.svm, self.X_train, self.X_tests, 70 | self.y_train, self.y_tests, 71 | self.t_train, self.t_tests, 72 | selectors=[ActiveLearningSelector( 73 | '20%', closest_to_hyperplane)]) 74 | 75 | metrics.print_metrics(results) 76 | 77 | for i in range(1, len(self.y_tests)): 78 | expected = int(results['train_tot'][i - 1] + 79 | results['tot'][i - 1] * 0.2) 80 | actual = results['train_tot'][i] 81 | 82 | self.assertEqual(expected, actual) 83 | 84 | def test_uncertainty_sampling(self): 85 | for clf in (self.svm, self.rf): 86 | results = evaluation.fit_predict_update( 87 | clf, self.X_train, self.X_tests, 88 | self.y_train, self.y_tests, 89 | self.t_train, self.t_tests, 90 | selectors=[UncertaintySamplingSelector('20%')]) 91 | 92 | metrics.print_metrics(results) 93 | 94 | for i in range(1, len(self.y_tests)): 95 | expected = int(results['train_tot'][i - 1] + 96 | results['tot'][i - 1] * 0.2) 97 | actual = results['train_tot'][i] 98 | 99 | self.assertEqual(expected, actual) 100 | -------------------------------------------------------------------------------- /test/test_rejectors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | test_rejectors.py 5 | ~~~~~~~~~~~~~~~~~ 6 | 7 | Unit tests for testing rejectors.py. 
8 | 9 | """ 10 | 11 | import unittest 12 | 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.svm import SVC, LinearSVC 15 | 16 | from tesseract import temporal, mock, metrics, rejection, evaluation 17 | from tesseract.rejection import ThresholdRejector 18 | 19 | 20 | class TestRejectors(unittest.TestCase): 21 | def setUp(self): 22 | # Test partitions of 1 year 23 | X, y, t = mock.generate_binary_test_data(10000, '2020') 24 | 25 | splits = temporal.time_aware_train_test_split( 26 | X, y, t, train_size=6, test_size=2, granularity='month') 27 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 28 | 29 | self.X_train = X_train 30 | self.y_train = y_train 31 | self.X_tests = X_tests 32 | self.y_tests = y_tests 33 | self.t_train = t_train 34 | self.t_tests = t_tests 35 | 36 | self.svm = SVC(kernel='linear', probability=False) 37 | self.svm.fit(X_train, y_train) 38 | 39 | self.rf = RandomForestClassifier(n_estimators=101, max_depth=64) 40 | self.rf.fit(X_train, y_train) 41 | 42 | def test_threshold_rejector_rf(self): 43 | t_rejector = ThresholdRejector('<', 0.9) 44 | results = evaluation.fit_predict_update( 45 | self.rf, self.X_train, self.X_tests, 46 | self.y_train, self.y_tests, 47 | self.t_train, self.t_tests, 48 | rejectors=[t_rejector]) 49 | 50 | metrics.print_metrics(results) 51 | 52 | # Check that something was rejected each period, 53 | # more thorough tests are certainly desired! 54 | 55 | for i in range(len(self.y_tests)): 56 | self.assertGreater(results['rejected'][i], 0) 57 | 58 | def test_threshold_rejector_svm_between(self): 59 | t_rejector = ThresholdRejector('><', (-5, 5)) 60 | results = evaluation.fit_predict_update( 61 | self.svm, self.X_train, self.X_tests, 62 | self.y_train, self.y_tests, 63 | self.t_train, self.t_tests, 64 | rejectors=[t_rejector]) 65 | 66 | metrics.print_metrics(results) 67 | 68 | for i in range(len(self.y_tests)): 69 | self.assertGreater(results['rejected'][i], 0) 70 | 71 | def test_threshold_rejector_svm_outside(self): 72 | t_rejector = ThresholdRejector('<>', (-5, 5)) 73 | results = evaluation.fit_predict_update( 74 | self.svm, self.X_train, self.X_tests, 75 | self.y_train, self.y_tests, 76 | self.t_train, self.t_tests, 77 | rejectors=[t_rejector]) 78 | 79 | metrics.print_metrics(results) 80 | 81 | for i in range(len(self.y_tests)): 82 | self.assertGreater(results['rejected'][i], 0) 83 | 84 | 85 | class TestRejection(unittest.TestCase): 86 | def test_thresholds(self): 87 | # Test partitions of 1 year 88 | X, y, t = mock.generate_binary_test_data(10000, '2014', end='2016') 89 | splits = temporal.time_aware_train_test_split( 90 | X, y, t, 12, 1, granularity='month', start_date='2014') 91 | X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 92 | 93 | clf = LinearSVC() 94 | aa = rejection.alpha_assessment(clf, X_train, y_train, folds=5) 95 | n_quartiles, p_quartiles = rejection.quartiles(aa) 96 | n_threshold, p_threshold = n_quartiles[3], p_quartiles[1] 97 | print(n_threshold, p_threshold) 98 | 99 | rejection_options = {'thresholds': [n_threshold, p_threshold], 100 | 'comparators': ['<', '>']} 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TESSERACT 2 | 3 | As malware evolves over time, the performance of malware detectors tends to degrade. 
Many solutions in the security literature fail to consider the time information associated with the samples while evaluating their classifier which can induce positive bias in the results. 4 | 5 | This repository contains the source code for a prototype implementation of Tesseract. 6 | 7 | Further details can be found in the paper *TESSERACT: Eliminating Experimental Bias in Malware Classification across Space and Time*. F. Pendlebury, F. Pierazzi, R. Jordaney, J. Kinder, and L. Cavallaro. USENIX Sec 2019. Check also `https://s2lab.cs.ucl.ac.uk/projects/tesseract` for up-to-date information on the project, e.g., a talk at USENIX Enigma 2019 at `https://www.usenix.org/conference/enigma2019/presentation/cavallaro`. 8 | 9 | If you end up using Tesseract as part of a project or publication, please include a citation of the latest preprint: 10 | 11 | ```bibtex 12 | @inproceedings{pendlebury2019, 13 | author = {Feargus Pendlebury, Fabio Pierazzi, Roberto Jordaney, Johannes Kinder, and Lorenzo Cavallaro}, 14 | title = {{TESSERACT: Eliminating Experimental Bias in Malware Classification across Space and Time}}, 15 | booktitle = {28th USENIX Security Symposium}, 16 | year = {2019}, 17 | address = {Santa Clara, CA}, 18 | publisher = {USENIX Association}, 19 | note = {USENIX Sec} 20 | } 21 | ``` 22 | 23 | ## Getting Started 24 | 25 | ### Installation 26 | 27 | Tesseract requires Python 3 (preferably >= 3.5) as well as the statistical learning stack of NumPy, SciPy, and Scikit-learn. 28 | 29 | Create virtual environment (recommended) and install tesseract with script `setup.py`: 30 | 31 | ```shell 32 | python3 setup.py install 33 | ``` 34 | 35 | To download the data, run 36 | 37 | ```shell 38 | make data 39 | ``` 40 | 41 | This should download the feature vectors and store them in 42 | `data/processed`. An example that shows how to reproduce the experiments can be found in 43 | `notebooks/reproduce-tesseract.ipynb`. 44 | 45 | ### Usage 46 | 47 | Basic usage, dividing a dataset into time-aware sets and performing a time-aware evaluation. 48 | More complex examples can be found in the `examples/` and `test/` directories. 49 | 50 | ```python 51 | from sklearn.svm import LinearSVC 52 | from tesseract import evaluation, temporal, metrics, mock 53 | 54 | 55 | def main(): 56 | # Generate dummy predictors, labels and timestamps from Gaussians 57 | X, y, t = mock.generate_binary_test_data(10000, '2014', '2016') 58 | 59 | # Partition dataset 60 | splits = temporal.time_aware_train_test_split( 61 | X, y, t, train_size=12, test_size=1, granularity='month') 62 | 63 | # Perform a timeline evaluation 64 | clf = LinearSVC() 65 | results = evaluation.fit_predict_update(clf, *splits) 66 | 67 | # View results 68 | metrics.print_metrics(results) 69 | 70 | # View AUT(F1, 24 months) as a measure of robustness over time 71 | print(metrics.aut(results, 'f1')) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | 77 | ``` 78 | 79 | ## Running the tests 80 | 81 | To run all unittests within the `test/` directory: 82 | 83 | ```shell 84 | python -m unittest 85 | ``` 86 | 87 | ## Current Working State 88 | 89 | Tesseract is still a research prototype and subject to breaking changes, although following a recent redesign we 90 | expect such changes to be kept to a minimum. Due to this redesign there may also be discrepancies between the current 91 | implementation and §6 of the Tesseract manuscript---although we are aiming to soon publish a short technical report 92 | that details the new design. 
We know this can be frustrating and thank you for your patience! 93 | 94 | If you encounter a bug or have a feature request, please feel free to contact the maintainer directly 95 | at `lorenzo.cavallaro [at] ucl.ac.uk` and cc `fabio.pierazzi [at] kcl.ac.uk`. 96 | 97 | 98 | ## Acknowledgements 99 | 100 | This project has been generously sponsored by the UK EP/L022710/1 and EP/P009301/1 EPSRC research grants. 101 | -------------------------------------------------------------------------------- /tesseract/selection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | selection.py 5 | ~~~~~~~~~~~~ 6 | 7 | # TODO | Add module description 8 | 9 | """ 10 | 11 | import numpy as np 12 | 13 | from tesseract import utils 14 | from tesseract.evaluation import TrackingStage 15 | 16 | 17 | class Selector(TrackingStage): 18 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 19 | super().__init__(schedule, tracking, interaction) 20 | self.selection_history = [] 21 | 22 | def query_wrapper(self, clf, X_train, y_train, t_train, 23 | X_test, y_test, t_test, previously_selected): 24 | # Pass parameters straight through to query implementation 25 | selected = self.query(clf, X_train, y_train, t_train, 26 | X_test, y_test, t_test, previously_selected) 27 | 28 | if self.tracking: 29 | self.selection_history.append(selected) 30 | 31 | # Merge results with those of previous selectors 32 | selected = self.merge_results(previously_selected, selected) 33 | 34 | return np.array(selected) 35 | 36 | def query(self, clf, X_train, y_train, t_train, 37 | X_test, y_test, t_test, previously_selected): 38 | raise NotImplementedError('Selector must be subclassed') 39 | 40 | 41 | class FullRetrainingSelector(Selector): 42 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 43 | super().__init__(schedule, tracking, interaction) 44 | 45 | def query(self, clf, X_train, y_train, t_train, 46 | X_test, y_test, t_test, previously_selected): 47 | return range(len(y_test)) 48 | 49 | 50 | class ActiveLearningSelector(Selector): 51 | def __init__(self, n, query_strategy, schedule=1, 52 | tracking=True, interaction='intersection'): 53 | super().__init__(schedule, tracking, interaction) 54 | self.n = n 55 | self.query_strategy = query_strategy 56 | 57 | def query(self, clf, X_train, y_train, t_train, 58 | X_test, y_test, t_test, previously_selected): 59 | # Parse percentage if string passed in as n (eg. '20%') 60 | m = int(utils.parse_percentage(self.n) * len(y_test) 61 | if isinstance(self.n, str) else self.n) 62 | return self.query_strategy(clf, X_train, y_train, t_train, 63 | X_test, y_test, t_test, 64 | previously_selected, m) 65 | 66 | 67 | class UncertaintySamplingSelector(Selector): 68 | def __init__(self, n, schedule=1, tracking=True, 69 | interaction='intersection'): 70 | super().__init__(schedule, tracking, interaction) 71 | self.n = n 72 | 73 | def query(self, clf, X_train, y_train, t_train, 74 | X_test, y_test, t_test, previously_selected): 75 | # Parse percentage if string passed in as n (eg. '20%') 76 | m = int(utils.parse_percentage(self.n) * len(y_test) 77 | if isinstance(self.n, str) else self.n) 78 | 79 | # e.g. clf is a RandomForestsClassifier or SVC(probability=True) 80 | if hasattr(clf, 'predict_proba'): 81 | y_probs = clf.predict_proba(X_test) 82 | selected_indexes = probabilistic_uncertainty(y_probs, m) 83 | 84 | # e.g. 
clf is a LinearSVC or SVC 85 | elif hasattr(clf, 'decision_function'): 86 | y_raw = clf.decision_function(X_test) 87 | selected_indexes = closest_to_hyperplane(y_raw, m) 88 | 89 | else: 90 | raise TypeError( 91 | 'Unsure how to handle uncertainty sampling with ' 92 | 'classifier of type {}.'.format(clf.__class__)) 93 | 94 | return selected_indexes 95 | 96 | 97 | def closest_to_hyperplane(distances, n): 98 | """Perform uncertainty sampling using distance from the hyperplane. 99 | 100 | Uncertainty sampling with SVMs is equivalent to selecting the samples 101 | closest to the decision boundary (hyperplane in binary classification). 102 | 103 | This is shown by Tong and Koller [ICML 2000]: 104 | https://dl.acm.org/citation.cfm?id=944793 105 | 106 | The intuition is also well explained by Kremer, Pederson, Igel [WIREs 2014]: 107 | http://image.diku.dk/jank/papers/WIREs2014.pdf 108 | 109 | The process for selecting the objects is as follows: 110 | 111 | 1. Consider only absolute distances. 112 | 2. Argsort from least distance to greatest. 113 | 3. Take the n smallest (closest to the hyperplane). 114 | 115 | Args: 116 | distances: The list of distances to use as metrics. 117 | n: The number of samples to mark as 'most uncertain'. 118 | 119 | Returns: 120 | list: The indexes corresponding to the 'most uncertain' samples. 121 | 122 | """ 123 | absolute = np.abs(distances) 124 | indexes = np.argsort(absolute) 125 | return indexes[:n] 126 | 127 | 128 | def probabilistic_uncertainty(probs, n): 129 | """Perform uncertainty sampling using least confidence. 130 | 131 | An excellent discussion of active learning strategies including a 132 | comparison of three different uncertainty measures: least confidence, 133 | margin sampling and entropy (all of which are equivalent in binary 134 | classification) can be found in Burr Settles' literature review: 135 | 136 | http://burrsettles.com/pub/settles.activelearning.pdf 137 | 138 | The process for selecting the objects is as follows: 139 | 140 | 1. Consider 'uncertainty' only (1 - the highest class probability). 141 | 2. Argsort and reverse to sort from least to most certain. 142 | 3. Take the n smallest (most uncertain). 143 | 144 | Args: 145 | probs: The list of probabilities to use as metrics. 146 | n: The number of samples to mark as 'most uncertain'. 147 | 148 | Returns: 149 | list: The indexes corresponding to the 'most uncertain' samples. 150 | 151 | """ 152 | uncertainty = np.array([1 - np.max(x) for x in probs]) 153 | indexes = np.argsort(uncertainty)[::-1] 154 | return indexes[:n] 155 | -------------------------------------------------------------------------------- /tesseract/temporal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | temporal.py 5 | ~~~~~~~~~~~ 6 | 7 | A module for working with and running time-aware evaluations. Most of the 8 | functionality of this module falls into one of two categories: working with 9 | arrays of datetimes or datetime-aligned series of data, and aggregating the 10 | steps of the ML pipeline needed to conduct sound, time-aware evaluations. 11 | 12 | """ 13 | import bisect 14 | import operator 15 | 16 | import numpy as np 17 | from dateutil.relativedelta import relativedelta 18 | 19 | import tesseract.utils as utils 20 | 21 | 22 | def assert_train_test_temporal_consistency(t_train, t_test): 23 | """Helper function to assert train-test temporal constraint (C1). 
24 | 25 | All objects in the training set need to be temporally anterior to all 26 | objects in the testing set. Violating this constraint will positively bias 27 | the results by integrating "future" knowledge into the classifier. 28 | 29 | Args: 30 | t_train: An array of datetimes corresponding to the training set. 31 | t_test: An array of datetime corresponding to the testing set. 32 | 33 | Returns: 34 | bool: False if the partitioned dataset does _not_ adhere to C1, 35 | True otherwise. 36 | 37 | """ 38 | for train_date in t_train: 39 | for test_date in t_test: 40 | if train_date > test_date: 41 | return False 42 | return True 43 | 44 | 45 | def assert_positive_negative_temporal_consistency(y, t, month_variance=1): 46 | """Helper function to assert malware-goodware temporal constraint (C2). 47 | 48 | In any given testing period, all testing objects must be from the time 49 | window under test. In the malware domain this constraint has often been 50 | violated so that malware and goodware come from different time periods. 51 | 52 | If this is the case, it becomes impossible to tell whether a 53 | high-performing classifier is discriminating between malicious and benign 54 | objects or between old and new applications. 55 | 56 | Args: 57 | y: An array of ground-truth labels for each observation. 58 | t: An array of datetimes for each observation (aligned with y). 59 | month_variance: All malware and goodware should be between this many 60 | months. 61 | 62 | Returns: 63 | bool: False if the malware and goodware do not adhere to C2, 64 | True otherwise 65 | 66 | """ 67 | positive = np.where(y == 1)[0] 68 | negative = np.where(y != 1)[0] 69 | positive_dates = t[positive] 70 | negative_dates = t[negative] 71 | 72 | for pos_date in positive_dates: 73 | for neg_date in negative_dates: 74 | if month_difference(pos_date, neg_date) > month_variance: 75 | return False 76 | return True 77 | 78 | 79 | def month_difference(d1, d2): 80 | """Get the difference in months between two datetimes.""" 81 | return (d1.year - d2.year) * 12 + d1.month - d2.month 82 | 83 | 84 | def time_aware_train_test_split(X, y, t, train_size, test_size, 85 | granularity, start_date=None): 86 | """Partition a dataset composed of time-labelled objects. 87 | 88 | Args: 89 | X (np.ndarray, csr_matrix): Multi-dimensional array of predictors. 90 | y (np.ndarray): Array of output labels. 91 | t (np.ndarray): Array of timestamp tags. 92 | train_size (int): The training window size W (in τ). 93 | test_size (int): The testing window size Δ (in τ). 94 | granularity (str): The unit of time τ, used to denote the window size. 95 | Acceptable values are 'year|quarter|month|week|day'. 96 | start_date (date): The date to begin partioning from (eg. to align with 97 | the start of the year). 98 | 99 | Returns: 100 | (np.ndarray, list, np.ndarray, list, np.ndarray, list): 101 | Training partition of predictors X. 102 | List of testing partitions of predictors X. 103 | Training partition of output variables y. 104 | List of testing partitions of predictors y. 105 | Training partition of meta t. 106 | List of testing partitions of meta t. 
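A minimal usage sketch with mock data (the window sizes are illustrative):

    from tesseract import mock

    X, y, t = mock.generate_binary_test_data(10000, '2014', '2016')
    X_train, X_tests, y_train, y_tests, t_train, t_tests = \
        time_aware_train_test_split(
            X, y, t, train_size=12, test_size=1, granularity='month')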
107 | 108 | """ 109 | # Get partitioned indexes 110 | train, tests = time_aware_indexes(t, train_size, test_size, 111 | granularity, start_date) 112 | 113 | # Partition predictors and labels 114 | X_actual, y_actual, t_actual = X[train], y[train], t[train] 115 | 116 | X_tests = [X[index_set] for index_set in tests] 117 | y_tests = [y[index_set] for index_set in tests] 118 | t_tests = [t[index_set] for index_set in tests] 119 | 120 | return X_actual, X_tests, y_actual, y_tests, t_actual, t_tests 121 | 122 | 123 | def time_aware_indexes(t, train_size, test_size, granularity, start_date=None): 124 | """Return a list of indexes that partition the list t by time. 125 | 126 | Sorts the list of dates t before dividing into training and testing 127 | partitions, ensuring a 'history-aware' split in the ensuing classification 128 | task. 129 | 130 | 131 | Args: 132 | t (np.ndarray): Array of timestamp tags. 133 | train_size (int): The training window size W (in τ). 134 | test_size (int): The testing window size Δ (in τ). 135 | granularity (str): The unit of time τ, used to denote the window size. 136 | Acceptable values are 'year|quarter|month|week|day'. 137 | start_date (date): The date to begin partioning from (eg. to align with 138 | the start of the year). 139 | 140 | Returns: 141 | (list, list): 142 | Indexing for the training partition. 143 | List of indexings for the testing partitions. 144 | 145 | """ 146 | # Order the dates as well as their original positions 147 | with_indexes = zip(t, range(len(t))) 148 | ordered = sorted(with_indexes, key=operator.itemgetter(0)) 149 | 150 | # Split out the dates from the indexes 151 | dates = [tup[0] for tup in ordered] 152 | indexes = [tup[1] for tup in ordered] 153 | 154 | # Get earliest date 155 | start_date = utils.resolve_date(start_date) if start_date else ordered[0][0] 156 | 157 | # Slice out training partition 158 | boundary = start_date + get_relative_delta(train_size, granularity) 159 | to_idx = bisect.bisect_left(dates, boundary) 160 | train = indexes[:to_idx] 161 | 162 | tests = [] 163 | # Slice out testing partitions 164 | while to_idx < len(indexes): 165 | boundary += get_relative_delta(test_size, granularity) 166 | from_idx = to_idx 167 | to_idx = bisect.bisect_left(dates, boundary) 168 | tests.append(indexes[from_idx:to_idx]) 169 | 170 | return train, tests 171 | 172 | 173 | def time_aware_partition(t, proportion): 174 | """Partition an array of dates based on the given proportion. 175 | 176 | The set of timestamps will be bisected with the left bisection sized by 177 | the given proportion. 178 | 179 | Args: 180 | t: An array of datetimes. 181 | proportion: The proportion by which to split the array. 182 | 183 | Returns: 184 | tuple: The two bisections of the array. 185 | """ 186 | # Order the dates as well as their original positions 187 | indexes = np.argsort(t) 188 | 189 | # Divide ordered set in two 190 | boundary = int(proportion * len(indexes)) 191 | 192 | return indexes[:boundary], indexes[boundary:] 193 | 194 | 195 | def temporal_slice(X, y, t): 196 | raise NotImplementedError 197 | 198 | 199 | def get_relative_delta(offset, granularity): 200 | """Get delta of size 'granularity'. 201 | 202 | Args: 203 | offset: The number of time units to offset by. 204 | granularity: The unit of time to offset by, expects one of 205 | 'year', 'quarter', 'month', 'week', 'day'. 206 | 207 | Returns: 208 | The timedelta equivalent to offset * granularity. 
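For example (singular and plural granularities are both accepted):

    get_relative_delta(2, 'quarter')   # relativedelta(months=+6)
    get_relative_delta(3, 'weeks')     # relativedelta(weeks=+3)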
209 | 210 | """ 211 | # Make allowances for year(s), quarter(s), month(s), week(s), day(s) 212 | granularity = granularity[:-1] if granularity[-1] == 's' else granularity 213 | try: 214 | return { 215 | 'year': relativedelta(years=offset), 216 | 'quarter': relativedelta(months=3 * offset), 217 | 'month': relativedelta(months=offset), 218 | 'week': relativedelta(weeks=offset), 219 | 'day': relativedelta(days=offset), 220 | }[granularity] 221 | except KeyError: 222 | raise ValueError('granularity not recognised, try: ' 223 | 'year|quarter|month|week|day') 224 | -------------------------------------------------------------------------------- /tesseract/rejection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | rejection.py 5 | ~~~~~~~~~~~~ 6 | 7 | # TODO | Add module description 8 | 9 | """ 10 | import logging 11 | import os 12 | 13 | import numpy as np 14 | from sklearn.model_selection import KFold, cross_val_predict 15 | from tqdm import tqdm 16 | 17 | from tesseract import utils 18 | from tesseract.evaluation import TrackingStage 19 | from tesseract.temporal import time_aware_partition 20 | 21 | 22 | class Rejector(TrackingStage): 23 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 24 | super().__init__(schedule, tracking, interaction) 25 | self.kept_history = [] 26 | self.rejection_history = [] 27 | 28 | def reject_wrapper(self, clf, X_train, y_train, t_train, X_test, 29 | y_test, t_test, previously_kept, previously_rejected): 30 | # Pass parameters straight through to reject implementation 31 | kept, rejected = self.reject(clf, X_train, y_train, t_train, 32 | X_test, y_test, t_test, previously_kept, 33 | previously_rejected) 34 | 35 | if self.tracking: 36 | self.kept_history.append(kept) 37 | self.rejection_history.append(rejected) 38 | 39 | # Merge results with those of previous rejectors 40 | kept = self.merge_results(previously_kept, kept) 41 | rejected = self.merge_results(previously_rejected, rejected) 42 | 43 | return np.array(kept), np.array(rejected) 44 | 45 | def reject(self, clf, X_train, y_train, t_train, 46 | X_test, y_test, t_test, previously_kept, previously_rejected): 47 | raise NotImplementedError('Rejector must be subclassed') 48 | 49 | 50 | class ThresholdRejector(Rejector): 51 | def __init__(self, operator, thresholds, point_score='credibility', 52 | schedule=1, tracking=True, interaction='intersection'): 53 | super().__init__(schedule, tracking, interaction) 54 | 55 | self._single_threshold_ops = ('<', 'lesser', 56 | '>', 'greater') 57 | self._double_threshold_ops = ('<>', 'outside', 58 | '><', 'between') 59 | 60 | self._valid_operators = (self._single_threshold_ops + 61 | self._double_threshold_ops) 62 | 63 | self._valid_point_scores = ('credibility', 'confidence') 64 | 65 | self._check_params( 66 | operator, thresholds, point_score, tracking, interaction) 67 | 68 | self.point_score = point_score 69 | self.thresholds = thresholds 70 | self.operator = operator 71 | 72 | if hasattr(thresholds, '__len__'): 73 | self.threshold = max(thresholds) 74 | self.lower_threshold = min(thresholds) 75 | else: 76 | self.threshold = thresholds 77 | self.lower_threshold = None 78 | 79 | def reject(self, clf, X_train, y_train, t_train, 80 | X_test, y_test, t_test, previously_kept, previously_rejected): 81 | get_score = utils.select_prediction_function(clf) 82 | 83 | y_scores = get_score(X_test) 84 | 85 | # Resolve arrays where the scoring function outputs per-class scores 86 | 87 
| if hasattr(y_scores[0], '__len__'): 88 | if self.point_score == 'credibility': 89 | # credibility = the highest score 90 | y_scores = np.array([max(v) for v in y_scores]) 91 | elif self.point_score == 'confidence': 92 | # confidence = the highest score minus the next highest 93 | y_scores = np.array([max(v) - np.partition(v, -2)[-2] 94 | for v in y_scores]) 95 | 96 | if self.operator in ('<', 'lesser'): 97 | rejected = np.where(y_scores < self.threshold)[0] 98 | 99 | elif self.operator in ('>', 'greater'): 100 | rejected = np.where(y_scores > self.threshold)[0] 101 | 102 | elif self.operator in ('<>', 'outside'): 103 | rejected = np.where(np.logical_or(y_scores < self.lower_threshold, 104 | y_scores > self.threshold))[0] 105 | elif self.operator in ('><', 'between'): 106 | rejected = np.where(np.logical_and(y_scores > self.lower_threshold, 107 | y_scores < self.threshold))[0] 108 | else: 109 | raise ValueError('Unrecognised comparator for rejection') 110 | # Add indexes that didn't pass to list of quarantined samples 111 | 112 | kept = np.setxor1d(rejected, np.arange(len(y_scores))) 113 | 114 | return kept, rejected 115 | 116 | def _check_params(self, operator, thresholds, 117 | point_score, tracking, interaction): 118 | 119 | if hasattr(thresholds, '__len__') and len(thresholds) > 2: 120 | raise ValueError( 121 | 'ThresholdRejector will only accept a ' 122 | 'maximum of 2 thresholds (one upper, one lower)') 123 | 124 | if operator not in self._valid_operators: 125 | raise ValueError( 126 | 'Threshold comparison operator must be one of the ' 127 | 'following: {}'.format(self._valid_operators)) 128 | 129 | if point_score not in self._valid_point_scores: 130 | raise ValueError( 131 | 'Point scores must be one of the ' 132 | 'following: {}'.format(self._valid_point_scores)) 133 | 134 | if (operator in self._double_threshold_ops and 135 | not hasattr(thresholds, '__len__')): 136 | raise ValueError('"{}" expects two thresholds'.format(operator)) 137 | 138 | if (operator in self._single_threshold_ops and 139 | hasattr(thresholds, '__len__')): 140 | raise ValueError('"{}" expects a single threshold'.format(operator)) 141 | 142 | 143 | def quartiles(alpha_assessment_results, subkey='incorrect'): 144 | """Considering an alpha assessment, return the quartiles from the results. 145 | 146 | In well-separated alpha assessment results, quartiles can be useful for 147 | finding a good threshold (below which, predictions are discarded). 148 | 149 | Typically thresholds are Q3 of incorrect predictions and Q1 of correct 150 | predictions. 151 | 152 | Args: 153 | alpha_assessment_results: The results to derive quartiles from. 154 | subkey: 'correct' or 'incorrect'. 155 | 156 | Returns: 157 | tuple: The quartiles as they relate to the negative and positive class. 158 | 159 | """ 160 | percentiles = [0, 25, 50, 75, 100] 161 | negative = alpha_assessment_results['negative_predictions'][subkey] 162 | positive = alpha_assessment_results['positive_predictions'][subkey] 163 | neg_quartiles = [np.percentile(negative, p) for p in percentiles] 164 | pos_quartiles = [np.percentile(positive, p) for p in percentiles] 165 | return neg_quartiles, pos_quartiles 166 | 167 | 168 | def alpha_assessment(clf, X, y, folds=10): 169 | """Perform an alpha assessment on the given classifier and data. 170 | 171 | An alpha assessment is an assessment used in conformal evaluation to 172 | visually discern how separable the classifier's correct and incorrect 173 | prediction scores are. 
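A minimal sketch of how the assessment might be combined with quartiles() below (the classifier and mock data are illustrative, not from the original examples):

    from sklearn.svm import LinearSVC
    from tesseract import mock

    X, y, t = mock.generate_binary_test_data(2000, '2014')
    assessment = alpha_assessment(LinearSVC(), X, y, folds=10)

    # Quartiles of the correct/incorrect score distributions can then inform
    # the thresholds passed to a ThresholdRejector
    neg_q, pos_q = quartiles(assessment, subkey='incorrect')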
174 | 175 | Highly separable scores allow the user to control a threshold below which 176 | they can designate predictions as being low-confidence, unreliable or even 177 | rejected. In the domain of malware classification, the rate at which a 178 | greater proportion of samples appear _below_ the threshold is indicative 179 | of the rate at which concept drift is occuring. 180 | 181 | A formal description and thorough evaluation of its uses is given in the 182 | Transcend paper by Jordaney et. al [USENIX 2017]: 183 | https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-jordaney.pdf 184 | 185 | Args: 186 | clf: The classifier to use to perform the assessment. 187 | X: An array of predictors. 188 | y: An array of output labels aligned with X. 189 | folds: The number of folds to perform during the K-fold. 190 | 191 | Returns: 192 | 193 | """ 194 | if hasattr(clf, 'predict_proba'): 195 | f = 'predict_proba' 196 | elif hasattr(clf, 'decision_function'): 197 | f = 'decision_function' 198 | else: 199 | raise TypeError( 200 | 'Unsure how to handle scoring with ' 201 | 'classifier of type {}.'.format(clf.__class__)) 202 | 203 | # random_state was set to 22 however removed due to ValueError since shuffle=False 204 | cv = KFold(n_splits=folds, shuffle=False) 205 | y_pred = cross_val_predict(clf, X, y, cv=cv) 206 | y_score = cross_val_predict(clf, X, y, cv=cv, method=f) 207 | 208 | negative = np.where(y_pred == 0)[0] 209 | positive = np.where(y_pred == 1)[0] 210 | correct = np.where(y_pred == y)[0] 211 | incorrect = np.where(y_pred != y)[0] 212 | 213 | return { 214 | 'negative_predictions': { 215 | 'correct': y_score[np.intersect1d(negative, correct)], 216 | 'incorrect': y_score[np.intersect1d(negative, incorrect)]}, 217 | 'positive_predictions': { 218 | 'correct': y_score[np.intersect1d(positive, correct)], 219 | 'incorrect': y_score[np.intersect1d(positive, incorrect)]} 220 | } 221 | -------------------------------------------------------------------------------- /tesseract/plot_utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import ujson as json 4 | from datetime import datetime 5 | 6 | import __main__ as main 7 | import numpy as np 8 | import os 9 | import seaborn as sns 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.feature_extraction import DictVectorizer 12 | from sklearn.svm import LinearSVC 13 | 14 | from tesseract import temporal, spatial 15 | 16 | line_kwargs = {'linewidth': 1, 'markersize': 5} 17 | 18 | force = False 19 | 20 | 21 | # x_tick_size = 12 22 | # y_tick_size = 14 23 | # ax_label_size = 18 24 | # fig_title_size = 20 25 | 26 | def set_style(): 27 | sns.set_context('paper') 28 | sns.set(font='serif') 29 | 30 | sns.set('paper', font='serif', style='ticks', rc={ 31 | 'font.family': 'serif', 32 | 'legend.fontsize': 'medium', 33 | 'xtick.labelsize': 'medium', 34 | 'ytick.labelsize': 'medium', 35 | 'axes.labelsize': 'x-large', 36 | 'axes.titlesize': 'x-large', 37 | 'axes.labelpad': 6.0, 38 | 'figure.titlesize': 'x-large', 39 | 'text.usetex': True, 40 | 'text.latex.unicode': True, 41 | 'figure.figsize': (7.2, 4.45), 42 | 'figure.dpi': 1200, 43 | 'savefig.dpi': 1200 44 | }) 45 | 46 | 47 | def get_dataset(approach): 48 | return {'drebin': 'drebin-parrot-v2-down', 49 | 'mamadroid': 'mamadroidPackages-parrot-v2-down'}[approach] 50 | 51 | 52 | def get_classifier(approach, balance=False): 53 | kwargs = {'class_weights': 'balanced'} if balance else {} 54 | if 
approach == 'drebin': 55 | return LinearSVC(**kwargs) 56 | if approach == 'mamadroid': 57 | return RandomForestClassifier(n_estimators=101, max_depth=64, 58 | n_jobs=-1, **kwargs) 59 | raise ValueError 60 | 61 | 62 | def load_features(feature_set): 63 | fname = '../../features/{}-features'.format(feature_set) 64 | logging.info('Loading features...') 65 | with open('{}-X.json'.format(fname), 'rt') as f: 66 | X = json.load(f) 67 | [o.pop('sha256') for o in X] 68 | 69 | with open('{}-Y.json'.format(fname), 'rt') as f: 70 | y = json.load(f) 71 | y = [o[0] for o in y] 72 | 73 | with open('{}-meta.json'.format(fname), 'rt') as f: 74 | t = json.load(f) 75 | t = [o['dex_date'] for o in t] 76 | t = [datetime.strptime(o, '%Y-%m-%dT%H:%M:%S') for o in t] 77 | 78 | return X, y, t 79 | 80 | 81 | def load_meta(feature_set): 82 | logging.info('Loading meta...') 83 | with open('../../features/{}-features-meta.json'.format(feature_set), 84 | 'rt') as f: 85 | return json.load(f) 86 | 87 | 88 | def enforce_ratios(X, y, t): 89 | train, tests = temporal.time_aware_indexes(t, 0, 1, 'month', '2014') 90 | assert len(tests) == 36 91 | 92 | downsampled = None 93 | print('{:^6} {:^6} {:^6} {:^6}'.format('MW', 'GW', 'TOT', '%MW')) 94 | 95 | for period_idxs in tests: 96 | period_idxs = np.array(period_idxs) 97 | y_period = y[period_idxs] 98 | 99 | # IF DOWNSAMPLING 100 | selected_idxs = spatial.downsample_to_rate(y_period) 101 | selected = period_idxs[selected_idxs] 102 | 103 | # ELSE 104 | # selected = period_idxs 105 | 106 | labels = y[selected] 107 | tot = len(labels) 108 | p = sum(labels) 109 | n = tot - sum(labels) 110 | print('{:>6} {:>6} {:>6} {:>6.1f}%'.format(p, n, tot, 100 * p / tot)) 111 | 112 | if downsampled is None: 113 | downsampled = selected 114 | else: 115 | downsampled = np.hstack((downsampled, selected)) 116 | 117 | labels = y[downsampled] 118 | tot = len(labels) 119 | p = sum(labels) 120 | n = tot - sum(labels) 121 | print('Overall') 122 | print('{:>6} {:>6} {:>6} {:>6.1f}%'.format(p, n, tot, 100 * p / tot)) 123 | 124 | return downsampled 125 | 126 | 127 | def vectorize(X, y, t): 128 | """Transform input data into appropriate forms for an sklearn classifier. 129 | 130 | Args: 131 | X (list): A list of dictionaries of input features for each sample. 132 | y (list): A list of ground truths for the data. 133 | t (list): A list of datetimes for the data. 
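A minimal sketch, assuming the JSON feature files expected by load_features() are available locally (the dataset name is illustrative):

    X, y, t = load_features('drebin-parrot-v2-down')
    X, y, t = vectorize(X, y, t)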
134 | 135 | """ 136 | logging.info('Vectorizing features...') 137 | vec = DictVectorizer() 138 | X = vec.fit_transform(X) 139 | y = np.asarray(y) 140 | t = np.asarray(t) 141 | return X, y, t 142 | 143 | 144 | def style_axes(axes, periods=10): 145 | for i, ax in enumerate(axes): 146 | # Labels 147 | ax.set_xlabel('Testing period (month)') # , fontsize=ax_label_size) 148 | # ax.set_ylabel('Score') # , fontsize=ax_label_size) 149 | ax.set_ylabel('') 150 | 151 | # Ticks 152 | ax.set_xticks(range(1, periods + 1)) 153 | ax.set_yticks(np.arange(0, 1.1, 0.1)) 154 | 155 | labels = [str(x + 1) if x % 3 == 0 else '' for x in range(periods + 1)] 156 | ax.set_xticklabels(labels) 157 | 158 | ax.tick_params(axis='x', which='major') # , labelsize=x_tick_size) 159 | ax.tick_params(axis='y', which='major') # , labelsize=y_tick_size) 160 | 161 | ax.yaxis.grid(b=True, which='major', color='lightgrey', linestyle='-') 162 | 163 | # Axe limits 164 | ax.set_xlim(0, periods) 165 | ax.set_ylim(0, 1) 166 | 167 | sns.despine(ax=ax, top=True, right=True, bottom=False, left=False) 168 | 169 | 170 | def plot_f1(ax, data, alpha=1, neg=False, label=None, color='dodgerblue', 171 | marker='o'): 172 | if label is None: 173 | label = 'F1 (gw)' if neg else 'F1 (mw)' 174 | color = '#BCDEFE' if neg else color 175 | series = data['f1_n'] if neg else data['f1'] 176 | ax.plot(data.index, series, label=label, alpha=alpha, marker=marker, 177 | c=color, markeredgewidth=1, **line_kwargs) 178 | 179 | 180 | def plot_roc(ax, data, alpha=1, label=None, color='dodgerblue', 181 | marker='o'): 182 | if label is None: 183 | label = 'AUC ROC' 184 | series = data['auc_roc'] 185 | ax.plot(data.index, series, label=label, alpha=alpha, marker=marker, 186 | c=color, markeredgewidth=1, **line_kwargs) 187 | 188 | 189 | def plot_f1_col(ax, data, alpha=1, neg=False, label=None, color='dodgerblue', 190 | marker='o'): 191 | if label is None: 192 | label = 'F1 (gw)' if neg else 'F1 (mw)' 193 | series = data['f1_n'] if neg else data['f1'] 194 | ax.plot(data.index, series, label=label, alpha=alpha, marker=marker, 195 | c=color, markeredgewidth=1, **line_kwargs) 196 | 197 | 198 | def plot_recall(ax, data, alpha=1, neg=False, color='red', marker='^'): 199 | color = '#FDB2B3' if neg else color 200 | label = 'Recall (gw)' if neg else 'Recall (mw)' 201 | series = data['recall_n'] if neg else data['recall'] 202 | ax.plot(data.index, series, label=label, alpha=alpha, 203 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 204 | 205 | 206 | def plot_precision(ax, data, alpha=1, neg=False, color='orange', marker='s'): 207 | color = '#FEE2B5' if neg else color 208 | label = 'Precision (gw)' if neg else 'Precision (mw)' 209 | series = data['precision_n'] if neg else data['precision'] 210 | ax.plot(data.index, series, label=label, alpha=alpha, 211 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 212 | 213 | 214 | def fill_under_f1(ax, data, alpha=1, neg=False): 215 | label = 'F1 (gw)' if neg else 'F1 (mw)' 216 | series = data['f1_n'] if neg else data['f1'] 217 | ax.fill_between(data.index, series, 218 | label='AUT({}, 24 months)'.format(label), 219 | alpha=alpha, facecolor='none', hatch='//', 220 | edgecolor='#BCDEFE', rasterized=True) 221 | 222 | 223 | def plot_old_f1(ax, data, alpha=1, neg=False, label=None, 224 | color='#C0C0C0', marker=''): 225 | if label is None: 226 | label = 'F1 (gw)' if neg else 'F1 (mw)' 227 | series = data['f1_n'] if neg else data['f1'] 228 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 229 | 
marker=marker, c=color, markeredgewidth=1, linewidth=2) 230 | 231 | 232 | def plot_old_metric(ax, data, metric, alpha=1, neg=False, label=None, 233 | color='#C0C0C0', marker=''): 234 | if label is None: 235 | label = metric + ' (gw)' if neg else metric + ' (mw)' 236 | label = label.title() 237 | series = data[metric + '_n'] if neg else data[metric] 238 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 239 | marker=marker, c=color, markeredgewidth=1, linewidth=2) 240 | 241 | 242 | def plot_cv_mean(ax, data, alpha=1): 243 | ax.axhline(y=float(data), linestyle='--', linewidth=1, c='red', 244 | alpha=alpha, label='F1 (10-fold CV)') 245 | 246 | 247 | def plot_x_intercept(ax, data, label='', c='limegreen', alpha=1, linewidth=1): 248 | ax.axvline(x=float(data), linestyle='--', linewidth=linewidth, c=c, 249 | alpha=alpha, label=label) 250 | 251 | 252 | def plot_prf(ax, results, alpha=1, neg=False): 253 | plot_recall(ax, results, alpha, neg) 254 | plot_precision(ax, results, alpha, neg) 255 | plot_f1(ax, results, alpha, neg) 256 | 257 | 258 | def add_legend(ax, loc='lower left'): 259 | lines = ax.get_lines() 260 | legend = ax.legend(frameon=True, handles=lines, loc=loc, prop={'size': 10}) 261 | legend.get_frame().set_facecolor('#FFFFFF') 262 | legend.get_frame().set_linewidth(0) 263 | return legend 264 | 265 | 266 | def set_title_sc(ax, text): 267 | text = text.replace('%', '\\%') # Make TeX-safe 268 | ax.set_title('\\textsf{{\\textsc{{{}}}}}'.format(text)) 269 | 270 | 271 | def plotname(): 272 | return os.path.splitext(os.path.basename(main.__file__))[0] 273 | 274 | 275 | def save_images(plt, plot_name=None): 276 | plt.tight_layout() 277 | plot_name = plotname() if plot_name is None else plot_name 278 | plt.savefig('./images/png/{}.png'.format(plot_name)) 279 | plt.savefig('./images/pdf/{}.pdf'.format(plot_name)) 280 | plt.savefig('./images/eps/{}.eps'.format(plot_name)) 281 | 282 | 283 | def parse_args(): 284 | global force 285 | p = argparse.ArgumentParser() 286 | p.add_argument('-f', '--force', action='store_true', help='Rerun all data') 287 | args = p.parse_args() 288 | force = args.force 289 | return args 290 | 291 | 292 | parse_args() 293 | -------------------------------------------------------------------------------- /tesseract/viz.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import seaborn as sns 7 | from collections import defaultdict 8 | 9 | # TODO | Remove pandas dependency 10 | 11 | line_kwargs = {'linewidth': 1, 'markersize': 3} 12 | 13 | 14 | # x_tick_size = 12 15 | # y_tick_size = 14 16 | # ax_label_size = 18 17 | # fig_title_size = 20 18 | 19 | def plot_decay(results, fill=True, titles=None, means=None, reject=False): 20 | # ------------------------------------------ # 21 | # Plotting prologue # 22 | # ------------------------------------------ # 23 | 24 | results = [results] if isinstance(results, dict) else results 25 | titles = titles if titles else [''] * len(results) 26 | means = means if means else [None] * len(results) 27 | 28 | # FIXME | This is all a bit of a naff hack from before the redesign, 29 | # FIXME | when there was a dependency on Pandas, remove as soon as possible 30 | 31 | for i in range(len(results)): 32 | # del results[i]['auc_roc'] # Otherwise hampers the DataFrame conversion 33 | print(len(results[i]['f1'])) 34 | # results[i]['f1_b'], results[i]['f1_r'], results[i]['reject_total_perc'] = [], 
[], [] 35 | # for j in range(len(results[i]['transcend'])): 36 | # results[i]['f1_b'].append(results[i]['transcend'][j]['f1_b']) 37 | # results[i]['f1_r'].append(results[i]['transcend'][j]['f1_r']) 38 | # results[i]['reject_total_perc'].append(results[i]['transcend'][j]['reject_total_perc']) 39 | # del results[i]['transcend'] 40 | results[i] = pd.DataFrame(dict(results[i]), 41 | index=range(1, len(results[i]['f1']) + 1)) 42 | 43 | # End of naffness 44 | 45 | set_style() 46 | fig, axes = plt.subplots(1, len(results)) 47 | 48 | axes = axes if hasattr(axes, '__iter__') else (axes,) 49 | 50 | # ------------------------------------------ # 51 | # Subplots # 52 | # ------------------------------------------ # 53 | 54 | for res, ax, title, mean in zip(results, axes, titles, means): 55 | # plot_prf(ax, res, 0.3, neg=True) 56 | plot_prf(ax, res) 57 | if mean is not None: 58 | plot_cv_mean(ax, mean) 59 | if fill: 60 | fill_under_f1(ax, res) 61 | if reject: 62 | plot_baseline_f1(ax, res) 63 | plot_rej_f1(ax, res) 64 | plot_rejected(ax, res) 65 | ax.set_title(title) 66 | 67 | # Legend 68 | add_legend(axes[0]) 69 | 70 | # ------------------------------------------ # 71 | # Plotting epilogue # 72 | # ------------------------------------------ # 73 | 74 | style_axes(axes, len(results[0]['f1'])) 75 | fig.set_size_inches(6 * len(results), 4) 76 | plt.tight_layout() 77 | 78 | return plt 79 | 80 | 81 | def plot_decay1(results, fill=True, titles=None, means=None, reject=False): 82 | # ------------------------------------------ # 83 | # Plotting prologue # 84 | # ------------------------------------------ # 85 | 86 | results = [results] if isinstance(results, dict) else results 87 | titles = titles if titles else [''] * len(results) 88 | means = means if means else [None] * len(results) 89 | 90 | # FIXME | This is all a bit of a naff hack from before the redesign, 91 | # FIXME | when there was a dependency on Pandas, remove as soon as possible 92 | data = defaultdict(lambda: []) 93 | for result in results: 94 | for i in result: 95 | data[i].append(result[i]) 96 | results = [pd.DataFrame(dict(data), index=range(1, len(data['f1_b']) + 1))] 97 | 98 | # End of naffness 99 | 100 | set_style() 101 | fig, axes = plt.subplots(1, len(results)) 102 | 103 | axes = axes if hasattr(axes, '__iter__') else (axes,) 104 | 105 | # ------------------------------------------ # 106 | # Subplots # 107 | # ------------------------------------------ # 108 | 109 | for res, ax, title, mean in zip(results, axes, titles, means): 110 | # plot_prf(ax, res, 0.3, neg=True) 111 | plot_prf(ax, res) 112 | if mean is not None: 113 | plot_cv_mean(ax, mean) 114 | if fill: 115 | fill_under_f1(ax, res) 116 | if reject: 117 | plot_baseline_f1(ax, res) 118 | plot_rej_f1(ax, res) 119 | plot_rejected(ax, res) 120 | ax.set_title(title) 121 | 122 | # Legend 123 | add_legend(axes[0]) 124 | 125 | # ------------------------------------------ # 126 | # Plotting epilogue # 127 | # ------------------------------------------ # 128 | 129 | style_axes(axes, len(results[0]['f1_b'])) 130 | fig.set_size_inches(6 * len(results), 4) 131 | plt.tight_layout() 132 | 133 | return plt 134 | 135 | 136 | def set_style(): 137 | sns.set_context('paper') 138 | sns.set(font='serif') 139 | 140 | sns.set('paper', font='serif', style='ticks', rc={ 141 | 'font.family': 'serif', 142 | 'legend.fontsize': 'medium', 143 | 'xtick.labelsize': 'medium', 144 | 'ytick.labelsize': 'medium', 145 | 'axes.labelsize': 'x-large', 146 | 'axes.titlesize': 'x-large', 147 | 'axes.labelpad': 6.0, 148 | 
'figure.titlesize': 'x-large', 149 | 'text.usetex': True, 150 | 'figure.figsize': (3.6, 4.45), 151 | 'figure.dpi': 1200, 152 | 'savefig.dpi': 1200 153 | }) 154 | 155 | 156 | def style_axes(axes, periods, granularity='Month'): 157 | for i, ax in enumerate(axes): 158 | # Labels 159 | ax.set_xlabel(f'Testing period ({granularity})') # , fontsize=ax_label_size) 160 | # ax.set_ylabel('Score') # , fontsize=ax_label_size) 161 | ax.set_ylabel('') 162 | 163 | # Ticks 164 | ax.set_xticks(range(1, periods + 1)) 165 | ax.set_yticks(np.arange(0, 1.1, 0.1)) 166 | 167 | if periods > 12: 168 | labels = [str(x + 1) if x % 3 == 0 169 | else '' for x in range(periods)] 170 | else: 171 | labels = [str(x + 1) for x in range(periods)] 172 | 173 | ax.set_xticklabels(labels) 174 | 175 | ax.tick_params(axis='x', which='major') # , labelsize=x_tick_size) 176 | ax.tick_params(axis='y', which='major') # , labelsize=y_tick_size) 177 | 178 | ax.yaxis.grid(visible=True, which='major', color='lightgrey', linestyle='-') 179 | 180 | # Axe limits 181 | ax.set_xlim(0.8, periods) 182 | ax.set_ylim(0, 1) 183 | 184 | sns.despine(ax=ax, top=True, right=True, bottom=False, left=False) 185 | 186 | 187 | def plot_baseline_f1(ax, data, alpha=1.0, color='gray', linestyle='--'): 188 | label = 'F1 (no rejection)' 189 | series = data['f1_b'] 190 | ax.plot(data.index + 1, series, label=label, alpha=alpha, linestyle=linestyle, 191 | c=color, markeredgewidth=1, **line_kwargs) 192 | 193 | 194 | def plot_rej_f1(ax, data, alpha=1.0, color='red', marker='o'): 195 | label = 'F1 (rejection)' 196 | series = data['f1_r'] 197 | ax.plot(data.index + 1, series, label=label, alpha=alpha, marker=marker, 198 | c=color, markeredgewidth=1, **line_kwargs) 199 | 200 | 201 | def plot_rejected(ax, data, alpha=0.6, color='#C0C0C0'): 202 | series = data['reject_total_perc'] 203 | ax.bar(data.index + 1, series, width=0.7, color=color, alpha=alpha) 204 | 205 | 206 | def plot_f1(ax, data, alpha=1.0, neg=False, label=None, color='dodgerblue', 207 | marker='o'): 208 | if label is None: 209 | label = 'F1 (gw)' if neg else 'F1 (mw)' 210 | 211 | if neg: 212 | if color=='dodgerblue': 213 | color = '#BCDEFE' 214 | 215 | series = data['f1_n'] if neg else data['f1'] 216 | ax.plot(data.index + 1, series, label=label, alpha=alpha, marker=marker, 217 | c=color, markeredgewidth=1, **line_kwargs) 218 | 219 | 220 | def plot_recall(ax, data, alpha=1.0, neg=False, color='red', marker='^'): 221 | label = 'Recall (gw)' if neg else 'Recall (mw)' 222 | color = '#FDB2B3' if neg else color 223 | series = data['recall_n'] if neg else data['recall'] 224 | ax.plot(data.index + 1, series, label=label, alpha=alpha, 225 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 226 | 227 | 228 | def plot_precision(ax, data, alpha=1.0, neg=False, color='orange', marker='s'): 229 | label = 'Precision (gw)' if neg else 'Precision (mw)' 230 | color = '#FEE2B5' if neg else color 231 | series = data['precision_n'] if neg else data['precision'] 232 | ax.plot(data.index + 1, series, label=label, alpha=alpha, 233 | marker=marker, c=color, markeredgewidth=1, **line_kwargs) 234 | 235 | 236 | def fill_under_f1(ax, data, alpha=1, neg=False): 237 | label = 'F1 (gw)' if neg else 'F1 (mw)' 238 | series = data['f1_n'] if neg else data['f1'] 239 | ax.fill_between(data.index + 1, series, 240 | label='AUT({}, 24 months)'.format(label), 241 | alpha=alpha, facecolor='none', hatch='//', 242 | edgecolor='#BCDEFE', rasterized=True) 243 | 244 | 245 | def plot_cv_mean(ax, data, alpha=1): 246 | 
ax.axhline(y=float(data), linestyle='--', linewidth=1, c='red', 247 | alpha=alpha, label='F1 (10-fold CV)') 248 | 249 | 250 | def plot_origin(ax, data, alpha=1): 251 | ax.axhline(y=float(data), linestyle='-.', linewidth=1, c='black', 252 | alpha=alpha, label='F1 (original paper)') 253 | 254 | 255 | def plot_prf(ax, results, alpha=1.0, neg=False): 256 | plot_f1(ax, results, alpha, neg) 257 | plot_recall(ax, results, alpha, neg) 258 | plot_precision(ax, results, alpha, neg) 259 | 260 | 261 | 262 | def add_legend(ax, loc='lower left'): 263 | lines = ax.get_lines() 264 | legend = ax.legend(frameon=True, handles=lines, loc=loc, prop={'size': 8}, # Reduced font size 265 | borderpad=0.5, # Padding inside the legend box 266 | labelspacing=0.5, # Vertical spacing between legend items 267 | handlelength=1, # Length of the legend handles 268 | handletextpad=0.5) # Spacing between handle and text 269 | legend.get_frame().set_facecolor('#FFFFFF') 270 | legend.get_frame().set_linewidth(0) 271 | return legend 272 | 273 | 274 | def save_images(plt, path, plot_name): 275 | plt.tight_layout() 276 | plt.savefig(os.path.join(path, './png/{}.png'.format(plot_name))) 277 | plt.savefig(os.path.join(path, './pdf/{}.pdf'.format(plot_name))) 278 | plt.savefig(os.path.join(path, './eps/{}.eps'.format(plot_name))) 279 | 280 | 281 | def plot_old_f1(ax, data, alpha=1, neg=False, label=None, 282 | color='#C0C0C0', marker=''): 283 | if label is None: 284 | label = 'F1 (gw)' if neg else 'F1 (mw)' 285 | series = data['f1_n'] if neg else data['f1'] 286 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 287 | marker=marker, c=color, markeredgewidth=1, linewidth=2) 288 | 289 | 290 | def plot_old_metric(ax, data, metric, alpha=1, neg=False, label=None, 291 | color='#C0C0C0', marker=''): 292 | if label is None: 293 | label = metric + ' (gw)' if neg else metric + ' (mw)' 294 | label = label.title() 295 | series = data[metric + '_n'] if neg else data[metric] 296 | ax.plot(data.index, series, label=label, alpha=alpha, linestyle='--', 297 | marker=marker, c=color, markeredgewidth=1, linewidth=2) 298 | 299 | 300 | def set_title_sc(ax, text): 301 | text = text.replace('%', '\\%') # Make TeX-safe 302 | ax.set_title('\\textsf{{\\textsc{{{}}}}}'.format(text)) 303 | 304 | 305 | def main(): 306 | import pickle as pkl 307 | 308 | results = pkl.load( 309 | open('/Users/mark/Documents/Git/transcend-release/timeseries_cred_conf/ice_p_val_results.p', 'rb')) 310 | plot = plot_decay1(results, reject=True, titles=['ICE default']) 311 | plot.savefig("/Users/mark/Desktop/Tesseract-journal/ICE.pdf") 312 | 313 | 314 | if __name__ == '__main__': 315 | main() 316 | -------------------------------------------------------------------------------- /tesseract/spatial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | spatial.py 5 | ~~~~~~~~~~ 6 | 7 | A module for working with the class balance of a dataset. Ensuring the class 8 | distribution of the testing data is similar to what will be encountered in a 9 | real deployment is imperative to sound evaluations -- particularly in the 10 | security domains. 11 | 12 | Unlike the testing set, the training set is entirely under the operator's 13 | control and class balance can be manipulated in order to over or underrepresent 14 | the positive class in order to achieve greater recall at the expense of 15 | precision (or vice-versa) during the operational phase. 
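A minimal sketch of checking constraint C3 on a set of labels and enforcing a target positive rate if it is violated (the rates are illustrative):

    from tesseract import mock, spatial

    X, y, t = mock.generate_binary_test_data(10000, '2014')
    if not spatial.assert_class_distribution(y, positive_rate=0.1, variance=0.02):
        X, y, t = spatial.downsample_set(
            X, y, t, min_pos_rate=0.08, max_pos_rate=0.12)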
16 | 17 | """ 18 | import copy 19 | import random 20 | 21 | import numpy as np 22 | 23 | import tesseract.metrics as metrics 24 | import tesseract.utils as utils 25 | 26 | 27 | def assert_class_distribution(y, positive_rate, variance): 28 | """Helper function to verify the rate of the positive class across y (C3). 29 | 30 | The testing distribution must reflect the real-world class balance observed 31 | in real-life, otherwise results can be highly inflated (or deflated) with 32 | respect to realistic performance. This function will verify that this 33 | constraint is being respected. 34 | 35 | Args: 36 | y: An array of output class labels y 37 | positive_rate: The acceptable rate for the positive class. 38 | variance: The acceptable deviation (+/-) for the positive rate. 39 | 40 | Returns: 41 | True if the rate of the positive class is acceptable. 42 | 43 | """ 44 | current = np.sum(y) / len(y) 45 | diff = np.abs(current - positive_rate) 46 | return diff <= variance 47 | 48 | 49 | def search_optimal_train_ratio(clf, X_train, y_train, t_train, 50 | proper_train_size, validation_size, granularity, 51 | start_tr_rate=None, end_tr_rate=0.6, step=0.05, 52 | test_noise=0.00, metric='f1'): 53 | """Find the optimal training ratio in order to maximise the given metric. 54 | 55 | This function performs a grid search between start_tr_rate and end_tr_rate, 56 | aiming to maximise the value of the given metric (f1|precision|recall), 57 | while reporting the error rates accumulated at each stage of the algorithm. 58 | 59 | In order to try and pick a training ratio that will be robust to 60 | fluctuations in the testing distribution, it's possible to specify a value 61 | for 'test_noise'. The average-best training ratio across a range of values 62 | between the tr_rates +/- noise will be reported at each stage of the 63 | algorithm. 64 | 65 | This function will be performed by taking an 'actual' training set and 66 | dividing it into a 'proper' training and a 'validation' set. For example, 67 | 12 months of data might be split into 8 months and 4 months. The 4 months 68 | validation aim to simulate the distribution of objects expected after the 69 | known 12 months so that the chosen training ratio will still be effective. 70 | 71 | Note that validation size refers to a single testing period, so to use 4 72 | months in the above example, a value of 1 for validation_size and 'month' 73 | for granularity will divide the remaining objects after the initial 8 74 | selected for training into 1 month chunks to use for validation. 75 | 76 | Args: 77 | clf: The classifier to use during the search. 78 | X_train: The array of predictors to use. 79 | y_train: The array of output labels to use. 80 | t_train: The array of aligned datetimes for X (and therefore y). 81 | proper_train_size: The size of the set to train with. 82 | validation_size: The size of a _single_ validation period. 83 | granularity: The granularity of the testing period (year|month|week|day) 84 | start_tr_rate: The start train rate (typically the natural distribution). 85 | end_tr_rate: The end train date to test (typically 0.5). 86 | step: The learning rate of the grid search. 87 | test_noise: How much noise in the testing ratio to account for. 88 | metric: The metric to maximise (f1|precision|recall). 89 | 90 | Returns: 91 | A dictionary of scores and errors for each tested training ratio. 
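A minimal usage sketch, assuming X_train, y_train, t_train are the training side of a time-aware split (the classifier and window sizes are illustrative):

    from sklearn.svm import LinearSVC

    rates = search_optimal_train_ratio(
        LinearSVC(), X_train, y_train, t_train,
        proper_train_size=8, validation_size=1, granularity='month')
    best_phi = rates['phis'][np.argmax(rates['auts'])]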
92 | 93 | """ 94 | import tesseract.temporal as temporal 95 | import tesseract.evaluation as evaluation 96 | 97 | # Split again to get training and validation sets for finding K 98 | splits = temporal.time_aware_train_test_split( 99 | X_train, y_train, t_train, train_size=proper_train_size, 100 | test_size=validation_size, granularity=granularity) 101 | 102 | aut_list, error_list, fn_list, fp_list, total_list = [], [], [], [], [] 103 | 104 | natural_rate = np.mean([sum(y_val) / len(y_val) for y_val in splits[3]]) 105 | 106 | if start_tr_rate is None: 107 | # Start one step below the natural rate of malware 108 | start_tr_rate = max( 109 | (round(float(natural_rate) / step) * step) - step, 0) 110 | 111 | tr_proportions = np.arange(start_tr_rate, end_tr_rate + step, step) 112 | 113 | mid = np.round(natural_rate, 2) 114 | if test_noise == 0: 115 | te_proportions = (mid,) 116 | else: 117 | te_proportions = np.arange(mid - test_noise, mid + test_noise, 0.01) 118 | 119 | for m in tr_proportions: 120 | X_train_proper, _, \ 121 | y_train_proper, _, \ 122 | t_train_proper, _ = copy.deepcopy(splits) 123 | 124 | # Downsample training to match percentage of malware n 125 | train_idxs = downsample_to_rate(y_train_proper, m) 126 | 127 | X_train = X_train_proper[train_idxs] 128 | y_train = y_train_proper[train_idxs] 129 | t_train = t_train_proper[train_idxs] 130 | 131 | # Alter ratio of malware in testing periods 132 | errors, auts, total = [], [], [] 133 | fps, fns = [], [] 134 | for n in te_proportions: 135 | 136 | _, X_validations, \ 137 | _, y_validations, \ 138 | __, t_validations = copy.deepcopy(splits) 139 | 140 | for i, _ in enumerate(y_validations): 141 | val_idxs = downsample_to_rate(y_validations[i], n) 142 | X_validations[i] = X_validations[i][val_idxs] 143 | y_validations[i] = y_validations[i][val_idxs] 144 | t_validations[i] = t_validations[i][val_idxs] 145 | 146 | # Compute results 147 | results = evaluation.fit_predict_update(clf, X_train, X_validations, 148 | y_train, y_validations, 149 | t_train, t_validations) 150 | 151 | fps.append(np.sum(results['fp'])) 152 | fns.append(np.sum(results['fn'])) 153 | total.append(np.sum(results['p']) + np.sum(results['n'])) 154 | errors.append(metrics.error_rate(results, metric)) 155 | auts.append(metrics.aut(results, metric)) 156 | 157 | # print(m, np.mean(total), np.mean(errors), np.mean(auts)) 158 | error_list.append(np.mean(errors)) 159 | aut_list.append(np.mean(auts)) 160 | fp_list.append(np.mean(fps)) 161 | fn_list.append(np.mean(fns)) 162 | total_list.append(np.mean(total)) 163 | 164 | return { 165 | 'errors': error_list, 166 | 'auts': aut_list, 167 | 'phis': tr_proportions, 168 | 'fn': fn_list, 169 | 'fp': fp_list, 170 | 'total': total_list 171 | } 172 | 173 | 174 | def find_optimal_train_ratio(clf, X_train, y_train, t_train, 175 | proper_train_size, validation_size, granularity, 176 | start_tr_rate=None, end_tr_rate=0.6, step=0.05, 177 | test_noise=0.00, metric='f1', acceptable_errors=0): 178 | """Given an acceptable threshold for errors, find the optimal train ratio. 179 | 180 | NOTE: The output of the search function that this wraps has undergone quite 181 | a few tweaks in terms of input and output, and at least until the full 182 | release of the library, this implementation should be considered a 183 | prototype (mileage may vary!). 184 | 185 | Args; 186 | clf: The classifier to use during the search. 187 | X_train: The array of predictors to use. 188 | y_train: The array of output labels to use. 
189 | t_train: The array of aligned datetimes for X (and therefore y). 190 | proper_train_size: The size of the set to train with. 191 | validation_size: The size of a _single_ validation period. 192 | granularity: The granularity of the testing period (year|month|week|day) 193 | start_tr_rate: The start train rate (typically the natural distribution). 194 | end_tr_rate: The end train date to test (typically 0.5). 195 | step: The learning rate of the grid search. 196 | test_noise: How much noise in the testing ratio to account for. 197 | metric: The metric to maximise (f1|precision|recall). 198 | acceptable_errors: The threshold of acceptable errors. 199 | 200 | Returns: 201 | tuple: The optimal discovered ratio, it's AUT and error rate. 202 | 203 | """ 204 | rates = search_optimal_train_ratio( 205 | clf, X_train, y_train, t_train, proper_train_size, 206 | validation_size, granularity, start_tr_rate, end_tr_rate, 207 | step, test_noise, metric) 208 | 209 | phis, auts, errors = rates['phis'], rates['auts'], rates['errors'] 210 | 211 | for i in np.argsort(auts)[::-1]: 212 | if errors[i] <= acceptable_errors: 213 | return phis[i], auts[i], errors[i] 214 | 215 | print('Warning: No training rate found that allows acceptable error rate') 216 | return None 217 | 218 | 219 | def downsample_set(X, y, t, min_pos_rate, max_pos_rate=None, 220 | noise_deviation=0.0, fixed_size=False): 221 | """Enforce a class distribution by downsampling. 222 | 223 | Args: 224 | X: The array of predictors to use. 225 | y: The array of output labels to use. 226 | t: The array of aligned datetimes for X (and therefore y). 227 | min_pos_rate: The minimum proportion of the positive class acceptable. 228 | max_pos_rate: The maximum proportion of the positive class acceptable. 229 | noise_deviation: Addition of noise either side of the given proportions. 230 | fixed_size: Whether to fix the total size of X to the size of the 231 | minimum class. 232 | 233 | Returns: 234 | tuple: A resized X, y and t 235 | """ 236 | new_idxs = downsample_to_rate(y, min_pos_rate, max_pos_rate, 237 | noise_deviation, fixed_size) 238 | return X[new_idxs], y[new_idxs], t[new_idxs] 239 | 240 | 241 | def downsample_to_rate(y, min_pos_rate, max_pos_rate=None, 242 | noise_deviation=0.0, fixed_size=False): 243 | """Enforce a class distribution by downsampling. 244 | 245 | Args: 246 | y: The array of output labels to use. 247 | min_pos_rate: The minimum proportion of the positive class acceptable. 248 | max_pos_rate: The maximum proportion of the positive class acceptable. 249 | noise_deviation: Addition of noise either side of the given proportions. 250 | fixed_size: Whether to fix the total size of X to the size of the 251 | minimum class. 252 | 253 | Returns: 254 | An array of selected indexes. 
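For example, to enforce roughly a 10% positive rate within a single test period (the values are illustrative):

    idxs = downsample_to_rate(y_test, min_pos_rate=0.1)
    X_test, y_test, t_test = X_test[idxs], y_test[idxs], t_test[idxs]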
255 | 256 | """ 257 | # random.seed(33) 258 | if max_pos_rate is None: 259 | max_pos_rate = min_pos_rate 260 | 261 | min_pos_rate = utils.resolve_percentage(min_pos_rate) 262 | max_pos_rate = utils.resolve_percentage(max_pos_rate) 263 | 264 | if not (0 <= min_pos_rate <= 1 or 0 <= max_pos_rate <= 1): 265 | raise ValueError( 266 | 'Please supply a proportion in the interval [0, 1]') 267 | 268 | n_pos, n_neg = np.sum(y), np.sum(y == 0) 269 | 270 | # Fix the training set size while downsampling to minority class size 271 | if fixed_size: 272 | n_tot = min(n_pos, n_neg) 273 | else: 274 | n_tot = n_pos + n_neg 275 | 276 | current_pos_perc = float(n_pos) / float(n_tot) 277 | 278 | if current_pos_perc < min_pos_rate: 279 | pos_perc = min_pos_rate 280 | elif current_pos_perc > max_pos_rate: 281 | pos_perc = max_pos_rate 282 | else: # min_pos <= current_pos_perc <= max_pos: 283 | neg_indexes = np.where(y == 0)[0] 284 | pos_indexes = np.where(y == 1)[0] 285 | return np.hstack((neg_indexes, pos_indexes)) 286 | 287 | pos_perc += np.random.normal(0, noise_deviation) 288 | 289 | # print("Starting downsampling {:.1f}% malware function: n_gw = {:,} ; n_mw = {:,} ; n_tot = {:,}".format(perc_mw*100, n_gw, n_mw, n_tot)) 290 | 291 | can_downsample_pos = True 292 | can_downsample_neg = True 293 | 294 | # First, try downsampling goodware 295 | if fixed_size: 296 | n_neg_to_choose = int((1 - pos_perc) * n_tot) 297 | else: 298 | n_neg_to_choose = int( 299 | (float(1 - pos_perc) / float(pos_perc)) * n_pos) 300 | 301 | if n_neg_to_choose > n_neg: 302 | n_neg_to_choose = n_neg 303 | can_downsample_neg = False 304 | # print("Failed to downsample goodware, since: n_gw_to_pick ({}) > n_gw ({})".format(n_gw_to_pick, n_gw)) 305 | 306 | # updating the value n_tot after downsampling the goodware 307 | 308 | if fixed_size: 309 | n_pos_to_choose = int(pos_perc * n_tot) 310 | else: 311 | n_pos_to_choose = int( 312 | (float(pos_perc) / float(1 - pos_perc)) * n_neg) 313 | 314 | if n_pos_to_choose > n_pos: 315 | can_downsample_pos = False 316 | # print("Cannot oversample malware to {:.1f}% of {:,}!".format(perc_mw*100, n_tot)) 317 | 318 | # elif n_mw_to_pick < n_pos: 319 | # print("Downsampled malware to {:.1f}% (n_mw = {:,}, n_mw_to_pick = {:,})".format(perc_mw*100, n_mw, n_mw_to_pick)) 320 | 321 | # import IPython; IPython.embed(); exit() 322 | 323 | # print("After downsampling: n_gw = {:,} ; n_mw = {:,} ; n_tot = {:,}".format(n_gw_to_pick, n_mw_to_pick, n_gw_to_pick+n_mw_to_pick)) 324 | 325 | neg_indexes = np.where(y == 0)[0] 326 | pos_indexes = np.where(y == 1)[0] 327 | 328 | neg_idx_subsample, pos_idx_subsample = neg_indexes, pos_indexes 329 | 330 | # Downsample goodware 331 | if can_downsample_neg: 332 | neg_idx_subsample = random.sample(list(neg_indexes), 333 | n_neg_to_choose) 334 | 335 | if can_downsample_pos: 336 | pos_idx_subsample = random.sample(list(pos_indexes), 337 | n_pos_to_choose) 338 | 339 | if not (can_downsample_neg or can_downsample_pos): 340 | raise Exception("Downsampling failed") 341 | 342 | sampled = np.hstack((np.array(neg_idx_subsample), 343 | np.array(pos_idx_subsample))) 344 | 345 | return np.array(sampled, dtype=int) 346 | -------------------------------------------------------------------------------- /tesseract/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | metrics.py 5 | ~~~~~~~~~~ 6 | 7 | A set of measurement tools to aid users designing time-aware experiments. 
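A minimal sketch of the typical flow, assuming `results` was produced by evaluation.fit_predict_update (the metric names are illustrative):

    from tesseract import metrics

    metrics.print_metrics(results)
    print(metrics.aut(results, 'f1'))          # robustness over the full timeline
    print(metrics.error_rate(results, 'f1'))   # overall misclassification rate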
8 | 9 | """ 10 | from collections import defaultdict 11 | 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn import metrics as skmetrics 16 | from sklearn.linear_model import LinearRegression 17 | from sklearn.metrics import confusion_matrix 18 | from sklearn.preprocessing import LabelEncoder 19 | 20 | from tesseract import utils, temporal 21 | 22 | 23 | def t_slope(metric): 24 | """Compute the slope with respect to the given metric. 25 | 26 | Args: 27 | metric: The metric to operate with respect to. 28 | 29 | Returns: 30 | float: A measure of the trend in the given time range 31 | 32 | """ 33 | reg = LinearRegression() 34 | reg.fit(np.arange(1, len(metric)+1).reshape(-1, 1), np.array(metric).reshape(-1, 1)) 35 | 36 | return reg.coef_[0][0] 37 | 38 | 39 | def aut(results, metric=None, s_idn=None, e_ind=None): 40 | """Compute the AUT with respect to the given metric. 41 | 42 | Note that for results spanning a _single_ time period, AUT = 0 as this is 43 | not considered a time-aware evaluation. 44 | 45 | Args: 46 | results: The set of time-aware results to operate over. 47 | metric: The metric to operate with respect to granularity. 48 | s_idn: The index to start aut evaluation, default is beginning of results. 49 | e_ind: The index to end aut evaluation, default is end of results. 50 | Returns: 51 | float: A measure of robustness for the applied model over the time 52 | spanning the results. 53 | 54 | """ 55 | if isinstance(results, dict) or isinstance(results, pd.DataFrame): 56 | results = results[metric][s_idn:e_ind] 57 | 58 | if len(results) <= 1: 59 | return 0 60 | 61 | return np.trapz(results) / (len(results) - 1) 62 | 63 | 64 | def aut_with_observation_window(results, metric=None, window=None): 65 | """Compute the AUT with respect to the given metric broken down by a window size. 66 | 67 | Note that for results spanning a _single_ time period, AUT = 0 as this is 68 | not considered a time-aware evaluation. 69 | 70 | Args: 71 | results: The set of time-aware results to operate over. 72 | metric: The metric to operate with respect to granularity. 73 | window: The size of window to break aut down into. Eval period mod window must be 0 74 | Returns: 75 | list: A list of aut measures for the applied model over the time 76 | spanning the results, split into the window size. 77 | 78 | """ 79 | return [aut(results, metric, s_idn=w*window, e_ind=(1+w)*window) for w in range(len(results[metric])//window)] 80 | 81 | 82 | def aut_with_granularity(results, granularity, metric=None): 83 | """Compute the AUT with respect to the given metric. 84 | 85 | Note that for results spanning a _single_ time period, AUT = 0 as this is 86 | not considered a time-aware evaluation. 87 | 88 | Args: 89 | granularity: 90 | results: The set of time-aware results to operate over. 91 | metric: The metric to operate with respect to. 92 | 93 | Returns: 94 | float: A measure of robustness for the applied model over the time 95 | spanning the results. 
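A minimal sketch, assuming the results dictionary also carries the raw per-period outputs under 'y_tests', 'y_preds' and 't_tests' (which this function expects):

    # AUT(F1) recomputed at weekly granularity
    weekly = aut_with_granularity(results, granularity='week', metric='f1')

    # AUT(F1) over consecutive 6-period observation windows
    # (the number of test periods should be divisible by the window size)
    windows = aut_with_observation_window(results, metric='f1', window=6)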
96 | 97 | """ 98 | metric = {"f1": skmetrics.f1_score, 99 | "accuracy": skmetrics.accuracy_score, 100 | "precision": skmetrics.precision_score, 101 | "recall": skmetrics.recall_score, 102 | }[metric] 103 | 104 | if len(results["t_tests"]) <= 1: 105 | return 0 106 | 107 | y_tests = results["y_tests"] 108 | y_preds = results["y_preds"] 109 | t_tests = results["t_tests"] 110 | 111 | results = aut_granularity_split(y_tests, y_preds, t_tests, granularity, metric) 112 | 113 | return np.trapz(results) / (len(results) - 1) 114 | 115 | 116 | def aut_granularity_split(y_tests, y_preds, t_tests, granularity, metric, test_size=1): 117 | results = [] 118 | y_tests = np.concatenate(y_tests, axis=None) 119 | y_preds = np.concatenate(y_preds, axis=None) 120 | t_tests = np.concatenate(t_tests, axis=None) 121 | 122 | _, idxes = temporal.time_aware_indexes(t_tests, 0, test_size, granularity) 123 | for idx in idxes: 124 | y_test = y_tests[idx] 125 | y_pred = y_preds[idx] 126 | result = metric(y_test, y_pred) 127 | results.append(result) 128 | 129 | return results 130 | 131 | 132 | def error_rate(results, metric='f1'): 133 | """Return the error rate formulation as it relates to the given metric. 134 | 135 | Args: 136 | results: The set of time-aware evaluation results to operate over. 137 | metric: The metric to operate with respect to (f1|precision|recall). 138 | 139 | Returns: 140 | float: The rate representing error for the given metric. 141 | 142 | """ 143 | return { 144 | 'f1': errors(results) / (np.sum(results['p']) + np.sum(results['n'])), 145 | 'precision': np.sum(results['fn']) / ( 146 | np.sum(results['tp']) + np.sum(results['fn'])), 147 | 'recall': np.sum(results['fp']) / ( 148 | np.sum(results['tn']) + np.sum(results['fp'])), 149 | }[metric] 150 | 151 | 152 | def errors(results): 153 | """Return the total misclassifications in the results.""" 154 | return np.sum(results['fn']) + np.sum(results['fp']) 155 | 156 | 157 | def plot_alpha_assessment(alpha_assessment_results, outfile=None): 158 | fig = plt.figure() 159 | ax = fig.add_subplot(111) 160 | ax.boxplot((alpha_assessment_results['negative_predictions']['correct'], 161 | alpha_assessment_results['negative_predictions']['incorrect'], 162 | alpha_assessment_results['positive_predictions']['correct'], 163 | alpha_assessment_results['positive_predictions']['incorrect'])) 164 | ax.set_xticklabels(('Neg C', 'Neg IC', 'Pos C', 'Pos IC')) 165 | 166 | if outfile: 167 | plt.savefig(outfile) 168 | else: 169 | plt.show() 170 | 171 | return fig, ax 172 | 173 | 174 | # def plot_results(results, outfile=None, fields=None, title='Scores over time', 175 | # quiet=False): 176 | # if not quiet: 177 | # logging.info(results) 178 | # if outfile: 179 | # results.to_csv(os.path.splitext(outfile)[0] + '.csv') 180 | # 181 | # if fields is None: 182 | # fields = ['f1', 'precision', 'recall', 183 | # 'f1_n', 'precision_n', 'recall_n'] 184 | # 185 | # colors = ('#F2385A', '#F5A503', '#4AD9D9', 186 | # '#FF9999', '#FFDD99', '#AAEEEE') 187 | # ax = results[fields].plot(linestyle='--', marker='o', color=colors) 188 | # 189 | # plt.title(title) 190 | # ax.set_xlabel('Testing round') 191 | # ax.set_ylabel('Score') 192 | # ax.set_ylim([0, 1]) 193 | # ax.set_yticks(np.arange(0, 1.1, 0.1)) 194 | # ax.set_xticks(results.index) 195 | # ax.grid('on', which='major', linestyle=':', axis='y') 196 | # plt.tight_layout() 197 | # 198 | # if outfile: 199 | # plt.savefig(outfile) 200 | # else: 201 | # plt.show() 202 | # 203 | # return ax 204 | 205 | 206 | # def plot_by_time(y, t, 
granularity='month', type='line', outfile=None): 207 | # df = pd.DataFrame(y, columns=['positive'], index=t) 208 | # df['negative'] = [1 ^ x for x in df['positive']] 209 | # 210 | # try: 211 | # offset_alias = { 212 | # 'year': '1Y', 213 | # 'quarter': '1Q', 214 | # 'month': '1M', 215 | # 'week': '1W', 216 | # 'day': '1D' 217 | # }[granularity] 218 | # except KeyError: 219 | # # Allow a specific offset alias to be passed in 220 | # offset_alias = granularity 221 | # 222 | # df = df.resample(offset_alias).sum() 223 | # 224 | # colors = ('#cc0000', '#66b3ff') 225 | # plot_fn = df.plot.bar if type == 'bar' else df.plot 226 | # ax = plot_fn(color=colors, marker='o', linestyle='--') 227 | # 228 | # plt.title('Frequency of class membership by {}'.format(granularity)) 229 | # ax.set_xlabel('{}(s)'.format(granularity)) 230 | # ax.set_ylabel('Frequency') 231 | # ax.grid('on', which='major', linestyle=':', axis='y') 232 | # plt.tight_layout() 233 | # 234 | # if outfile: 235 | # plt.savefig(outfile) 236 | # else: 237 | # plt.show() 238 | # 239 | # return ax 240 | 241 | 242 | def summarize(y): 243 | positive = sum(y) 244 | negative = len(y) - positive 245 | print('Class counts:') 246 | print('-' * 20) 247 | print('negative: {}'.format(negative)) 248 | print('positive: {}'.format(positive)) 249 | print('\nTotal objects:') 250 | print('-' * 20) 251 | print('{} ({:.04}% positive)'.format(len(y), positive / len(y) * 100)) 252 | 253 | 254 | def get_train_info(X_train, y_train, t_train, existing=None): 255 | # Ensure results are a defaultdict(list) 256 | 257 | results = defaultdict(list, existing) if existing else defaultdict(list) 258 | 259 | # Ensure label array is a numpy array 260 | 261 | y_train = np.array(y_train) 262 | 263 | train_pos = np.sum(y_train) 264 | 265 | results['train_pos'].append(train_pos) 266 | results['train_neg'].append(len(y_train) - train_pos) 267 | results['train_tot'].append(len(y_train)) 268 | 269 | return results 270 | 271 | 272 | def calculate_metrics(y_true, y_pred, existing=None, 273 | raw_scores=None, periods=1): 274 | periods = len(y_pred) if periods == -1 else periods 275 | 276 | if periods > 1: 277 | for y_t, y_p in zip(y_true, y_pred): 278 | existing = calculate_metrics(y_t, y_p, existing, raw_scores) 279 | return existing 280 | 281 | # Ensure results are a defaultdict(list) 282 | 283 | results = defaultdict(list, existing) if existing else defaultdict(list) 284 | 285 | # Ensure both label vectors are Numpy arrays 286 | 287 | y_true = np.array(y_true) 288 | y_pred = np.array(y_pred) 289 | 290 | # Heuristic to check if input are raw scores 291 | 292 | y_raw = None 293 | if (raw_scores or 294 | (raw_scores is None and 295 | utils.check_for_raw_scores(y_pred))): 296 | y_raw = y_pred 297 | 298 | # Convert output scores or categorical labels to integer labels 299 | 300 | y_pred = utils.resolve_categorical(y_pred) 301 | y_true = utils.resolve_categorical(y_true) 302 | 303 | # Ensure labels are encoded as integer labels 304 | 305 | if isinstance(y_pred[0], str): 306 | if isinstance(y_true[0], str): 307 | try: 308 | y_pred = np.array(y_pred, dtype='int32') 309 | y_true = np.array(y_true, dtype='int32') 310 | except ValueError: 311 | enc = LabelEncoder().fit(y_true) 312 | y_true = enc.transform(y_true) 313 | y_pred = enc.transform(y_pred) 314 | else: 315 | try: 316 | y_pred = np.array(y_pred, dtype='int32') 317 | except ValueError: 318 | y_pred = LabelEncoder().fit_transform(y_pred) 319 | 320 | assert len(set(y_true)) <= 2 and len(set(y_pred)) <= 2 321 | 322 | # Update total 
positive and negative predictions 323 | 324 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=(0, 1)).ravel() 325 | p = tp + fn 326 | n = tn + fp 327 | 328 | results['tp'].append(tp) 329 | results['fp'].append(fp) 330 | results['tn'].append(tn) 331 | results['fn'].append(fn) 332 | 333 | results['p'].append(p) 334 | results['n'].append(n) 335 | results['tot'].append(p + n) 336 | 337 | # Update cumulative totals 338 | 339 | results['tp_cumu'].append(np.sum(results['tp'])) 340 | results['fp_cumu'].append(np.sum(results['fp'])) 341 | results['tn_cumu'].append(np.sum(results['tn'])) 342 | results['fn_cumu'].append(np.sum(results['fn'])) 343 | 344 | results['p_cumu'].append(np.sum(results['p'])) 345 | results['n_cumu'].append(np.sum(results['n'])) 346 | results['tot_cumu'].append(np.sum(results['tot'])) 347 | 348 | # Update true/false positive/negative rates 349 | 350 | if p == 0: 351 | results['tpr'].append(np.nan) 352 | results['fnr'].append(np.nan) 353 | else: 354 | results['tpr'].append(tp / p) 355 | results['fnr'].append(fn / p) 356 | 357 | if n == 0: 358 | results['fpr'].append(np.nan) 359 | results['tnr'].append(np.nan) 360 | else: 361 | results['fpr'].append(fp / n) 362 | results['tnr'].append(tn / n) 363 | 364 | # Calculate AUC-ROC if raw scores have been supplied 365 | 366 | if y_raw is not None: 367 | 368 | # Some classifiers output with a score/prob for each class, this 369 | # simply includes only the score/prob of the predicted class as 370 | # skmetrics.roc_auc_score expects both inputs to be the same shape 371 | if y_raw.shape != y_true.shape: 372 | y_scores = np.array([np.max(v) for v in y_raw]) 373 | else: 374 | y_scores = y_raw 375 | 376 | try: 377 | results['auc_roc'].append(skmetrics.roc_auc_score(y_true, y_scores)) 378 | except ValueError as e: 379 | print(e) 380 | results['auc_roc'].append(np.nan) 381 | 382 | # Calculate precision, recall and F1 wrt positive and negative classes 383 | 384 | results['precision'].append( 385 | skmetrics.precision_score(y_true, y_pred, pos_label=1)) 386 | results['recall'].append( 387 | skmetrics.recall_score(y_true, y_pred, pos_label=1)) 388 | results['f1'].append(skmetrics.f1_score(y_true, y_pred, pos_label=1)) 389 | 390 | results['precision_n'].append( 391 | skmetrics.precision_score(y_true, y_pred, pos_label=0)) 392 | results['recall_n'].append( 393 | skmetrics.recall_score(y_true, y_pred, pos_label=0)) 394 | results['f1_n'].append(skmetrics.f1_score(y_true, y_pred, pos_label=0)) 395 | 396 | return results 397 | 398 | 399 | def print_metrics(results, keys=None, header=True): 400 | if keys is None: 401 | keys = [ 402 | ('Actual pos', 'p'), 403 | ('Actual neg', 'n'), 404 | ('Total', 'tot'), 405 | ('hline', 'hline'), 406 | ('TPR', 'tpr'), 407 | ('FPR', 'fpr'), 408 | ('TNR', 'tnr'), 409 | ('FNR', 'fnr'), 410 | ('AUC ROC', 'auc_roc'), 411 | ('hline', 'hline'), 412 | ('Precision', 'precision'), 413 | ('Recall', 'recall'), 414 | ('F1', 'f1'), 415 | ('hline', 'hline')] 416 | else: 417 | if isinstance(keys[0], str): 418 | keys = [(k.title(), k) for k in keys] 419 | 420 | periods = max(len(v) for v in results.values()) 421 | 422 | def print_hline(): 423 | print(('-' * 12) + '+' + ('-' * 7 * periods)) 424 | 425 | if header: 426 | header = '{:12}| '.format('Test period') 427 | header += ''.join(['{:^7}'.format(i) for i in range(1, periods + 1)]) 428 | print_hline() 429 | print(header) 430 | print_hline() 431 | 432 | for label, key in keys: 433 | if label == 'hline': 434 | print_hline() 435 | 436 | elif results[key]: 437 | row = 
'{:12}|'.format(label) 438 | for result in results[key]: 439 | if isinstance(result, float): 440 | row += '{:>7.3f}'.format(result) 441 | else: 442 | row += '{:>7}'.format(result) 443 | print(row) 444 | 445 | else: 446 | pass # Silently skip missing keys 447 | 448 | # def cumulative(results, metric): 449 | # if metric not in ('f1', 'precision', 'recall', 450 | # 'f1_n', 'precision_n', 'recall_n'): 451 | # return np.cumsum(results[metric]) 452 | # 453 | # tps = np.cumsum(results['tp']) 454 | # tns = np.cumsum(results['tn']) 455 | # fps = np.cumsum(results['fp']) 456 | # fns = np.cumsum(results['fn']) 457 | # 458 | # precision = tps / (tps + fps) 459 | # recall = tps / (tps + fns) 460 | # f1 = 2 * precision * recall / (precision + recall) 461 | # 462 | # if metric == 'f1': 463 | # return f1 464 | # if metric == 'precision': 465 | # return precision 466 | # if metric == 'recall': 467 | # return recall 468 | # 469 | # precision_n = tns / (tns + fns) 470 | # recall_n = tns / (tns + fps) 471 | # f1_n = 2 * precision_n * recall_n / (precision_n + recall_n) 472 | # 473 | # if metric == 'f1_n': 474 | # return f1_n 475 | # if metric == 'precision_n': 476 | # return precision_n 477 | # if metric == 'recall_n': 478 | # return recall_n 479 | -------------------------------------------------------------------------------- /tesseract/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | evaluation.py 5 | ~~~~~~~~~~~~~ 6 | 7 | 8 | 9 | """ 10 | import multiprocessing as mp 11 | 12 | import numpy as np 13 | import scipy.sparse 14 | from sklearn.model_selection import train_test_split 15 | from tqdm import tqdm 16 | 17 | from tesseract import utils as utils, metrics as metrics, temporal 18 | from tesseract.transcendent import * 19 | 20 | 21 | class Stage: 22 | """Parent class representing stage of the time-aware evaluation cycle. 23 | 24 | The time-aware evaluation cycle is divided into stages, offering the 25 | ability for the system designer to interact with the classification 26 | process. The stages can generally be thought of as the following: 27 | 28 | * Rebalancing: Alterations can be made to the training set composition. 29 | * Training: The classifier is fit to the training data. 30 | * Prediction: Labels are predicted by the classifier. 31 | * Rejection: Low-quality predictions can be discarded/quarantined. 32 | * Selection: Test objects can be selected and added to the training. 33 | 34 | The rebalancing, prediction and selection stages can all be implemented by 35 | subclassing Stage or its children. 36 | 37 | Subclasses of Stage can be coupled together with Stages of the same type, 38 | for example, tesseract.evaluation.fit_predict_update accepts lists of 39 | Rejectors which will be activated in order during the rejection 'stage' of 40 | the evaluation cycle. To determine whether a Stage is activated during that 41 | cycle, it contains a schedule. 42 | 43 | A schedule is simply a list of booleans, the length of the total periods 44 | expected during that cycle; the Stage is active if the index of the 45 | schedule for that period is True. Some special values exist which will be 46 | resolved to valid schedules: 47 | 48 | * 'first': Activate on the first cycle only. 49 | * 'last': Activate on the last cycle only. 50 | * 1: Activate every cycle. 51 | * 0: Never activate. 
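
    For example (a minimal sketch using the base class directly; in practice
    the schedule is resolved automatically by fit_predict_update):

        >>> stage = Stage(schedule='first')
        >>> stage.resolve_schedule(3)
        >>> stage.schedule
        [True, False, False]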
52 | 53 | These settings don't require the total number of test periods to be known 54 | in advance, the schedule will be resolved once fit_predict_update has been 55 | called, by checking the X_tests parameter. 56 | 57 | Attributes: 58 | schedule (list): A list of booleans indicating when the Stage should be 59 | active during the evaluation cycle. 60 | 61 | """ 62 | 63 | def __init__(self, schedule=1): 64 | self.schedule = schedule 65 | 66 | def resolve_schedule(self, total_periods): 67 | """Produces a valid schedule for the total periods specified. 68 | 69 | A schedule is a list of booleans, the length of the total periods 70 | expected during that cycle; the Stage is active if the index of the 71 | schedule for that period is True. 72 | 73 | Some special values exist which will be resolved to valid schedules: 74 | 75 | * 'first': Activate on the first cycle only. 76 | * 'last': Activate on the last cycle only. 77 | * 1: Activate every cycle. 78 | * 0: Never activate. 79 | 80 | """ 81 | if self.schedule == 'first': 82 | self.schedule = [True] + [False] * (total_periods - 1) 83 | elif self.schedule == 'last': 84 | self.schedule = [False] * (total_periods - 1) + [True] 85 | elif self.schedule in (1, '1'): 86 | self.schedule = [True] * total_periods 87 | elif self.schedule in (0, '0'): 88 | self.schedule = [False] * total_periods 89 | elif hasattr(self.schedule, '__iter__'): 90 | self.schedule = [int(x) == 0 for x in self.schedule] 91 | else: 92 | raise ValueError('Schedule `{}` cannot be understood.'.format( 93 | self.schedule)) 94 | 95 | 96 | class TrackingStage(Stage): 97 | """ 98 | 99 | """ 100 | 101 | def __init__(self, schedule=1, tracking=True, interaction='intersection'): 102 | super().__init__(schedule=schedule) 103 | 104 | self._interactions = ('intersection', 'union', 'sym_diff', 'ignore') 105 | 106 | self.tracking = tracking 107 | self.interaction = interaction 108 | 109 | if interaction not in self._interactions: 110 | raise ValueError('Interaction mode must be one of {}'.format( 111 | self._interactions)) 112 | 113 | def merge_results(self, past, present): 114 | # Case for first test period in a cycle 115 | # (distinct from when past is an empty array) 116 | if past is None: 117 | return present 118 | 119 | if self.interaction == 'union': 120 | return np.union1d(past, present) 121 | elif self.interaction == 'intersection': 122 | return np.intersect1d(past, present) 123 | elif self.interaction == 'sym_diff': 124 | return np.setxor1d(past, present) 125 | 126 | 127 | def fit_predict_update(clf, X_train, X_tests, 128 | y_train, y_tests, t_train, t_tests, 129 | fit_function=None, predict_function=None, 130 | rebalancers=(), rejectors=(), selectors=()): 131 | """Sliding window classification of a timestamp partitioned dataset. 132 | 133 | This function assumes that the dataset has been partitioned into 134 | historically coherent training and testing sets such that all objects in 135 | the training set are historically anterior to all objects in the testing 136 | sets, and in each testing set i, all objects in the set are historically 137 | anterior to all objects in testing set i + 1. 138 | 139 | The set of testing objects X_tests is split into a series of rolling 140 | testing windows (as are the corresponding y_tests). Each round of 141 | prediction is performed on the next test partition in the series. 
142 | 143 | This arrangement is depicted here with the parameters: 144 | 145 | * Training dataset size: 6 months 146 | * Testing dataset size: 2 months 147 | * Date range of the dataset: 12 months (Jan - Dec) 148 | 149 | Months tagged ■ are included in the training dataset. 150 | Months tagged □ are included in the testing dataset. 151 | Months tagged ▣ are included in training dataset but the results from the 152 | previous round of testing are concatenated to the latest results. 153 | 154 | Rolling testing 155 | --------------- 156 | 157 | Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 158 | 1 ■ ■ ■ ■ ■ ■ □ □ 159 | 2 ■ ■ ■ ■ ■ ■ □ □ 160 | 3 ■ ■ ■ ■ ■ ■ □ □ 161 | 162 | Example: 163 | >>> from sklearn.svm import LinearSVC 164 | >>> from tesseract import mock, temporal, evaluation 165 | >>> 166 | >>> X, y, t = mock.generate_binary_test_data(10000, '2000') 167 | >>> 168 | >>> splits = temporal.time_aware_train_test_split( 169 | >>> X, y, t, train_size=6, test_size=2, granularity='month') 170 | >>> 171 | >>> clf = LinearSVC() 172 | >>> 173 | >>> results = evaluation.fit_predict_update(clf, *splits) 174 | 175 | For comparison, here's the same set of parameters combined with 176 | a FullRetrainingSelector to achieve incremental retraining at each 177 | testing period: 178 | 179 | Rolling testing, incremental retraining 180 | --------------------------------------- 181 | 182 | Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 183 | 1 ■ ■ ■ ■ ■ ■ □ □ 184 | 2 ■ ■ ■ ■ ■ ■ ■ ■ □ □ 185 | 3 ■ ■ ■ ■ ■ ■ ■ ■ ■ ■ □ □ 186 | 187 | Example: 188 | >>> from tesseract.selection import FullRetrainingSelector 189 | >>> 190 | >>> results = evaluation.fit_predict_update( 191 | >>> clf, *splits, selectors=[FullRetrainingSelector()]) 192 | selectors=[ActiveLearningSelector()] 193 | 194 | The time-aware evaluation cycle is divided into stages, offering the 195 | ability for the system designer to interact with the classification 196 | process. The stages can generally be thought of as the following: 197 | 198 | * Rebalancing: Alterations can be made to the training set composition. 199 | * Training: The classifier is fit to the training data. 200 | * Prediction: Labels are predicted by the classifier. 201 | * Rejection: Low-quality predictions can be discarded/quarantined. 202 | * Selection: Test objects can be selected and added to the training. 203 | 204 | This cycle repeats for each testing period. The rebalancing, prediction 205 | and selection stages are each triggered by passing in lists of Rebalancer, 206 | Rejector or Selector objects respectively. These are then invoked 207 | (in order) at the appropriate stages in the training phase. Stages can be 208 | switched on and off for certain testing periods by passing them a 209 | schedule and the way they interact with previous stages of the same type 210 | can also be controlled. 211 | 212 | Fitting will use the fit() method of the classifier while prediction will 213 | try to resolve the most appropriate one for the classifier (either to 214 | produce output labels or raw scores). This behaviour can be overridden by 215 | passing a function to fit_function or predict_function. 
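
    For example (a hedged sketch, assuming a classifier that implements
    predict_proba, such as RandomForestClassifier; the 0.7 threshold is purely
    illustrative), a custom prediction function can apply a stricter decision
    threshold while still returning integer labels:

        >>> results = evaluation.fit_predict_update(
        >>>     clf, *splits,
        >>>     predict_function=lambda X: (
        >>>         clf.predict_proba(X)[:, 1] >= 0.7).astype(int))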
216 | 217 | The form of these functions must maintain the following contract: 218 | 219 | * fit_function(X_train, y_train) 220 | * y_pred = predict_function(X_test) 221 | 222 | Note, there are plans to improve the rudimentary predict-function-detection 223 | and to perhaps replace the fit_function and predict_function parameters 224 | with Fitter and Predictor objects which would allow for greater control. 225 | 226 | Args: 227 | clf: A scikit-learn or Keras classifier with fit and predict methods. 228 | X_train (np.ndarray): Training partition of predictors X. 229 | X_tests (list): List of testing partitions of predictors X. 230 | y_train (np.ndarray): Training partition of output variables y. 231 | y_tests (list): List of testing partitions of predictors y. 232 | t_train (np.ndarray): Training partition of datetimes for X. 233 | t_tests (list): List of testing partitions of datetimes for X. 234 | fit_function (function): The function to use to fit clf. 235 | predict_function (function): The function to predict with. 236 | rebalancers (list): A list of rebalancers to alter the training set. 237 | rejectors (list): A list of rejectors to reject poor predictions. 238 | selectors (list): A list of selectors to pick test items to train with. 239 | 240 | Returns: 241 | dict: Performance metrics for each round of predictions, including 242 | precision, recall, F1 score, AUC ROC, TPR, TNR, FPR, FNR, TP, FP, 243 | TN, FN, actual positive and actual negative counts. 244 | 245 | See Also: 246 | tesseract.temporal.time_aware_train_test_split 247 | tesseract.evaluation.Stage 248 | tesseract.selection.Selector 249 | tesseract.rejection.Rejector 250 | tesseract.rebalancing.Rebalancer 251 | 252 | """ 253 | fit_function = clf.fit if fit_function is None else fit_function 254 | predict_function = (utils.select_prediction_function(clf, labels_only=True) 255 | if predict_function is None else predict_function) 256 | 257 | for stage in tuple(rebalancers) + tuple(rejectors) + tuple(selectors): 258 | stage.resolve_schedule(len(X_tests)) 259 | 260 | results = {} 261 | selected_indexes = None 262 | for i, (X_test, y_test, t_test) in tqdm(enumerate( 263 | zip(X_tests, y_tests, t_tests))): 264 | 265 | # --------------------------------------------------------------- # 266 | # Make alterations to the dataset before testing (optional) # 267 | # --------------------------------------------------------------- # 268 | 269 | for rebalancer in rebalancers: 270 | if not rebalancer.schedule[i]: 271 | continue 272 | 273 | X_train, y_train, t_train = rebalancer.alter( 274 | clf, X_train, y_train, t_train, X_test, y_test, t_test) 275 | 276 | # --------------------------------------------------------------- # 277 | # (Re)fit and predict # 278 | # --------------------------------------------------------------- # 279 | 280 | results = metrics.get_train_info( 281 | X_train, y_train, t_train, existing=results) 282 | 283 | if selected_indexes is not None or i == 0: 284 | fit_function(X_train, y_train) 285 | 286 | y_pred = predict_function(X_test) 287 | 288 | # --------------------------------------------------------------- # 289 | # Discard/quarantine observations (optional) # 290 | # --------------------------------------------------------------- # 291 | 292 | kept_indexes, rejected_indexes = None, None 293 | for rejector in rejectors: 294 | if not rejector.schedule[i]: 295 | continue 296 | 297 | kept_indexes, rejected_indexes = rejector.reject_wrapper( 298 | clf, X_train, y_train, t_train, 299 | X_test, y_test, t_test, 300 | 
kept_indexes, rejected_indexes) 301 | 302 | # cause bug that X_test doesn't change 303 | if kept_indexes is not None: 304 | y_test = y_test[kept_indexes] 305 | y_pred = y_pred[kept_indexes] 306 | t_test = t_test[kept_indexes] 307 | 308 | results['rejected'].append(rejected_indexes.size) 309 | else: 310 | results['rejected'].append(0) 311 | 312 | # --------------------------------------------------------------- # 313 | # Calculate performance # 314 | # --------------------------------------------------------------- # 315 | 316 | results = metrics.calculate_metrics( 317 | y_test, y_pred, existing=results) 318 | 319 | # --------------------------------------------------------------- # 320 | # Select test observations for retraining (optional) # 321 | # --------------------------------------------------------------- # 322 | 323 | selected_indexes = None 324 | for selector in selectors: 325 | if not selector.schedule[i]: 326 | continue 327 | 328 | selected_indexes = selector.query_wrapper( 329 | clf, X_train, y_train, t_train, 330 | X_test, y_test, t_test, selected_indexes) 331 | 332 | if selected_indexes is not None: 333 | # Select observations for training using chosen indices 334 | X_selected = X_test[selected_indexes] 335 | y_selected = y_test[selected_indexes] 336 | t_selected = t_test[selected_indexes] 337 | 338 | # Update training model with N selected points 339 | X_train = scipy.sparse.vstack((X_train, X_selected)) 340 | y_train = np.hstack((y_train, y_selected)) 341 | t_train = np.hstack((t_train, t_selected)) 342 | 343 | results['selected'].append(selected_indexes.size) 344 | else: 345 | results['selected'].append(0) 346 | 347 | if 'y_preds' not in results: 348 | results['y_tests'] = [y_test] 349 | results['y_preds'] = [y_pred] 350 | results['t_tests'] = [t_test] 351 | else: 352 | results['y_tests'].append(y_test) 353 | results['y_preds'].append(y_pred) 354 | results['t_tests'].append(t_test) 355 | 356 | return results 357 | 358 | 359 | def predict(clf, X_tests, decision_threshold=None, 360 | labels_only=False, predict_function=None, nproc=1): 361 | """Standalone prediction of a set of test periods. 362 | 363 | Takes a set of historically aware test periods and performs prediction 364 | across them. This can be useful when there is no need for the interactive 365 | stages of a prediction as in that case the process can be performed in 366 | parallel. 367 | 368 | Example: 369 | >>> from sklearn.ensemble import RandomForestClassifier 370 | >>> from tesseract import mock, temporal, evaluation, metrics 371 | >>> 372 | >>> X, y, t = mock.generate_binary_test_data(10000, '2000') 373 | >>> 374 | >>> splits = temporal.time_aware_train_test_split( 375 | >>> X, y, t, train_size=6, test_size=2, granularity='month') 376 | >>> 377 | >>> X_train, X_tests, y_train, y_tests, t_train, t_tests = splits 378 | >>> 379 | >>> clf = RandomForestClassifier(n_estimators=101, max_depth=64) 380 | >>> clf.fit(X_train, y_train) 381 | >>> 382 | >>> y_preds = evaluation.predict(clf, X_tests, nproc=4) 383 | >>> results = metrics.calculate_metrics(y_tests, y_preds, periods=-1) 384 | >>> metrics.print_metrics(results) 385 | 386 | Args: 387 | clf: A scikit-learn or Keras classifier with fit and predict methods. 388 | X_tests (list): List of testing partitions of predictors X. 389 | decision_threshold (float): Calibrate prediction function by 390 | supplying a threshold over which scores are labelled positive. 391 | This is intended for classifiers that output probabilities only. 
392 | labels_only (bool): Prefer a labelling prediction function over one 393 | that outputs raw scores. 394 | predict_function (function): A custom function to predict with. 395 | nproc (int): The number of processors to use. 396 | 397 | Returns: 398 | list: A list of np.array objects containing the classification results 399 | for each test period in X_tests. 400 | 401 | """ 402 | predict_function = ( 403 | utils.select_prediction_function(clf, labels_only=labels_only) if 404 | predict_function is None else predict_function) 405 | 406 | # `nproc = -1` becomes `nproc = mp.cpu_count() + (- 1)`, etc 407 | nproc = mp.cpu_count() + nproc if nproc < 0 else nproc 408 | 409 | # Predictions have no dependencies in this context, we can parallelize them 410 | if nproc > 1: 411 | with mp.Pool(nproc) as p: 412 | y_preds = list(tqdm( 413 | p.imap(predict_function, X_tests), total=len(X_tests))) 414 | 415 | # Avoid invoking parallelism and associated overhead for a single CPU 416 | else: 417 | y_preds = [] 418 | for X_test in tqdm(X_tests): 419 | y_pred = predict_function(X_test) 420 | y_preds.append(y_pred) 421 | 422 | # TODO | Move to an "apply_decision_threshold" function to better test 423 | # TODO | and include the option in fit_predict_update (probas only). 424 | if decision_threshold: 425 | for i, y_pred in enumerate(y_preds): 426 | if y_pred.ndim > 1: 427 | y_scores = np.array([np.max(v) for v in y_pred]) 428 | else: 429 | y_scores = y_pred 430 | y_preds[i] = np.array(y_scores > decision_threshold, dtype=int) 431 | 432 | return y_preds 433 | 434 | 435 | -------------------------------------------------------------------------------- /tesseract/transcendent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import statistics 4 | import numpy as np 5 | from tqdm import tqdm 6 | import pickle as pkl 7 | from sklearn import metrics as mtcs 8 | import multiprocessing as mp 9 | from itertools import repeat 10 | 11 | 12 | def sort_by_predicted_label( 13 | scores, predicted_labels, groundtruth_labels, consider='correct'): 14 | """Sort scores into lists of their respected predicted classes. 15 | 16 | Divide a set of scores into 'predicted positive' and 'predicted 17 | negative' results. Optionally consider only correct or incorrect 18 | predictions. `scores`, `predicted_labels`, and `groundtruth_labels` 19 | should be aligned (one per observation). 20 | 21 | Example: 22 | >>> s = np.array([0.8, 0.7, 0.6, 0.9]) 23 | >>> y_pred = np.array([1, 1, 0, 0]) 24 | >>> y_true = np.array([1, 0, 1, 0]) 25 | >>> sort_by_predicted_label(s, y_pred, y_true, 'correct') 26 | (array([0.8]), array([0.9])) 27 | >>> sort_by_predicted_label(s, y_pred, y_true, 'incorrect') 28 | (array([0.7]), array([0.6])) 29 | >>> sort_by_predicted_label(s, y_pred, y_true, 'all') 30 | (array([0.8, 0.7]), array([0.6, 0.9])) 31 | 32 | Args: 33 | scores (np.ndarray): Predicted scores to be sorted. 34 | predicted_labels (np.ndarray): The prediction outcome for each object. 35 | groundtruth_labels (np.ndarray): The groundtruth label for each object. 36 | consider (str): ['correct'|'incorrect'|'all']. Whether to consider only 37 | correct predictions, incorrect predictions, or not to distinguish 38 | between them. 39 | 40 | Returns: 41 | (np.ndarray, np.ndarray): Tuple of sorted scores (malware, goodware). 
42 | 43 | """ 44 | 45 | def predicted(i, k): 46 | return predicted_labels[i] == k 47 | 48 | def correct(i, k): 49 | return predicted(i, k) and (groundtruth_labels[i] == k) 50 | 51 | def incorrect(i, k): 52 | return predicted(i, k) and (groundtruth_labels[i] == (k ^ 1)) 53 | 54 | if consider == 'correct': 55 | select = correct 56 | elif consider == 'incorrect': 57 | select = incorrect 58 | elif consider == 'all': 59 | select = predicted 60 | else: 61 | raise ValueError('Unknown thresholding criteria!') 62 | 63 | scores_mw = [scores[i] for i in range(len(scores)) if select(i, 1)] 64 | scores_gw = [scores[i] for i in range(len(scores)) if select(i, 0)] 65 | 66 | return np.array(scores_mw), np.array(scores_gw) 67 | 68 | 69 | def apply_threshold(binary_thresholds, test_scores, y_test): 70 | """Returns a 'keep mask' describing which elements to include. 71 | 72 | Elements that fall above the threshold (and should be kept) have 73 | their indexes marked TRUE. 74 | 75 | Elements that fall below the threshold (and should be rejected) have 76 | their indexes marked FALSE. 77 | 78 | `binary_thresholds` expects a dictionary keyed by 'cred' and/or 'conf', 79 | with sub-dictionaries containing the thresholds for the mw and gw classes. 80 | 81 | Note that the keys of `binary_thresholds` determine _which_ thresholding 82 | criteria will be enforced. That is, if only a 'cred' dictionary is supplied 83 | thresholding will be enforced on cred-only and the same for 'conf'. 84 | Supplying cred and conf dictionaries will enforce the 'cred+conf' 85 | thresholding criteria (all thresholds will be applied). 86 | 87 | `test_scores` expects a dictionary in much the same way, with at least the 88 | same keys as `binary_thresholds` ('cred' and/or 'conf' at the top level). 89 | 90 | Example: 91 | >>> thresholds = {'cred': {'mw': 0.4, 'gw': 0.6}, 92 | ... 'conf': {'mw': 0.5, 'gw': 0.8}} 93 | >>> scores = {'cred': [0.4, 0.2, 0.7, 0.8, 0.6], 94 | ... 'conf': [0.6, 0.8, 0.3, 0.2, 0.4]} 95 | >>> y = np.array([1, 1, 1, 0, 0]) 96 | >>> apply_threshold(thresholds, scores, y) 97 | array([ True, False, False, False, False]) 98 | 99 | Args: 100 | binary_thresholds(dict): The threshold to apply. 101 | test_scores (dict): The test scores to apply the threshold to. 102 | y_test (np.ndarray): The set of predictions to decide which 'per-class' 103 | threshold to use. Depending on the stage of conformal evaluation, 104 | this could be either the predicted or ground truth labels. 105 | 106 | Returns: 107 | np.ndarray: Boolean mask to use on the elements (1 = kept, 0 = reject). 
108 | 109 | """ 110 | # Assert preconditions 111 | assert (set(binary_thresholds.keys()) in 112 | [{'cred'}, {'conf'}, {'cred', 'conf'}]) 113 | 114 | for key in binary_thresholds.keys(): 115 | assert key in test_scores.keys() 116 | assert set(binary_thresholds[key].keys()) == {'mw', 'gw'} 117 | 118 | def get_class_threshold(criteria, k): 119 | return (binary_thresholds[criteria]['mw'] if k == 1 120 | else binary_thresholds[criteria]['gw']) 121 | 122 | keep_mask = [] 123 | for i, y_prediction in enumerate(y_test): 124 | 125 | cred_threshold, conf_threshold = 0, 0 126 | current_cred, current_conf = 0, 0 127 | 128 | if 'cred' in binary_thresholds: 129 | key = 'cred' 130 | current_cred = test_scores[key][i] 131 | cred_threshold = get_class_threshold(key, y_prediction) 132 | 133 | if 'conf' in binary_thresholds: 134 | key = 'conf' 135 | current_conf = test_scores[key][i] 136 | conf_threshold = get_class_threshold(key, y_prediction) 137 | 138 | keep_mask.append( 139 | (current_cred >= cred_threshold) and 140 | (current_conf >= conf_threshold)) 141 | 142 | return np.array(keep_mask, dtype=bool) 143 | 144 | 145 | def get_performance_with_rejection(y_true, y_pred, keep_mask, full=True): 146 | """Get test results, rejecting predictions based on a given keep mask. 147 | 148 | Args: 149 | y_true (np.ndarray): The groundtruth label for each object. 150 | y_pred (np.ndarray): The set of predictions to decide which 'per-class' 151 | threshold to use. Depending on the stage of conformal evaluation, 152 | this could be either the predicted or ground truth labels. 153 | keep_mask (np.ndarray): A boolean mask describing which elements to 154 | keep (True) or reject (False). 155 | full (bool): True if full statistics are required, False otherwise. 156 | False is computationally less expensive. 157 | 158 | Returns: 159 | dict: A dictionary of results for baseline, kept, and rejected metrics. 
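
    Example (a minimal sketch with hand-made labels, predictions and keep
    mask):

        >>> y_true = np.array([1, 1, 0, 0])
        >>> y_pred = np.array([1, 0, 0, 1])
        >>> keep_mask = np.array([True, True, True, False])
        >>> d = get_performance_with_rejection(y_true, y_pred, keep_mask, full=False)
        >>> float(d['kept_total_perc'])  # 3 of the 4 predictions are kept
        0.75
        >>> int(d['reject_pos'])  # 1 of the 2 positive predictions is rejected
        1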
160 | 161 | """ 162 | y_true = np.array(y_true) 163 | y_pred = np.array(y_pred) 164 | 165 | d = {} 166 | 167 | total_neg = len(y_pred) - sum(y_pred) 168 | total_pos = sum(y_pred) 169 | 170 | kept_total_perc = sum(keep_mask) / len(keep_mask) 171 | reject_total_perc = sum(~keep_mask) / len(keep_mask) 172 | 173 | kept_neg = len(y_pred[keep_mask]) - sum(y_pred[keep_mask]) 174 | kept_pos = sum(y_pred[keep_mask]) 175 | 176 | reject_neg = total_neg - kept_neg 177 | reject_pos = total_pos - kept_pos 178 | 179 | kept_neg_perc = (kept_neg / total_neg) 180 | kept_pos_perc = (kept_pos / total_pos) 181 | 182 | reject_neg_perc = 1 - kept_neg_perc 183 | reject_pos_perc = 1 - kept_pos_perc 184 | 185 | reject_neg_total = reject_neg / len(y_pred) 186 | reject_pos_total = reject_pos / len(y_pred) 187 | 188 | d.update({'total_neg': total_neg, 189 | 'total_pos': total_pos, 190 | 'kept_total_perc': kept_total_perc, 191 | 'reject_total_perc': reject_total_perc, 192 | 'kept_neg': kept_neg, 'kept_pos': kept_pos, 193 | 'reject_neg': reject_neg, 'reject_pos': reject_pos, 194 | 'kept_neg_perc': kept_neg_perc, 195 | 'kept_pos_perc': kept_pos_perc, 196 | 'reject_neg_perc': reject_neg_perc, 197 | 'reject_pos_perc': reject_pos_perc, 198 | 'reject_neg_total': reject_neg_total, 199 | 'reject_pos_total': reject_pos_total}) 200 | 201 | f1_b = mtcs.f1_score(y_true, y_pred) 202 | f1_k = mtcs.f1_score(y_true[keep_mask], 203 | y_pred[keep_mask]) 204 | f1_r = mtcs.f1_score(y_true[~keep_mask], 205 | y_pred[~keep_mask]) 206 | 207 | d.update({'f1_b': f1_b, 'f1_k': f1_k, 'f1_r': f1_r}) 208 | 209 | precision_b = mtcs.precision_score(y_true, y_pred) 210 | 211 | precision_k = mtcs.precision_score(y_true[keep_mask], 212 | y_pred[keep_mask]) 213 | precision_r = mtcs.precision_score(y_true[~keep_mask], 214 | y_pred[~keep_mask]) 215 | d.update({'precision_b': precision_b, 216 | 'precision_k': precision_k, 217 | 'precision_r': precision_r}) 218 | 219 | recall_b = mtcs.recall_score(y_true, y_pred) 220 | 221 | recall_k = mtcs.recall_score(y_true[keep_mask], 222 | y_pred[keep_mask]) 223 | recall_r = mtcs.recall_score(y_true[~keep_mask], 224 | y_pred[~keep_mask]) 225 | d.update({'recall_b': recall_b, 'recall_k': recall_k, 'recall_r': recall_r}) 226 | 227 | if full: 228 | cf_baseline = mtcs.confusion_matrix(y_true, y_pred) 229 | 230 | cf_keep = mtcs.confusion_matrix(y_true[keep_mask], 231 | y_pred[keep_mask]) 232 | cf_reject = mtcs.confusion_matrix(y_true[~keep_mask], 233 | y_pred[~keep_mask]) 234 | try: 235 | tn_b, fp_b, fn_b, tp_b = cf_baseline.ravel() 236 | tn_k, fp_k, fn_k, tp_k = cf_keep.ravel() 237 | tn_r, fp_r, fn_r, tp_r = cf_reject.ravel() 238 | except Exception as e: 239 | print(f'Transcendent met a problem: {e}') 240 | 241 | return d 242 | 243 | d.update({ 244 | 'tn_b': tn_b, 'fp_b': fp_b, 'fn_b': fn_b, 'tp_b': tp_b, 245 | 'tn_k': tn_k, 'fp_k': fp_k, 'fn_k': fn_k, 'tp_k': tp_k, 246 | 'tn_r': tn_r, 'fp_r': fp_r, 'fn_r': fn_r, 'tp_r': tp_r 247 | }) 248 | 249 | d['tpr_b'] = tp_b / (tp_b + fn_b) 250 | d['tpr_k'] = tp_k / (tp_k + fn_k) 251 | d['tpr_r'] = tp_r / (tp_r + fn_r) 252 | 253 | d['fpr_b'] = fp_b / (fp_b + tn_b) 254 | d['fpr_k'] = fp_k / (fp_k + tn_k) 255 | d['fpr_r'] = fp_r / (fp_r + tn_r) 256 | 257 | return d 258 | 259 | 260 | def test_with_rejection( 261 | binary_thresholds, test_scores, groundtruth_labels, predicted_labels, full=True): 262 | """Get test results, rejecting predictions based on a given threshold. 
263 | 264 | `binary_thresholds` expects a dictionary keyed by 'cred' and/or 'conf', 265 | with sub-dictionaries containing the thresholds for the mw and gw classes. 266 | 267 | Note that the keys of `binary_thresholds` determine _which_ thresholding 268 | criteria will be enforced. That is, if only a 'cred' dictionary is supplied 269 | thresholding will be enforced on cred-only and the same for 'conf'. 270 | Supplying cred and conf dictionaries will enforce the 'cred+conf' 271 | thresholding criteria (all thresholds will be applied). 272 | 273 | `test_scores` expects a dictionary in much the same way, with at least the 274 | same keys as `binary_thresholds` ('cred' and/or 'conf' at the top level). 275 | 276 | See Also: 277 | - `apply_threshold` 278 | - `get_performance_with_rejection` 279 | 280 | Args: 281 | binary_thresholds (dict): The threshold to apply. 282 | test_scores (dict): The test scores to apply the threshold to. 283 | groundtruth_labels (np.ndarray): The groundtruth label for each object. 284 | predicted_labels (np.ndarray): The set of predictions to decide which 285 | 'per-class' threshold to use. Depending on the stage of conformal 286 | evaluation, this could be either the predicted or ground truth 287 | labels. 288 | full (boolean): Optimization flag which dictates how much data to return, 289 | default is True. False gives a lot more performance but removes a lot 290 | of metrics. 291 | 292 | Returns: 293 | dict: A dictionary of results for baseline, kept, and rejected metrics. 294 | 295 | """ 296 | keep_mask = apply_threshold( 297 | binary_thresholds=binary_thresholds, 298 | test_scores=test_scores, 299 | y_test=predicted_labels) 300 | 301 | results = get_performance_with_rejection( 302 | y_true=groundtruth_labels, 303 | y_pred=predicted_labels, 304 | keep_mask=keep_mask, 305 | full=full) 306 | 307 | return results 308 | 309 | 310 | def random_threshold(scores, predicted_labels): 311 | """Produce random thresholds over the given scores. 312 | 313 | Args: 314 | scores (dict): The test scores on which to produce a threshold. 315 | predicted_labels (np.ndarray): The set of predictions to decide which 316 | 'per-class' threshold to use. 317 | 318 | Returns: 319 | dict: Set of thresholds for malware ('gw') and goodware ('gw') classes. 320 | 321 | """ 322 | scores_mw, scores_gw = sort_by_predicted_label( 323 | scores, predicted_labels, np.array([]), 'all') 324 | mw_threshold = np.random.uniform(min(scores_mw), max(scores_mw)) 325 | gw_threshold = np.random.uniform(min(scores_gw), max(scores_gw)) 326 | return {'mw': mw_threshold, 'gw': gw_threshold} 327 | 328 | 329 | def format_opts(metrics, results): 330 | """Helper function for formatting the results of a list of metrics.""" 331 | return ('{}: {:.4f} | ' * len(metrics)).format( 332 | *[item for sublist in 333 | zip(metrics, [results[k] for k in metrics]) for 334 | item in sublist]) 335 | 336 | 337 | def find_random_search_thresholds( 338 | scores, predicted_labels, groundtruth_labels, 339 | max_metrics='f1_k,kept_total_perc', min_metrics='f1_r', 340 | ceiling=0.25, max_samples=100, objective_func=None): 341 | """Perform a random grid search to find the best thresholds on `scores`. 342 | 343 | `scores` expects a dictionary keyed by 'cred' and/or 'conf', 344 | with sub-dictionaries containing the thresholds for the mw and gw classes. 345 | 346 | Note that the keys of `scores` determine _which_ thresholding criteria will 347 | be enforced. 
That is, if only a 'cred' dictionary is supplied, thresholding 348 | will be enforced on cred-only and the same for 'conf'. Supplying cred and 349 | conf dictionaries will enforce the 'cred+conf' thresholding criteria (all 350 | thresholds will be applied). 351 | 352 | `max_metrics` and `min_metrics` describe the metrics that should be 353 | maximised or minimised if the default objective function is being used 354 | (a harmonic mean, selected with `objective_func=None`). It expects either 355 | a list of possible metrics, or a string or comma separated metrics. 356 | 357 | For example, both of the following are acceptable: 358 | 359 | > max_metrics = ['f1_k', 'kept_total_perc'] 360 | > max_metrics = 'f1_k,kept_total_perc' 361 | 362 | `ceiling` describes the constraints of the optimization function. If any of 363 | the selected metrics exceed the value given then the thresholds chosen are 364 | discarded. `ceiling` expects a dictionary of metrics and maximum acceptable 365 | values. Alternatively, arguments can be given in string form as comma- 366 | separated key:value pairs, e.g., 'key1:value1,key2:value2,key3:value3'. 367 | Finally, if a float is provided, it's interpreted as being the maximum 368 | acceptable value for the total number of rejected predictions. 369 | 370 | To summarise, all of the following are equivalent: 371 | 372 | > ceiling = {'total_reject_perc': 0.25} 373 | > ceiling = 'total_reject_perc:0.25' 374 | > ceiling = 0.25 375 | 376 | For a list of possible metrics, see the keys in the dict produced by 377 | `get_performance_with_rejection()`. Note that the default objective 378 | function assumes that the provided metrics are in the interval [0,1]. 379 | 380 | `objective_func` is the objective function to maximise during the random 381 | search. By default (`objective_func=None`), it will maximise the harmonic 382 | mean of the given `max_metrics` and 1 - each of the given `min_metrics`. 383 | 384 | A custom objective function can be provided which should expect a result 385 | dictionary of metrics just like the dictionary produced by 386 | `get_performance_with_rejection()`. 387 | 388 | See Also: 389 | - `get_performance_with_rejection` 390 | 391 | Args: 392 | scores (dict): The test scores on which to perform the random search. 393 | predicted_labels (np.ndarray): The set of predictions to decide which 394 | 'per-class' threshold to use. 395 | groundtruth_labels (np.ndarray): The groundtruth label for each object. 396 | max_metrics: The metrics that should be maximised. 397 | min_metrics: The metrics that should be minimised. 398 | ceiling: Can be passed an empty dict if you don't want to enforce any 399 | constraint in this way. 400 | max_samples (int): The maximum number of random threshold combinations 401 | to try before settling for the best performance up to that point. 402 | objective_func (function): The objective function to maximise. 403 | 404 | Returns: 405 | dict: Set of thresholds for malware ('gw') and goodware ('gw') classes. 406 | 407 | """ 408 | 409 | # Resolve possible formats for `max_metrics` and `min_metrics`. 410 | def resolve_opt_list(x): 411 | return x.split(',') if isinstance(x, str) else x 412 | 413 | min_metrics = resolve_opt_list(min_metrics) 414 | max_metrics = resolve_opt_list(max_metrics) 415 | 416 | # Resolve possible formats of `ceiling`. 
417 | ceiling = {} if ceiling is None else ceiling 418 | 419 | ceiling = ({'total_reject_perc': ceiling} 420 | if isinstance(ceiling, (int, float)) else ceiling) 421 | 422 | if isinstance(ceiling, str): 423 | pairs = ceiling.split(',') 424 | pairs = [x.split(':') for x in pairs] 425 | ceiling = {k: float(v) for k, v in pairs} 426 | 427 | # Resolve objective function to use during the optimization. 428 | def harm_mean(d): 429 | maximise = [d[m] for m in max_metrics] 430 | maximise.extend([1 - d[m] for m in min_metrics]) 431 | return statistics.harmonic_mean(maximise) 432 | 433 | objective_func = harm_mean if objective_func is None else objective_func 434 | 435 | best_outcome, n_samples = 0, 0 436 | best_thresholds, best_results = {}, {} 437 | 438 | while True: 439 | # Choose and package random thresholds 440 | thresholds = {} 441 | if 'cred' in scores: 442 | cred_thresholds = random_threshold(scores['cred'], predicted_labels) 443 | thresholds['cred'] = cred_thresholds 444 | if 'conf' in scores: 445 | conf_thresholds = random_threshold(scores['conf'], predicted_labels) 446 | thresholds['conf'] = conf_thresholds 447 | 448 | # Test with chosen thresholds 449 | results = test_with_rejection( 450 | thresholds, scores, groundtruth_labels, predicted_labels, full=True) 451 | 452 | # Check if any results exceed given constraints (e.g. too many rejects) 453 | unacceptable = [results[k] > v for k, v in ceiling.items()] 454 | if any(unacceptable): 455 | continue 456 | 457 | # 'Score' current thresholds with objective function 458 | outcome = objective_func(results) 459 | 460 | # If current thresholds are better, save new best outcomes 461 | if outcome > best_outcome: 462 | best_outcome = outcome 463 | best_thresholds = thresholds 464 | best_results = results 465 | 466 | logging.info('New best: [{:.4f}] @ {} || Max: {}Min: {}'.format( 467 | outcome, thresholds, 468 | format_opts(max_metrics, results), 469 | format_opts(min_metrics, results))) 470 | # report_results(results) 471 | logging.warning('{} combinations sampled so far!'.format(n_samples)) 472 | 473 | # If the maximum number of thresholds have been sampled, abort search 474 | if max_samples is not None and n_samples >= max_samples: 475 | logging.warning( 476 | 'Max samples reached ({}) - search aborted'.format(max_samples)) 477 | logging.info('Settling for: [{}] @ {} || Max: {}Min: {}'.format( 478 | best_outcome, best_thresholds, 479 | format_opts(max_metrics, best_results), 480 | format_opts(min_metrics, best_results))) 481 | # report_results(results) 482 | 483 | return best_thresholds 484 | 485 | n_samples += 1 486 | 487 | 488 | def package_cred_conf(cred_values, conf_values, criteria): 489 | package = {} 490 | if 'cred' in criteria: 491 | package['cred'] = cred_values 492 | if 'conf' in criteria: 493 | package['conf'] = conf_values 494 | 495 | return package 496 | 497 | 498 | def compute_single_cred_p_value( 499 | train_ncms, groundtruth_train, single_test_ncm, single_y_test): 500 | """Compute a single credibility p-value. 501 | 502 | Credibility p-values describe how 'conformal' a point is with respect to 503 | the other objects of that class. They're computed as the proportion of 504 | points with greater NCMs (the number of points _less conforming_ than the 505 | reference point) over the total number of points. 506 | 507 | Intuitively, a point predicted as malware which is the further away from 508 | the decision boundary than any other point will have the highest p-value 509 | out of all other malware points. 
It will have the smallest NCM (as it is 510 | the least _non-conforming_) and thus no other points will have a greater 511 | NCM and it will have a credibility p-value of 1. 512 | 513 | Args: 514 | train_ncms (np.ndarray): An array of training NCMs to compare the 515 | reference point against. 516 | groundtruth_train (np.ndarray): An array of ground truths corresponding 517 | to `train_ncms`. 518 | single_test_ncm (float): A single reference point to compute the 519 | p-value of. 520 | single_y_test (int): Either the ground truth (calibration) or predicted 521 | label (testing) of `single_test_ncm`. 522 | 523 | See Also: 524 | - `compute_p_values_cred_and_conf` 525 | - `compute_single_conf_p_value` 526 | 527 | Returns: 528 | float: The p-value for `single_test_ncm` w.r.t. `train_ncms`. 529 | 530 | """ 531 | assert len(set(groundtruth_train)) == 2 # binary classification tasks only 532 | 533 | how_many_are_greater_than_single_test_ncm = 0 534 | 535 | for ncm, groundtruth in zip(train_ncms, groundtruth_train): 536 | if groundtruth == single_y_test and ncm >= single_test_ncm: 537 | how_many_are_greater_than_single_test_ncm += 1 538 | 539 | single_cred_p_value = (how_many_are_greater_than_single_test_ncm / 540 | sum(1 for y in groundtruth_train if 541 | y == single_y_test)) 542 | return single_cred_p_value 543 | 544 | 545 | def compute_single_conf_p_value( 546 | train_ncms, groundtruth_train, single_test_ncm, single_y_test): 547 | """Compute a single confidence p-value. 548 | 549 | The confidence p-value is computed similarly to the credibility p-value, 550 | except it aims to capture the confidence that the classifier has that the 551 | point _doesn't_ belong to the opposite class. 552 | 553 | To achieve this we assume that point has the label of the second highest 554 | scoring class---in binary classification, simply the opposite class---and 555 | compute the credibility p-value with respect to other points of that class. 556 | The confidence p-value is (1 - this value). 557 | 558 | Note that in transductive conformal evaluation, the entire classifier 559 | should be retrained with the reference point given the label of the 560 | opposite class. Usually, this is computationally prohibitive, and so this 561 | approximation assumes that the decision boundary undergoes only minimal 562 | changes when the label of a single point is flipped. 563 | 564 | See Also: 565 | - `compute_p_values_cred_and_conf` 566 | - `compute_single_cred_p_value` 567 | 568 | Args: 569 | train_ncms (np.ndarray): An array of training NCMs to compare the 570 | reference point against. 571 | groundtruth_train (np.ndarray): An array of ground truths corresponding 572 | to `train_ncms`. 573 | single_test_ncm (float): A single reference point to compute the 574 | p-value of. 575 | single_y_test (int): Either the ground truth (calibration) or predicted 576 | label (testing) of `single_test_ncm`. 577 | 578 | Returns: 579 | float: The p-value for `single_test_ncm` w.r.t. `train_ncms`. 
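
    Example (a small hand-worked illustration; the NCM values are made up).
    For a point predicted as malware (label 1) the opposite class is 0, the
    test NCM 0.3 is flipped to -0.3, and one of the two goodware calibration
    NCMs (0.5) is greater than or equal to -0.3, so the opposite-class
    credibility is 0.5 and the confidence p-value is 1 - 0.5 = 0.5:

        >>> train_ncms = [-0.8, 0.5, 0.9, -0.2]
        >>> y_train = [0, 0, 1, 1]
        >>> compute_single_conf_p_value(train_ncms, y_train, 0.3, 1)
        0.5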
580 | 581 | """ 582 | assert len(set(groundtruth_train)) == 2 # binary classification tasks only 583 | 584 | # 'Cast' NCMs to NCMs with respect to the opposite class (binary only) 585 | # train_ncms_opposite_class = -1 * np.array(train_ncms) 586 | single_y_test_opposite_class = 0 if single_y_test == 1 else 1 587 | single_test_ncm_opposite_class = -1 * single_test_ncm 588 | 589 | how_many_are_greater_than_single_test_ncm = 0 590 | 591 | for ncm, groundtruth in zip(train_ncms, groundtruth_train): 592 | if (groundtruth == single_y_test_opposite_class 593 | and ncm >= single_test_ncm_opposite_class): 594 | how_many_are_greater_than_single_test_ncm += 1 595 | 596 | single_cred_p_value_opposite_class = ( 597 | how_many_are_greater_than_single_test_ncm / 598 | sum(1 for y in groundtruth_train if 599 | y == single_y_test_opposite_class)) 600 | 601 | return 1 - single_cred_p_value_opposite_class # confidence p value 602 | 603 | 604 | def compute_p_values_cred_and_conf( 605 | train_ncms, groundtruth_train, test_ncms, y_test): 606 | """Helper function to compute p-values across an entire array.""" 607 | cred = [compute_single_cred_p_value(train_ncms=train_ncms, 608 | groundtruth_train=groundtruth_train, 609 | single_test_ncm=ncm, 610 | single_y_test=y) 611 | for ncm, y in tqdm( 612 | zip(test_ncms, y_test), total=len(y_test), desc='cred pvals', position=0, leave=True)] 613 | # conf = [compute_single_conf_p_value(train_ncms=train_ncms, 614 | # groundtruth_train=groundtruth_train, 615 | # single_test_ncm=ncm, 616 | # single_y_test=y) 617 | # for ncm, y in tqdm( 618 | # zip(test_ncms, y_test), total=len(y_test), desc='conf pvals', position=0, leave=True)] 619 | 620 | return {'cred': cred} 621 | # , 'conf': conf 622 | 623 | 624 | def get_svm_ncms(decision_function, X_in, y_in): 625 | """Helper functions to get NCMs across an entire pair of X,y arrays. """ 626 | return [get_single_svm_ncm(decision_function, x, y) for x, y in 627 | tqdm(zip(X_in, y_in), total=len(y_in), desc='svm ncms', position=0, leave=True)] 628 | 629 | 630 | def get_single_svm_ncm(decision_function, single_x, single_y): 631 | """Collect a non-conformity measure from the classifier for `single_x`. 632 | 633 | A note about SVM ncms: In binary classification with a linear SVM, the 634 | output score is the distance from the hyperplane with respect to the 635 | positive class. If the score is negative, the prediction is class 0, if 636 | positive, it's class 1 (in sklearn technically it will be clf.class_[0] and 637 | clf.class_[1] respectively). To perform thresholding with conformal 638 | evaluator, we need the distance from the hyperplane with respect to *both* 639 | classes, so we simply flip the sign to get the 'reflection' for the other 640 | class. 641 | 642 | Args: 643 | clf (sklearn.svm.SVC): The classifier to use for the NCMs. 644 | single_x (np.ndarray): An single feature vector to get the NCM for. 645 | single_y (int): The ground truth corresponding to feature vector 646 | `single_x`. 647 | 648 | Returns: 649 | float: The NCM for the given `single_x`. 650 | 651 | """ 652 | decision = decision_function(single_x) 653 | 654 | # If y (ground truth in calibration, prediction in testing) is malware 655 | # then flip the sign to ensure the most conforming point is most minimal. 656 | # decision = -abs(decision) 657 | # mal;1 -> 0 658 | if single_y == 1: 659 | return -decision 660 | elif single_y == 0: 661 | return decision 662 | raise Exception('Unknown class? 
Only binary decisions supported.') 663 | 664 | 665 | def cache_data(model, data_path): 666 | """Cache data (trained model, computed p-values, etc). 667 | 668 | Args: 669 | model: The data to save. 670 | data_path: (str) To avoid mix-ups, and to allow safe caching of models 671 | produced during calibration, it's advised to keep this location 672 | 'fold-specific'. 673 | 674 | See Also: 675 | - `load_cached_data` 676 | 677 | """ 678 | 679 | model_folder_path = os.path.dirname(data_path) 680 | 681 | if not os.path.exists(model_folder_path): 682 | os.makedirs(model_folder_path) 683 | 684 | logging.info('Saving data to {}...'.format(data_path)) 685 | with open(data_path, 'wb') as f: 686 | pkl.dump(model, f) 687 | logging.debug('Done.') 688 | 689 | 690 | def load_cached_data(data_path): 691 | """Load cached data (trained model, computed p-values, etc). 692 | 693 | Args: 694 | data_path: (str) To avoid mix-ups, and to allow safe caching of models 695 | produced during calibration, it's advised to keep this location 696 | 'fold-specific'. 697 | 698 | See Also: 699 | - `cache_data` 700 | 701 | Returns: 702 | The previously cached data. 703 | 704 | """ 705 | logging.info('Loading data from {}...'.format(data_path)) 706 | with open(data_path, 'rb') as f: 707 | model = pkl.load(f) 708 | logging.debug('Done.') 709 | return model 710 | 711 | 712 | def train_calibration_ice(clf, X_proper_train, X_cal, 713 | y_proper_train, y_cal, 714 | fold_index): 715 | """Train calibration set (for a single fold). 716 | 717 | Quite a bit of information is needed here for the later p-value 718 | computation and probability comparison. The returned dictionary has 719 | the following structure: 720 | 721 | 'cred_p_val_cal_fold' --> # Calibration credibility p values 722 | 'conf_p_val_cal_fold' --> # Calibration confidence p values 723 | 'ncms_cal_fold' --> # Calibration NCMs 724 | 'pred_cal_fold' --> # Calibration predictions 725 | 'groundtruth_cal_fold' --> # Calibration groundtruth 726 | 'probas_cal_fold' --> # Calibration probabilities 727 | 'pred_proba_cal_fold' --> # Calibration predictions 728 | 729 | Args: 730 | X_proper_train (np.ndarray): Features for the 'proper training 731 | set' partition. 732 | X_cal (np.ndarray): Features for a single calibration set 733 | partition. 734 | y_proper_train (np.ndarray): Ground truths for the 'proper 735 | training set' partition. 736 | y_cal (np.ndarray): Ground truths for a single calibration set 737 | partition. 738 | fold_index: An index to identify the current fold (used for caching). 739 | 740 | Returns: 741 | dict: Fold results, structure as in the docstring above. 
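
    Example (a minimal sketch; `X_prop`, `X_cal`, `y_prop` and `y_cal` are
    assumed to come from an earlier split of the training data):

        >>> from sklearn.svm import LinearSVC
        >>> fold = train_calibration_ice(
        >>>     LinearSVC(), X_prop, X_cal, y_prop, y_cal, fold_index=0)
        >>> cal_scores = {'cred': fold['cred_p_val_cal']}

    The resulting calibration p-values can then be passed to
    `find_random_search_thresholds` as its `scores` argument.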
742 | 743 | """ 744 | # Train model with proper training 745 | clf.fit(X_proper_train, y_proper_train) 746 | 747 | # Get ncms for proper training fold 748 | logging.debug('Getting training ncms for fold {}...'.format(fold_index)) 749 | groundtruth_proper_train_fold = y_proper_train 750 | 751 | # Get ncms for calibration fold 752 | 753 | logging.debug('Getting calibration ncms for fold {}...'.format(fold_index)) 754 | pred_cal_fold = clf.predict(X_cal) 755 | groundtruth_cal_fold = y_cal 756 | 757 | # Compute p values for calibration fold 758 | 759 | logging.debug('Computing cal p values for fold {}...'.format(fold_index)) 760 | ncms_cal_fold = get_svm_ncms(clf.decision_function, X_cal, y_cal) 761 | # data.cache_data(ncms_cal_fold, saved_ncms_name) 762 | 763 | # saved_pvals_name = 'p_vals_{}_cal_fold_{}.p'.format(alg_name, fold_index) 764 | # saved_pvals_name = os.path.join(saved_data_folder, saved_pvals_name) 765 | # 766 | # if os.path.exists(saved_pvals_name): 767 | # p_val_cal_fold_dict = data.load_cached_data(saved_pvals_name) 768 | # else: 769 | # # TODO | Doublecheck implications of duplicating the reference 770 | # # TODO | point in the 'train_ncms' 771 | p_val_cal_fold_dict = compute_p_values_cred_and_conf( 772 | train_ncms=ncms_cal_fold, 773 | groundtruth_train=groundtruth_cal_fold, 774 | test_ncms=ncms_cal_fold, 775 | y_test=groundtruth_cal_fold) 776 | # data.cache_data(p_val_cal_fold_dict, saved_pvals_name) 777 | 778 | # Compute values for calibration probabilities 779 | # logging.debug('Computing cal probas for fold {}...'.format(fold_index)) 780 | # probas_cal_fold, pred_proba_cal_fold = get_svm_probs(clf, X_cal) 781 | 782 | return { 783 | # Calibration credibility p values 784 | 'cred_p_val_cal': p_val_cal_fold_dict['cred'], 785 | # Calibration confidence p values 786 | # 'conf_p_val_cal': p_val_cal_fold_dict['conf'], 787 | 'ncms_cal': ncms_cal_fold, # Calibration NCMs 788 | 'pred_cal': pred_cal_fold, # Calibration predictions 789 | 'groundtruth_cal': groundtruth_cal_fold, # Calibration groundtruth 790 | # 'probas_cal': probas_cal_fold, # Calibration probabilities 791 | # 'pred_proba_cal': pred_proba_cal_fold, # Calibration predictions 792 | } 793 | 794 | 795 | def train_calibration_ice_withmodel( 796 | X_proper_train, X_cal, 797 | y_proper_train, y_cal, alg_name, fold_index, saved_data_folder, model_name): 798 | """Train calibration set (for a single fold). 799 | 800 | Quite a bit of information is needed here for the later p-value 801 | computation and probability comparison. The returned dictionary has 802 | the following structure: 803 | 804 | 'cred_p_val_cal_fold' --> # Calibration credibility p values 805 | 'conf_p_val_cal_fold' --> # Calibration confidence p values 806 | 'ncms_cal_fold' --> # Calibration NCMs 807 | 'pred_cal_fold' --> # Calibration predictions 808 | 'groundtruth_cal_fold' --> # Calibration groundtruth 809 | 'probas_cal_fold' --> # Calibration probabilities 810 | 'pred_proba_cal_fold' --> # Calibration predictions 811 | 812 | Args: 813 | X_proper_train (np.ndarray): Features for the 'proper training 814 | set' partition. 815 | X_cal (np.ndarray): Features for a single calibration set 816 | partition. 817 | y_proper_train (np.ndarray): Ground truths for the 'proper 818 | training set' partition. 819 | y_cal (np.ndarray): Ground truths for a single calibration set 820 | partition. 821 | fold_index: An index to identify the current fold (used for caching). 822 | 823 | Returns: 824 | dict: Fold results, structure as in the docstring above. 
825 | 
826 |     """
827 |     # Load the previously trained model for this fold
828 |     model_name = os.path.join(saved_data_folder, model_name)
829 |     svm = load_cached_data(model_name)
830 | 
831 |     # Get ncms for calibration fold
832 |     logging.debug('Getting calibration ncms for fold {}...'.format(fold_index))
833 |     pred_cal_fold = svm.predict(X_cal)
834 |     groundtruth_cal_fold = y_cal
835 | 
836 |     # Compute p values for calibration fold
837 | 
838 |     logging.debug('Computing cal p values for fold {}...'.format(fold_index))
839 | 
840 |     ncms_cal_fold = get_svm_ncms(svm, X_cal, y_cal)
841 |     p_val_cal_fold_dict = compute_p_values_cred_and_conf(
842 |         train_ncms=ncms_cal_fold,
843 |         groundtruth_train=groundtruth_cal_fold,
844 |         test_ncms=ncms_cal_fold,
845 |         y_test=groundtruth_cal_fold)
846 | 
847 |     return {
848 |         # Calibration credibility p values
849 |         'cred_p_val_cal': p_val_cal_fold_dict['cred'],
850 |         # Calibration confidence p values
851 |         # 'conf_p_val_cal': p_val_cal_fold_dict['conf'],
852 |         'ncms_cal': ncms_cal_fold,  # Calibration NCMs
853 |         'pred_cal': pred_cal_fold,  # Calibration predictions
854 |         'groundtruth_cal': groundtruth_cal_fold,  # Calibration groundtruth
855 |         # 'probas_cal': probas_cal_fold,  # Calibration probabilities
856 |         # 'pred_proba_cal': pred_proba_cal_fold,  # Calibration predictions
857 |         'model': svm
858 |     }
859 | 
860 | 
861 | def test_with_rejection_keep_masks(
862 |         binary_thresholds, test_scores, groundtruth_labels, predicted_labels, full=True):
863 |     keep_mask = apply_threshold(
864 |         binary_thresholds=binary_thresholds,
865 |         test_scores=test_scores,
866 |         y_test=predicted_labels)
867 | 
868 |     results = get_performance_with_rejection(
869 |         y_true=groundtruth_labels,
870 |         y_pred=predicted_labels,
871 |         keep_mask=keep_mask,
872 |         full=full)
873 | 
874 |     return results, keep_mask
875 | 
876 | 
877 | def report_results(d, quiet=False):
878 |     """Produce a textual report based on the given results.
879 | 
880 |     Args:
881 |         d (dict): Results for baseline, kept, and rejected metrics.
882 |         quiet (bool): If True, suppress printing the results to stdout.
883 | 
884 |     Returns:
885 |         str: A textual report of the results.
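    Example (illustrative sketch, not part of the original module; assumes
    `thresholds` and `scores` were obtained earlier, e.g. from
    `find_random_search_thresholds_with_constraints_discrete`):

        >>> d = test_with_rejection(
        ...     thresholds, scores, groundtruth_labels, predicted_labels)
        >>> report = report_results(d, quiet=True)
        >>> print(report)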
886 | 
887 |     """
888 |     report_str = ''
889 | 
890 |     def print_and_extend(report_line):
891 |         nonlocal report_str
892 |         if not quiet:
893 |             print(report_line)
894 |         report_str += report_line + '\n'
895 | 
896 |     s = '% kept elements: {:.1f}, % rejected elements: {:.1f}'.format(
897 |         d['kept_total_perc'] * 100, d['reject_total_perc'] * 100)
898 |     print_and_extend(s)
899 | 
900 |     s = '% benign rejected elements: {:.1f}, % malware rejected elements: {:.1f}'.format(
901 |         d['reject_neg_total'] * 100, d['reject_pos_total'] * 100)
902 |     print_and_extend(s)
903 | 
904 |     s = '% benign kept: {:.1f}, % benign rejected: {:.1f}'.format(
905 |         d['kept_neg_perc'] * 100, d['reject_neg_perc'] * 100)
906 | 
907 |     print_and_extend(s)
908 | 
909 |     s = '% malware kept: {:.1f}, % malware rejected: {:.1f}'.format(
910 |         d['kept_pos_perc'] * 100, d['reject_pos_perc'] * 100)
911 | 
912 |     print_and_extend(s)
913 | 
914 |     s = ('F1 baseline: {:>12.2f} | '
915 |          'F1 keep: {:>12.2f} | '
916 |          'F1 reject: {:>12.2f}').format(
917 |         d['f1_b'], d['f1_k'], d['f1_r'])
918 | 
919 |     print_and_extend(s)
920 | 
921 |     s = ('Pr baseline: {:>12.2f} | '
922 |          'Pr keep: {:>12.2f} | '
923 |          'Pr reject: {:>12.2f}'.format(
924 |              d['precision_b'], d['precision_k'], d['precision_r']))
925 | 
926 |     print_and_extend(s)
927 | 
928 |     s = ('Rec baseline: {:>12.2f} | '
929 |          'Rec keep: {:>12.2f} | '
930 |          'Rec reject: {:>12.2f}'.format(
931 |              d['recall_b'], d['recall_k'], d['recall_r']))
932 | 
933 |     print_and_extend(s)
934 | 
935 |     s = ('TP baseline: {:>12.2f} | '
936 |          'TP keep: {:>12.2f} | '
937 |          'TP reject: {:>12.2f}'.format(d['tp_b'], d['tp_k'], d['tp_r']))
938 |     print_and_extend(s)
939 | 
940 |     s = ('FP baseline: {:>12.2f} | '
941 |          'FP keep: {:>12.2f} | '
942 |          'FP reject: {:>12.2f}'.format(d['fp_b'], d['fp_k'], d['fp_r']))
943 |     print_and_extend(s)
944 | 
945 |     s = ('TN baseline: {:>12.2f} | '
946 |          'TN keep: {:>12.2f} | '
947 |          'TN reject: {:>12.2f}'.format(d['tn_b'], d['tn_k'], d['tn_r']))
948 |     print_and_extend(s)
949 | 
950 |     s = ('FN baseline: {:>12.2f} | '
951 |          'FN keep: {:>12.2f} | '
952 |          'FN reject: {:>12.2f}'.format(d['fn_b'], d['fn_k'], d['fn_r']))
953 |     print_and_extend(s)
954 | 
955 |     s = ('TPR baseline: {:>12.2f} | '
956 |          'TPR keep: {:>12.2f} | '
957 |          'TPR reject: {:>12.2f}'.format(d['tpr_b'], d['tpr_k'], d['tpr_r']))
958 |     print_and_extend(s)
959 | 
960 |     s = ('FPR baseline: {:>12.2f} | '
961 |          'FPR keep: {:>12.2f} | '
962 |          'FPR reject: {:>12.2f}'.format(d['fpr_b'], d['fpr_k'], d['fpr_r']))
963 |     print_and_extend(s)
964 | 
965 |     return report_str
966 | 
967 | 
968 | def find_random_search_thresholds_with_constraints(
969 |         scores, predicted_labels, groundtruth_labels, maximise_vals,
970 |         constraint_vals, max_samples=100, quiet=False, ncpu=-1):
971 |     """Perform a random grid search to find the best thresholds on `scores` in
972 |     parallel.
973 | 
974 |     This method wraps `find_random_search_thresholds_with_constraints_discrete`
975 |     and parallelizes it. For a full description of the search itself, see the
976 |     documentation of the aforementioned method.
977 | 
978 |     See Also:
979 |         - `find_random_search_thresholds_with_constraints_discrete`
980 | 
981 |     Args:
982 |         scores (dict): The test scores on which to perform the random search.
983 |         predicted_labels (np.ndarray): The set of predictions to decide which
984 |             'per-class' threshold to use.
985 |         groundtruth_labels (np.ndarray): The groundtruth label for each object.
986 |         maximise_vals: The metrics that should be maximised.
987 |         constraint_vals: The metrics that are constrained.
988 |         max_samples (int): The maximum number of random threshold combinations
989 |             to try before settling for the best performance up to that point.
990 |         quiet (bool): If True, logging will be disabled.
991 |         ncpu (int): Number of CPUs to use. If negative, it is computed as
992 |             total_cpus + ncpu; if ncpu == 1 the search is not parallelized
993 |             (useful to avoid problems with nested parallelization).
994 | 
995 |     Returns:
996 |         dict: Set of thresholds for malware ('mw') and goodware ('gw') classes.
997 | 
998 |     """
999 | 
1000 |     ncpu = mp.cpu_count() + ncpu if ncpu < 0 else ncpu
1001 | 
1002 |     if ncpu == 1:
1003 |         results, thresholds = find_random_search_thresholds_with_constraints_discrete(
1004 |             scores, predicted_labels, groundtruth_labels, maximise_vals,
1005 |             constraint_vals, max_samples, quiet)
1006 | 
1007 |         return thresholds
1008 | 
1009 |     samples = [max_samples // ncpu for _ in range(ncpu)]
1010 | 
1011 |     with mp.Pool(processes=ncpu) as pool:
1012 |         results = pool.starmap(find_random_search_thresholds_with_constraints_discrete,
1013 |                                zip(repeat(scores), repeat(predicted_labels), repeat(groundtruth_labels),
1014 |                                    repeat(maximise_vals), repeat(constraint_vals), samples, repeat(quiet)))
1015 | 
1016 |     results_list = [res[0] for res in results]
1017 |     thresholds_list = [res[1] for res in results]
1018 | 
1019 |     def resolve_keyvals(s):
1020 |         if isinstance(s, str):
1021 |             pairs = s.split(',')
1022 |             pairs = [x.split(':') for x in pairs]
1023 |             return {k: float(v) for k, v in pairs}
1024 |         return s
1025 | 
1026 |     maximise_vals = resolve_keyvals(maximise_vals)
1027 |     constraint_vals = resolve_keyvals(constraint_vals)
1028 | 
1029 |     best_maximised = {k: 0 for k in maximise_vals}
1030 |     best_constrained = {k: 0 for k in constraint_vals}
1031 |     best_thresholds, best_result = {}, {}
1032 | 
1033 |     for result, thresholds in zip(results_list, thresholds_list):
1034 |         if any([result[k] > best_maximised[k] for k in maximise_vals]):
1035 |             best_maximised = {k: result[k] for k in maximise_vals}
1036 |             best_constrained = {k: result[k] for k in constraint_vals}
1037 |             best_thresholds = thresholds
1038 |             best_result = result
1039 | 
1040 |             if not quiet:
1041 |                 logging.info('New best: {} {} @ {} '.format(
1042 |                     format_opts(maximise_vals.keys(), result),
1043 |                     format_opts(constraint_vals.keys(), result),
1044 |                     best_thresholds))
1045 |                 report_results(best_result)
1046 | 
1047 |             continue
1048 | 
1049 |         if all([result[k] == best_maximised[k] for k in maximise_vals]):
1050 |             if all([result[k] >= best_constrained[k] for k in constraint_vals]):
1051 |                 best_maximised = {k: result[k] for k in maximise_vals}
1052 |                 best_constrained = {k: result[k] for k in constraint_vals}
1053 |                 best_thresholds = thresholds
1054 |                 best_result = result
1055 | 
1056 |                 if not quiet:
1057 |                     logging.info('New best: {} {} @ {} '.format(
1058 |                         format_opts(maximise_vals.keys(), result),
1059 |                         format_opts(constraint_vals.keys(), result),
1060 |                         best_thresholds))
1061 |                     report_results(best_result)
1062 | 
1063 |                 continue
1064 |     print(best_thresholds)
1065 |     return best_thresholds
1066 | 
1067 | 
1068 | def find_random_search_thresholds_with_constraints_discrete(
1069 |         scores, predicted_labels, groundtruth_labels, maximise_vals,
1070 |         constraint_vals, max_samples=100, quiet=False, stop_condition=3000):
1071 |     """Perform a random grid search to find the best thresholds on `scores`.
1072 | 
1073 |     `scores` expects a dictionary keyed by 'cred' and/or 'conf',
1074 |     with sub-dictionaries containing the scores for the mw and gw classes.
1075 | 
1076 |     Note that the keys of `scores` determine _which_ thresholding criteria will
1077 |     be enforced. That is, if only a 'cred' dictionary is supplied, thresholding
1078 |     will be enforced on cred-only and the same for 'conf'. Supplying cred and
1079 |     conf dictionaries will enforce the 'cred+conf' thresholding criteria (all
1080 |     thresholds will be applied).
1081 | 
1082 |     `maximise_vals` describes the metrics that should be maximised and their
1083 |     minimum acceptable values. It expects either a dictionary of metrics, or a
1084 |     string of comma-separated key:value pairs.
1085 | 
1086 |     `constraint_vals` describes the floors for metrics that a threshold must
1087 |     pass in order to be acceptable. The algorithm will also try to maximise
1088 |     these metrics if possible, although never at the expense of `maximise_vals`.
1089 | 
1090 |     Both `maximise_vals` and `constraint_vals` expect a dictionary of metrics
1091 |     and minimum acceptable values. Alternatively, arguments can be given in
1092 |     string form as comma-separated key:value pairs, for example,
1093 |     'key1:value1,key2:value2,key3:value3'.
1094 | 
1095 |     Concretely, any of the following are acceptable:
1096 | 
1097 |         > maximise_vals = {'f1_k': 0.95}
1098 |         > maximise_vals = 'f1_k:0.95'
1099 | 
1100 |         > constraint_vals = {'kept_pos_perc': 0.76, 'kept_neg_perc': 0.76}
1101 |         > constraint_vals = 'kept_pos_perc:0.76,kept_neg_perc:0.76'
1102 | 
1103 |     For a list of possible metrics, see the keys in the dict produced by
1104 |     `get_performance_with_rejection()`. Note that the default objective
1105 |     function assumes that the provided metrics are in the interval [0,1].
1106 | 
1107 |     See Also:
1108 |         - `get_performance_with_rejection`
1109 | 
1110 |     Args:
1111 |         scores (dict): The test scores on which to perform the random search.
1112 |         predicted_labels (np.ndarray): The set of predictions to decide which
1113 |             'per-class' threshold to use.
1114 |         groundtruth_labels (np.ndarray): The groundtruth label for each object.
1115 |         maximise_vals: The metrics that should be maximised.
1116 |         constraint_vals: The metrics that are constrained.
1117 |         max_samples (int): The maximum number of random threshold combinations
1118 |             to try before settling for the best performance up to that point.
1119 |         quiet (bool): If True, logging will be disabled.
1120 |         stop_condition (int): Number of consecutive unproductive samples after which the search stops early.
1121 |     Returns:
1122 |         tuple: The best result dict and the best thresholds dict for the malware ('mw') and goodware ('gw') classes.
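    Example (illustrative sketch, not part of the original module; the
    calibration arrays `cred_scores`, `pred_cal` and `y_cal` are assumed to
    come from an earlier calibration step such as `train_calibration_ice`):

        >>> best_result, best_thresholds = (
        ...     find_random_search_thresholds_with_constraints_discrete(
        ...         scores={'cred': cred_scores},
        ...         predicted_labels=pred_cal,
        ...         groundtruth_labels=y_cal,
        ...         maximise_vals='f1_k:0.95',
        ...         constraint_vals='kept_pos_perc:0.76,kept_neg_perc:0.76',
        ...         max_samples=100))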
1123 | 1124 | """ 1125 | 1126 | # as this method is called from multiprocessing, we want to make sure each 1127 | # process has a different seed 1128 | seed = 0 1129 | for l in os.urandom(10): seed += l 1130 | np.random.seed(seed) 1131 | 1132 | def resolve_keyvals(s): 1133 | if isinstance(s, str): 1134 | pairs = s.split(',') 1135 | pairs = [x.split(':') for x in pairs] 1136 | return {k: float(v) for k, v in pairs} 1137 | return s 1138 | 1139 | maximise_vals = resolve_keyvals(maximise_vals) 1140 | constraint_vals = resolve_keyvals(constraint_vals) 1141 | 1142 | best_maximised = {k: 0 for k in maximise_vals} 1143 | best_constrained = {k: 0 for k in constraint_vals} 1144 | best_thresholds, best_result = {}, {} 1145 | 1146 | logging.info('Searching for threshold on calibration data...') 1147 | 1148 | stop_counter = 0 1149 | 1150 | for _ in tqdm(range(max_samples)): 1151 | # Choose and package random thresholds 1152 | thresholds = {} 1153 | if 'cred' in scores: 1154 | cred_thresholds = random_threshold(scores['cred'], predicted_labels) 1155 | thresholds['cred'] = cred_thresholds 1156 | if 'conf' in scores: 1157 | conf_thresholds = random_threshold(scores['conf'], predicted_labels) 1158 | thresholds['conf'] = conf_thresholds 1159 | 1160 | # Test with chosen thresholds 1161 | result = test_with_rejection( 1162 | thresholds, scores, groundtruth_labels, predicted_labels) 1163 | 1164 | # Check if any results exceed given constraints (e.g. too many rejects) 1165 | if any([result[k] < constraint_vals[k] for k in constraint_vals]): 1166 | if stop_counter > stop_condition: 1167 | logging.info('Exceeded stop condition, terminating calibration search...') 1168 | break 1169 | 1170 | stop_counter += 1 1171 | continue 1172 | 1173 | if any([result[k] < best_maximised[k] for k in maximise_vals]): 1174 | if stop_counter > stop_condition: 1175 | logging.info('Exceeded stop condition, terminating calibration search...') 1176 | break 1177 | 1178 | stop_counter += 1 1179 | continue 1180 | 1181 | if any([result[k] > best_maximised[k] for k in maximise_vals]): 1182 | best_maximised = {k: result[k] for k in maximise_vals} 1183 | best_constrained = {k: result[k] for k in constraint_vals} 1184 | best_thresholds = thresholds 1185 | best_result = result 1186 | 1187 | if not quiet: 1188 | logging.info('New best: {} {} @ {} '.format( 1189 | format_opts(maximise_vals.keys(), result), 1190 | format_opts(constraint_vals.keys(), result), 1191 | best_thresholds)) 1192 | report_results(best_result) 1193 | 1194 | stop_counter = 0 1195 | continue 1196 | 1197 | if all([result[k] == best_maximised[k] for k in maximise_vals]): 1198 | if all([result[k] >= best_constrained[k] for k in constraint_vals]): 1199 | best_maximised = {k: result[k] for k in maximise_vals} 1200 | best_constrained = {k: result[k] for k in constraint_vals} 1201 | best_thresholds = thresholds 1202 | best_result = result 1203 | 1204 | if not quiet: 1205 | logging.info('New best: {} {} @ {} '.format( 1206 | format_opts(maximise_vals.keys(), result), 1207 | format_opts(constraint_vals.keys(), result), 1208 | best_thresholds)) 1209 | report_results(best_result) 1210 | 1211 | stop_counter = 0 1212 | continue 1213 | 1214 | if not bool(best_result): 1215 | best_result = result 1216 | 1217 | return (best_result, best_thresholds) 1218 | 1219 | 1220 | def get_svm_probs(clf, X_in): 1221 | """Get scores and predictions for comparison with probabilities. 
1222 | 
1223 |     Note that this function returns the predictions _and_ probabilities given
1224 |     by the classifier and that these predictions may differ from other
1225 |     outputs of the same classifier (such as `predict` or `decision_function`).
1226 |     This is due to Platt's scaling (and its implementation in scikit-learn) in
1227 |     which a 5-fold SVM is trained and used to score the observation
1228 |     (`predict_proba()` is actually the average of these 5 classifiers).
1229 | 
1230 |     The takeaway is to be sure that you're always using probability scores with
1231 |     probability predictions and not with the output of other SVC functions.
1232 | 
1233 |     Args:
1234 |         clf (sklearn.svm.SVC): The classifier to use for the probabilities.
1235 |         X_in (np.ndarray): An array of feature vectors to classify.
1236 | 
1237 |     Returns:
1238 |         (list, list): (Probability scores, probability labels) for `X_in`.
1239 | 
1240 |     """
1241 |     assert hasattr(clf, 'predict_proba')
1242 |     probability_results = clf.predict_proba(X_in)
1243 |     probas_cal_fold = [np.max(t) for t in probability_results]
1244 |     pred_proba_cal_fold = [np.argmax(t) for t in probability_results]
1245 |     return probas_cal_fold, pred_proba_cal_fold
1246 | 
--------------------------------------------------------------------------------